# Malaysia Prison Data Generator

This Jupyter notebook generates realistic synthetic data for the Malaysia Prison Predictive Planning System.

**What this generates:**
- Population data across 32 Malaysian prisons
- Staffing requirements and ratios
- Resource costs and utilization
- State-wise and prison-level breakdowns

**Usage:**
1. Run all cells in order
2. Data files will be created in the `data/` directory
3. Run `streamlit run app.py` to start the application

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random generator so that every run of the notebook
# draws the same sequence of synthetic values.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("Libraries imported successfully!")
# Report the versions of the two core data libraries in use.
for library_label, library in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_label} version: {library.__version__}")

## 2. Setup Data Directory

In [None]:
# Make sure the output folder used by the CSV/JSON writers below is present,
# and report whether it had to be created.
data_dir = 'data'
if os.path.exists(data_dir):
    print("'data' directory already exists")
else:
    os.makedirs(data_dir)
    print("Created 'data' directory")

## 3. Define Malaysian Prison Structure

In [None]:
# Malaysian prison structure keyed by state. As listed here, the mapping holds
# 32 prisons across 12 states; the totals printed below are computed from the
# mapping itself rather than hard-coded. The order of the prison names within
# each state matters to any code that pairs them positionally with ratios.
malaysia_prisons = {
    'Kedah': ['Pokok Sena Prison', 'Sungai Petani Prison', 'Alor Star Prison'],
    'Penang': ['Penang Prison', 'Seberang Prai Prison'],
    'Perak': ['Taiping Prison', 'Tapah Prison', 'Kamunting Detention Centre'],
    'Selangor': ['Sungai Buloh Prison', 'Kajang Prison', 'Kajang Women\'s Prison'],
    'Negeri Sembilan': ['Seremban Prison', 'Jelebu Prison'],
    'Melaka': ['Ayer Keroh Prison', 'Sungai Udang Prison', 'Banda Hilir Prison'],
    'Johor': ['Simpang Renggam Prison', 'Kluang Prison'],
    'Pahang': ['Bentong Prison', 'Penor Prison'],
    'Terengganu': ['Marang Prison'],
    'Kelantan': ['Pengkalan Chepa Prison'],
    'Sarawak': ['Puncak Borneo Prison', 'Sibu Prison', 'Miri Prison', 'Bintulu Prison', 'Sri Aman Prison', 'Limbang Prison'],
    'Sabah': ['Kota Kinabalu Prison', 'Kota Kinabalu Women\'s Prison', 'Tawau Prison', 'Sandakan Prison']
}

# Save prison structure. Ensure the output directory exists so this cell also
# works when executed on its own, and pin the encoding so the file content
# does not depend on the platform default.
os.makedirs('data', exist_ok=True)
with open('data/malaysia_prisons.json', 'w', encoding='utf-8') as f:
    json.dump(malaysia_prisons, f, indent=2)

# Count prisons from the mapping so the summary can never drift from the data.
total_prisons = sum(len(prisons) for prisons in malaysia_prisons.values())
print("Malaysian Prison System Structure:")
print(f"States: {len(malaysia_prisons)}")
print(f"Total Prisons: {total_prisons}")

for state, prisons in malaysia_prisons.items():
    print(f"  {state}: {len(prisons)} prisons")

## 4. Generate Time Series Framework

In [None]:
# Generate 7 years of monthly data (2019-2025): 84 month-end timestamps.
# 'ME' is the month-end alias introduced in pandas 2.2; older pandas releases
# only know the legacy 'M' alias and reject 'ME' with a ValueError, so fall
# back to 'M' there. Both aliases produce the same month-end dates.
try:
    dates = pd.date_range(start='2019-01-01', periods=84, freq='ME')
except ValueError:
    dates = pd.date_range(start='2019-01-01', periods=84, freq='M')

print("Time Series Framework:")
print(f"Start Date: {dates[0].strftime('%Y-%m-%d')}")
print(f"End Date: {dates[-1].strftime('%Y-%m-%d')}")
print(f"Total Periods: {len(dates)} months")
# Inclusive count of calendar years touched by the range.
print(f"Years Covered: {dates[-1].year - dates[0].year + 1}")

## 5. Generate National Population Data

In [None]:
# National head-count model, one value per month:
#   baseline + linear growth + 12-month sinusoidal cycle + Gaussian noise,
# floored at 68,000.
n_months = 84
month_index = np.arange(n_months)

base_population = 72500                                   # Baseline head-count
trend = np.linspace(0, 3500, n_months)                    # Growth over the period
seasonal = 1200 * np.sin(2 * np.pi * month_index / 12)    # Yearly cycle
noise = np.random.normal(0, 600, n_months)                # Random variation

# Sum the components and apply the minimum threshold in one step.
total_prisoners = np.maximum(base_population + trend + seasonal + noise, 68000)

first_month = total_prisoners[0]
last_month = total_prisoners[-1]
growth_pct = (last_month - first_month) / first_month * 100

print("Population Statistics:")
print(f"Average Population: {total_prisoners.mean():.0f}")
print(f"Min Population: {total_prisoners.min():.0f}")
print(f"Max Population: {total_prisoners.max():.0f}")
print(f"Growth Over Period: {growth_pct:.1f}%")

## 6. Generate Demographics Data

In [None]:
# Break the national population down by gender, age group and crime type.
# In each breakdown the final category is computed as the remainder of the
# integer total, so the categories always add up to the national head-count.
# NOTE: the random draws below are made in a fixed order (gender, age, crime);
# reordering them would change the generated data for a given seed.
total_headcount = total_prisoners.astype(int)


def _headcount(share):
    """Return the national population scaled by ``share``, truncated to int."""
    return (total_prisoners * share).astype(int)


# Gender distribution (Malaysian prison demographics)
male_ratio = np.random.normal(0.93, 0.01, 84)  # Centred on 93% male
male_prisoners = _headcount(male_ratio)
female_prisoners = total_headcount - male_prisoners

# Age groups: Young (18-30), Middle (31-50), Old (50+)
young_ratio = np.random.normal(0.45, 0.02, 84)   # Centred on 45%
middle_ratio = np.random.normal(0.42, 0.02, 84)  # Centred on 42%
old_ratio = 1 - young_ratio - middle_ratio       # Whatever share is left

young_prisoners = _headcount(young_ratio)
middle_prisoners = _headcount(middle_ratio)
old_prisoners = total_headcount - young_prisoners - middle_prisoners

# Crime types (Malaysian crime patterns)
drug_crimes_ratio = np.random.normal(0.55, 0.03, 84)      # Centred on 55%
violent_crimes_ratio = np.random.normal(0.18, 0.02, 84)   # Centred on 18%
property_crimes_ratio = np.random.normal(0.15, 0.02, 84)  # Centred on 15%
other_crimes_ratio = 1 - drug_crimes_ratio - violent_crimes_ratio - property_crimes_ratio

drug_crimes = _headcount(drug_crimes_ratio)
violent_crimes = _headcount(violent_crimes_ratio)
property_crimes = _headcount(property_crimes_ratio)
other_crimes = total_headcount - drug_crimes - violent_crimes - property_crimes

# Report the average share of each category over the whole period.
print("Demographics Overview:")
print(f"Male Ratio: {male_ratio.mean():.1%}")
print(f"Female Ratio: {(1-male_ratio.mean()):.1%}")
for group_label, group_ratio in (
    ("Young Adults (18-30)", young_ratio),
    ("Middle-aged (31-50)", middle_ratio),
    ("Older (50+)", old_ratio),
):
    print(f"{group_label}: {group_ratio.mean():.1%}")
print("")
print("Crime Distribution:")
for crime_label, crime_ratio in (
    ("Drug Crimes", drug_crimes_ratio),
    ("Violent Crimes", violent_crimes_ratio),
    ("Property Crimes", property_crimes_ratio),
    ("Other Crimes", other_crimes_ratio),
):
    print(f"{crime_label}: {crime_ratio.mean():.1%}")

## 7. Generate Prison Flow Data

In [None]:
# Sentence lengths and prisoner flow, one value per month.
flow_periods = 84

# Average sentence: normal around 28 months (sd 8), floored at 6 months.
avg_sentence_months = np.maximum(np.random.normal(28, 8, flow_periods), 6)

# Monthly flow counts drawn from Poisson distributions
# (releases first, then admissions).
monthly_releases = np.random.poisson(2400, flow_periods)
monthly_admissions = np.random.poisson(2450, flow_periods)

mean_releases = monthly_releases.mean()
mean_admissions = monthly_admissions.mean()

print("Prison Flow Statistics:")
print(f"Average Sentence Length: {avg_sentence_months.mean():.1f} months")
print(f"Average Monthly Releases: {mean_releases:.0f}")
print(f"Average Monthly Admissions: {mean_admissions:.0f}")
print(f"Net Monthly Change: {(mean_admissions - mean_releases):.0f}")

## 8. Create National Population Dataset

In [None]:
# Assemble the national, month-by-month table from the arrays built in the
# previous cells. Column order here is the column order of the CSV.
population_columns = {
    'date': dates,
    'total_prisoners': total_prisoners.astype(int),
    'male_prisoners': male_prisoners,
    'female_prisoners': female_prisoners,
    'young_prisoners': young_prisoners,
    'middle_prisoners': middle_prisoners,
    'old_prisoners': old_prisoners,
    'drug_crimes': drug_crimes,
    'violent_crimes': violent_crimes,
    'property_crimes': property_crimes,
    'other_crimes': other_crimes,
    'avg_sentence_months': avg_sentence_months,
    'monthly_releases': monthly_releases,
    'monthly_admissions': monthly_admissions,
}
population_data = pd.DataFrame(population_columns)

# Persist without the positional index so the CSV holds only the columns above.
population_data.to_csv('data/population_data.csv', index=False)

record_count, column_count = population_data.shape
print("National Population Data Created:")
print(f"Records: {record_count}")
print(f"Columns: {column_count}")
print("File: data/population_data.csv")

# Last expression of the cell: shows the first rows in the notebook output.
population_data.head()

## 9. Generate State-Level Distribution

In [None]:
# Share of the national prison population assigned to each state.
# The shares are meant to add up to 100% (printed at the end as a check).
# Dictionary order is significant: later cells iterate over it.
state_populations = {
    'Selangor': 0.32,    # Largest - Sungai Buloh complex (12,000+ capacity)
    'Sarawak': 0.16,     # Large state, rural crime, drug trafficking
    'Johor': 0.14,       # High crime rate, Singapore border issues
    'Sabah': 0.11,       # Large state, border security issues
    'Perak': 0.09,       # Kamunting Detention Centre, historic prisons
    'Kedah': 0.06,       # Medium population state
    'Pahang': 0.04,      # Large but sparse population
    'Penang': 0.03,      # Urban crime, smaller capacity
    'Melaka': 0.02,      # Historic, tourist area
    'Negeri Sembilan': 0.02,  # Smaller industrial state
    'Kelantan': 0.005,   # Conservative state, lower crime
    'Terengganu': 0.005  # Oil state, lower urban crime
}

# The national average is the same for every state, so compute it once.
national_average = total_prisoners.mean()

print("State Distribution:")
for state, ratio in state_populations.items():
    avg_population = national_average * ratio
    print(f"  {state}: {ratio:.1%} ({avg_population:.0f} prisoners)")

total_share = sum(state_populations.values())
print(f"\nTotal Distribution: {total_share:.1%}")

## 10. Generate Prison-Level Detail Data

In [None]:
# Generate detailed prison-level data: one record per prison per month.
# For every month the national total is split across states using
# state_populations, then across the prisons of each state, and finally the
# national per-month gender/age/crime ratios are applied to each prison.
state_data_list = []

# enumerate() supplies each month's position directly (instead of searching
# the date list on every iteration); the index lines the date up with the
# per-month arrays such as total_prisoners and male_ratio.
for date_idx, date in enumerate(dates):
    total_pop = int(total_prisoners[date_idx])

    for state, ratio in state_populations.items():
        state_total = int(total_pop * ratio)
        prisons_in_state = malaysia_prisons[state]

        # Distribute population among prisons in the state
        if len(prisons_in_state) == 1:
            prison_pops = [state_total]
        else:
            # Create realistic distribution (some prisons are larger).
            # Fixed ratios are positional: they follow the prison order in
            # malaysia_prisons[state].
            if state == 'Selangor':
                # Sungai Buloh is the largest prison complex
                ratios = [0.6, 0.25, 0.15]  # Sungai Buloh, Kajang, Kajang Women's
            elif state == 'Perak':
                # Kamunting is supermax (smaller), Taiping is historic (larger)
                ratios = [0.5, 0.35, 0.15]  # Taiping, Tapah, Kamunting
            else:
                # Even distribution with slight variation. A fresh random
                # perturbation is drawn per prison for every month.
                base_ratio = 1.0 / len(prisons_in_state)
                ratios = [base_ratio + np.random.normal(0, 0.05) for _ in prisons_in_state]
                ratios = [max(0.1, r) for r in ratios]  # Minimum 10%
                total_ratio = sum(ratios)
                ratios = [r/total_ratio for r in ratios]  # Normalize

            prison_pops = [int(state_total * r) for r in ratios]
            # Truncation loses a few prisoners; give the remainder to the
            # last prison so the prisons add up to the state total.
            prison_pops[-1] += state_total - sum(prison_pops)

        # Add data for each prison
        for i, prison_name in enumerate(prisons_in_state):
            # Minimum 50 prisoners per prison. NOTE: this clamp runs after the
            # remainder adjustment above, so if it ever raises a value the
            # prisons of that state will sum to more than state_total.
            prison_pop = max(50, prison_pops[i])

            # Prison-level gender split; women's prisons hold no male
            # prisoners, every other prison uses the national male ratio.
            if 'Women' in prison_name:
                prison_male = 0
                prison_female = prison_pop
            else:
                prison_male = int(prison_pop * male_ratio[date_idx])
                prison_female = prison_pop - prison_male

            # Age groups: the last group is the remainder so they sum to
            # prison_pop.
            prison_young = int(prison_pop * young_ratio[date_idx])
            prison_middle = int(prison_pop * middle_ratio[date_idx])
            prison_old = prison_pop - prison_young - prison_middle

            # Crime types: "other" is the remainder so they sum to prison_pop.
            prison_drug = int(prison_pop * drug_crimes_ratio[date_idx])
            prison_violent = int(prison_pop * violent_crimes_ratio[date_idx])
            prison_property = int(prison_pop * property_crimes_ratio[date_idx])
            prison_other = prison_pop - prison_drug - prison_violent - prison_property

            state_data_list.append({
                'date': date,
                'state': state,
                'prison_name': prison_name,
                'prison_population': prison_pop,
                'male_prisoners': prison_male,
                'female_prisoners': prison_female,
                'young_prisoners': prison_young,
                'middle_prisoners': prison_middle,
                'old_prisoners': prison_old,
                'drug_crimes': prison_drug,
                'violent_crimes': prison_violent,
                'property_crimes': prison_property,
                'other_crimes': prison_other
            })

# Create detailed prison dataframe
prison_detail_data = pd.DataFrame(state_data_list)
prison_detail_data.to_csv('data/prison_detail_data.csv', index=False)

print("Prison Detail Data Created:")
print(f"Total Records: {len(prison_detail_data)}")
print(f"Prisons: {prison_detail_data['prison_name'].nunique()}")
print(f"States: {prison_detail_data['state'].nunique()}")
print(f"Time Periods: {prison_detail_data['date'].nunique()}")
print("File: data/prison_detail_data.csv")

# Show sample
prison_detail_data.head()

## 11. Generate Staffing Data

In [None]:
# Staffing model: total staff follows the prisoner population through a noisy
# staff-to-prisoner ratio and is then split into role categories.
# NOTE: the random draws are made in a fixed order; keep it to reproduce the
# same data for a given seed.
base_ratio = 0.28                                # Staff-to-prisoner ratio (28%)
ratio_variation = np.random.normal(0, 0.02, 84)  # Month-to-month jitter
staff_ratio = base_ratio + ratio_variation

total_staff = (total_prisoners * staff_ratio).astype(int)

# Category shares (drawn before any head-counts are derived).
security_staff_ratio = np.random.normal(0.65, 0.03, 84)  # Centred on 65%
admin_staff_ratio = np.random.normal(0.15, 0.02, 84)     # Centred on 15%
medical_staff_ratio = np.random.normal(0.08, 0.01, 84)   # Centred on 8%

# Turn each share into an integer head-count; "other" takes the remainder so
# the four categories add up to total_staff.
security_staff, admin_staff, medical_staff = (
    (total_staff * category_share).astype(int)
    for category_share in (security_staff_ratio, admin_staff_ratio, medical_staff_ratio)
)
other_staff = total_staff - security_staff - admin_staff - medical_staff

# Staff availability factors
overtime_hours = np.maximum(np.random.normal(120, 20, 84), 60)  # Floor of 60 hours

sick_leave_rate = np.random.normal(0.08, 0.02, 84)  # Centred on 8%
vacation_rate = np.random.normal(0.12, 0.02, 84)    # Centred on 12%

# Staff remaining once sick leave and vacation are taken out.
available_staff = total_staff * (1 - sick_leave_rate - vacation_rate)

# Column order here is the column order of the CSV.
staffing_columns = {
    'date': dates,
    'total_staff': total_staff,
    'security_staff': security_staff,
    'admin_staff': admin_staff,
    'medical_staff': medical_staff,
    'other_staff': other_staff,
    'overtime_hours': overtime_hours,
    'sick_leave_rate': sick_leave_rate,
    'vacation_rate': vacation_rate,
    'available_staff': available_staff.astype(int),
    'staff_prisoner_ratio': staff_ratio,
}
staffing_data = pd.DataFrame(staffing_columns)

staffing_data.to_csv('data/staffing_data.csv', index=False)

print("Staffing Data Created:")
print(f"Records: {staffing_data.shape[0]}")
print(f"Average Total Staff: {total_staff.mean():.0f}")
print(f"Average Staff-Prisoner Ratio: {staff_ratio.mean():.1%}")
print("File: data/staffing_data.csv")

# Last expression of the cell: shows the first rows in the notebook output.
staffing_data.head()

## 12. Generate Resource and Cost Data

In [None]:
# Resource and cost model, one value per month, derived from the national
# prisoner population.
# NOTE: the random draws are made in a fixed order; keep it to reproduce the
# same data for a given seed.
total_capacity = 95000  # System-wide capacity used for the utilization figure
capacity_utilization = (total_prisoners / total_capacity) * 100  # In percent

# Daily cost per prisoner in MYR: fixed base plus Gaussian variation.
base_daily_cost = 35
daily_cost_variation = np.random.normal(0, 2, 84)
daily_cost_per_prisoner = base_daily_cost + daily_cost_variation

# Monthly total assumes a flat 30-day month, then is split by fixed shares
# (40% + 15% + 20% + 25% = 100%).
total_monthly_cost = total_prisoners * daily_cost_per_prisoner * 30
monthly_food_cost = total_monthly_cost * 0.40
monthly_medical_cost = total_monthly_cost * 0.15
monthly_utility_cost = total_monthly_cost * 0.20
monthly_other_cost = total_monthly_cost * 0.25

# Operational efficiency metrics
energy_efficiency = np.random.normal(0.75, 0.05, 84)  # Centred on 75%
food_waste_rate = np.random.normal(0.13, 0.02, 84)    # Centred on 13%
maintenance_share = np.random.normal(0.08, 0.01, 84)  # Centred on 8% of total
maintenance_cost = total_monthly_cost * maintenance_share

# Additional cost categories, each a random fraction of an existing cost.
medical_supplies_share = np.random.normal(0.3, 0.05, 84)
medical_supplies_cost = monthly_medical_cost * medical_supplies_share
security_equipment_share = np.random.normal(0.02, 0.005, 84)
security_equipment_cost = total_monthly_cost * security_equipment_share

# Column order here is the column order of the CSV.
resource_columns = {
    'date': dates,
    'total_capacity': np.full(84, total_capacity),
    'capacity_utilization': capacity_utilization,
    'daily_cost_per_prisoner': daily_cost_per_prisoner,
    'total_monthly_cost': total_monthly_cost,
    'monthly_food_cost': monthly_food_cost,
    'monthly_medical_cost': monthly_medical_cost,
    'monthly_utility_cost': monthly_utility_cost,
    'monthly_other_cost': monthly_other_cost,
    'energy_efficiency': energy_efficiency,
    'food_waste_rate': food_waste_rate,
    'maintenance_cost': maintenance_cost,
    'medical_supplies_cost': medical_supplies_cost,
    'security_equipment_cost': security_equipment_cost,
}
resource_data = pd.DataFrame(resource_columns)

resource_data.to_csv('data/resource_data.csv', index=False)

print("Resource Data Created:")
print(f"Records: {resource_data.shape[0]}")
print(f"Average Monthly Cost: MYR {total_monthly_cost.mean()/1000000:.1f}M")
print(f"Average Daily Cost per Prisoner: MYR {daily_cost_per_prisoner.mean():.0f}")
print(f"Average Capacity Utilization: {capacity_utilization.mean():.1f}%")
print("File: data/resource_data.csv")

# Last expression of the cell: shows the first rows in the notebook output.
resource_data.head()

## 13. Data Visualization

In [None]:
# Draw a 2x3 dashboard of the generated series. Every panel shares the same
# x-axis label and grid styling, so that part lives in a small helper.
def _finish_panel(title, ylabel, show_legend=False):
    """Apply title, axis labels, optional legend and grid to the current axes."""
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(ylabel)
    if show_legend:
        plt.legend()
    plt.grid(True, alpha=0.3)


plt.figure(figsize=(15, 10))

# Panel 1: population trend
plt.subplot(2, 3, 1)
plt.plot(dates, total_prisoners, 'b-', linewidth=2)
_finish_panel('Total Prison Population Over Time', 'Prisoners')

# Panel 2: gender distribution
plt.subplot(2, 3, 2)
for gender_series, gender_fmt, gender_label in (
    (male_prisoners, 'b-', 'Male'),
    (female_prisoners, 'r-', 'Female'),
):
    plt.plot(dates, gender_series, gender_fmt, label=gender_label, linewidth=2)
_finish_panel('Gender Distribution', 'Prisoners', show_legend=True)

# Panel 3: crime types (the "other" category is not plotted)
plt.subplot(2, 3, 3)
for crime_series, crime_fmt, crime_label in (
    (drug_crimes, 'g-', 'Drug Crimes'),
    (violent_crimes, 'r-', 'Violent Crimes'),
    (property_crimes, 'b-', 'Property Crimes'),
):
    plt.plot(dates, crime_series, crime_fmt, label=crime_label, linewidth=2)
_finish_panel('Crime Type Distribution', 'Prisoners', show_legend=True)

# Panel 4: staffing levels
plt.subplot(2, 3, 4)
plt.plot(dates, total_staff, 'purple', linewidth=2)
_finish_panel('Total Staff Over Time', 'Staff Members')

# Panel 5: monthly costs, scaled to millions of MYR
plt.subplot(2, 3, 5)
plt.plot(dates, total_monthly_cost/1000000, 'orange', linewidth=2)
_finish_panel('Monthly Costs (Million MYR)', 'Cost (Million MYR)')

# Panel 6: capacity utilization with a reference line at full capacity
plt.subplot(2, 3, 6)
plt.plot(dates, capacity_utilization, 'brown', linewidth=2)
plt.axhline(y=100, color='red', linestyle='--', label='100% Capacity')
_finish_panel('Capacity Utilization (%)', 'Utilization %', show_legend=True)

plt.tight_layout()
plt.show()

print("Data visualization completed!")

## 14. Data Summary and Validation

In [None]:
# Print a human-readable recap of everything generated above and confirm that
# each expected output file is present on disk.
banner = "=" * 60
print(banner)
print("MALAYSIA PRISON DATA GENERATION COMPLETE")
print(banner)

prison_count = sum(len(names) for names in malaysia_prisons.values())
period_start = dates[0].strftime('%Y-%m-%d')
period_end = dates[-1].strftime('%Y-%m-%d')

print("\n📊 Dataset Overview:")
print(f"   Total Prisons: {prison_count}")
print(f"   States: {len(malaysia_prisons)}")
print(f"   Time Period: {period_start} to {period_end}")
print(f"   Data Points: {len(dates)} months")

print("\n👥 Population Statistics:")
print(f"   Average Population: {total_prisoners.mean():.0f} prisoners")
print(f"   Population Range: {total_prisoners.min():.0f} - {total_prisoners.max():.0f}")
print(f"   Male Ratio: {male_ratio.mean():.1%}")
print(f"   Drug Crime Ratio: {drug_crimes_ratio.mean():.1%}")

print("\n👮 Staffing Statistics:")
print(f"   Average Staff: {total_staff.mean():.0f} staff members")
print(f"   Staff-Prisoner Ratio: {staff_ratio.mean():.1%}")
print(f"   Security Staff: {security_staff_ratio.mean():.1%}")

print("\n💰 Cost Statistics:")
print(f"   Average Monthly Cost: MYR {total_monthly_cost.mean()/1000000:.1f}M")
print(f"   Average Daily Cost per Prisoner: MYR {daily_cost_per_prisoner.mean():.0f}")
print(f"   Average Capacity Utilization: {capacity_utilization.mean():.1f}%")

print("\n📁 Files Created:")
files_created = [
    'data/population_data.csv',
    'data/prison_detail_data.csv',
    'data/staffing_data.csv',
    'data/resource_data.csv',
    'data/malaysia_prisons.json',
]

# Report each expected file with its size, or flag it when it is missing.
for file_path in files_created:
    if not os.path.exists(file_path):
        print(f"   ❌ {file_path} (not found)")
        continue
    file_size = os.path.getsize(file_path) / 1024  # KB
    print(f"   ✓ {file_path} ({file_size:.1f} KB)")

print("\n🎉 Data generation completed successfully!")
print("▶️  You can now run: streamlit run app.py")

## 15. Quick Data Validation

In [None]:
# Validate data integrity: each national breakdown must add up exactly to the
# integer national total, and the prison-level data for the latest month must
# roughly match the national figure for that month.
print("Data Validation Checks:")
print("=" * 30)

total_headcount = total_prisoners.astype(int)

# Each *_check array holds the per-month signed difference between a
# breakdown's sum and the national total. The largest ABSOLUTE difference is
# reported, so that a shortfall (negative difference) is caught as well as an
# excess; a plain max() would report 0 when every mismatch is negative.

# Check population totals
pop_check = (male_prisoners + female_prisoners) - total_headcount
pop_mismatch = np.abs(pop_check).max()
print(f"Population sum check: {pop_mismatch} (should be 0)")

# Check crime totals
crime_check = (drug_crimes + violent_crimes + property_crimes + other_crimes) - total_headcount
crime_mismatch = np.abs(crime_check).max()
print(f"Crime sum check: {crime_mismatch} (should be 0)")

# Check age totals
age_check = (young_prisoners + middle_prisoners + old_prisoners) - total_headcount
age_mismatch = np.abs(age_check).max()
print(f"Age sum check: {age_mismatch} (should be 0)")

# Check state distribution: sum the prison populations of the latest month
# and compare against the national total of the same month.
latest_prison_data = prison_detail_data[prison_detail_data['date'] == prison_detail_data['date'].max()]
state_totals = latest_prison_data.groupby('state')['prison_population'].sum()
grand_total = state_totals.sum()
latest_national = population_data[population_data['date'] == population_data['date'].max()]['total_prisoners'].iloc[0]

state_difference = abs(grand_total - latest_national)
print(f"State distribution check: {grand_total} vs {latest_national} (difference: {state_difference})")

# The breakdown sums must match exactly; the state roll-up is allowed a small
# tolerance (fewer than 100 prisoners) because per-state and per-prison
# head-counts are truncated to integers when they are generated.
if all([pop_mismatch == 0, crime_mismatch == 0, age_mismatch == 0, state_difference < 100]):
    print("\n✅ All validation checks passed!")
else:
    print("\n⚠️  Some validation checks failed - please review data generation")

print("\nState Population Distribution (Latest Month):")
for state in sorted(state_totals.index):
    percentage = (state_totals[state] / grand_total) * 100
    print(f"  {state}: {state_totals[state]:,} ({percentage:.1f}%)")