In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as p9
import os

from lib.lib import Import_data

path = Import_data()
# get all files in the directory
files = os.listdir(path)
print(files)

Path to dataset files: /Users/daniel/.cache/kagglehub/datasets/arashnic/earthquake-magnitude-damage-and-impact/versions/6
['csv_household_resources.csv', 'csv_building_structure.csv', 'ward_vdcmun_district_name_mapping.csv', 'mapping.csv', 'csv_building_ownership_and_use.csv', 'csv_building_damage_assessment.csv', 'csv_household_demographics.csv', 'csv_individual_demographics.csv', 'csv_household_earthquake_impact.csv']


In [88]:
building_structure = pd.read_csv(os.path.join(path, "csv_building_structure.csv"))

features = [
    "plinth_area_sq_ft",
    "height_ft_pre_eq",
    "age_building",
    "count_floors_pre_eq",
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "plan_configuration",
    "has_superstructure_adobe_mud",
    "has_superstructure_mud_mortar_stone",
    "has_superstructure_stone_flag",
    "has_superstructure_cement_mortar_stone",
    "has_superstructure_mud_mortar_brick",
    "has_superstructure_cement_mortar_brick",
    "has_superstructure_timber",
    "has_superstructure_bamboo",
    "has_superstructure_rc_non_engineered",
    "has_superstructure_rc_engineered",
    "has_superstructure_other",
]

In [89]:
def categorize_building_age(df):
    # Define category labels
    categories = [
        "New (0-4 years)",
        "Recent (5-19 years)",
        "Established (20-49 years)",
        "Historic (50-99 years)",
        "Antique (100+ years)"
    ]
    
    # Create the new column
    df['building_age_category'] = pd.cut(
        df['age_building'], 
        bins=[0, 5, 20, 50, 100, float('inf')], 
        labels=categories, 
        right=False
    )
    
    return df['building_age_category']

In [90]:
data = building_structure[features].copy()

data["age_building"] = categorize_building_age(building_structure)

In [91]:
def bin_column(df, col):
    mean = df[col].mean()
    std_dev = df[col].std()

    min_edge = max(0, mean - 5 * std_dev)  
    bin_edges = [min_edge] + [mean + i * std_dev for i in range(-4, 6)]
    bin_edges = sorted(list(set(bin_edges))) 

    labels = [f"{bin_edges[i]:.0f} to {bin_edges[i+1]:.0f}" for i in range(len(bin_edges)-1)]


    df[col + "_bins"] = pd.cut(
        df[col], 
        bins=bin_edges, 
        labels=labels, 
        include_lowest=True
    )

    return df[col + "_bins"]

In [92]:
data["plinth_area_sq_ft"] = bin_column(data, "plinth_area_sq_ft")
data.drop("plinth_area_sq_ft", axis=1, inplace=True)
data["height_ft_pre_eq"] = bin_column(data, "height_ft_pre_eq")
data.drop("height_ft_pre_eq", axis=1, inplace=True)

In [93]:
data.columns

Index(['age_building', 'count_floors_pre_eq', 'land_surface_condition',
       'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'plinth_area_sq_ft_bins', 'height_ft_pre_eq_bins'],
      dtype='object')

In [94]:
from kmodes.kmodes import KModes

def create_building_presets(data, n_clusters=10):
    """
    Create scientifically-based building presets using clustering on structural features
    """
    # Select only the structural features that impact earthquake damage
    structural_features = ['age_building', 'count_floors_pre_eq', 'land_surface_condition',
       'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'plinth_area_sq_ft_bins', 'height_ft_pre_eq_bins']

    age_features = ['age_building']
    building_size_features = ['plinth_area_sq_ft_bins', 'height_ft_pre_eq_bins', 'count_floors_pre_eq']
    building_material_features = [
       'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       ]

    building_data = data[building_material_features].dropna()

    km = KModes(n_clusters=n_clusters, init='random', n_init=10, verbose=1)
    clusters = km.fit_predict(building_data[building_material_features])
    centroids = km.cluster_centroids_
    
    # Add cluster labels to the data
    building_data['cluster'] = clusters
    
    # Analyze each cluster
    cluster_stats = {}
    for i in range(n_clusters):
        cluster_i = building_data[building_data['cluster'] == i]
        
        # Get mode for categorical variables and mean for numerical
        cluster_profile = {}
        for feat in building_material_features:
            cluster_profile[feat] = cluster_i[feat].mode()[0]

        # Also get the damage grade distribution (if in data)
        if 'damage_grade' in data.columns:
            damage_dist = data.loc[cluster_i.index, 'damage_grade'].value_counts(normalize=True)
            cluster_profile['damage_grade_distribution'] = damage_dist.to_dict()
            # Get the most common damage grade
            cluster_profile['most_common_damage'] = damage_dist.idxmax()
            
        cluster_stats[f"Building_Type_{i+1}"] = cluster_profile
    
    structure = cluster_stats

    building_data = data[building_size_features].dropna()

    km = KModes(n_clusters=n_clusters, init='random', n_init=10, verbose=1)
    clusters = km.fit_predict(building_data[building_size_features])
    centroids = km.cluster_centroids_
    
    # Add cluster labels to the data
    building_data['cluster'] = clusters
    
    # Analyze each cluster
    cluster_stats = {}
    for i in range(n_clusters):
        cluster_i = building_data[building_data['cluster'] == i]
        
        # Get mode for categorical variables and mean for numerical
        cluster_profile = {}
        for feat in building_size_features:
            cluster_profile[feat] = cluster_i[feat].mode()[0]

        # Also get the damage grade distribution (if in data)
        if 'damage_grade' in data.columns:
            damage_dist = data.loc[cluster_i.index, 'damage_grade'].value_counts(normalize=True)
            cluster_profile['damage_grade_distribution'] = damage_dist.to_dict()
            # Get the most common damage grade
            cluster_profile['most_common_damage'] = damage_dist.idxmax()
            
        cluster_stats[f"Building_Type_{i+1}"] = cluster_profile
    
    size = cluster_stats

    return structure, size

structure, size = create_building_presets(data.sample(2000).copy(), n_clusters=6)



# display(pd.DataFrame(size))
print(pd.DataFrame(size).to_latex())


Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 94, cost: 2448.0
Run 1, iteration: 2/100, moves: 7, cost: 2448.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 228, cost: 2140.0
Run 2, iteration: 2/100, moves: 78, cost: 2118.0
Run 2, iteration: 3/100, moves: 29, cost: 2118.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 238, cost: 2069.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 224, cost: 2556.0
Run 4, iteration: 2/100, moves: 3, cost: 2556.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 243, cost: 2335.0
Run 5, iteration: 2/100, moves: 16, cost: 2335.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 6, iteration: 1/100, mov

In [95]:
person_table = pd.read_csv(os.path.join(path, "csv_household_demographics.csv"))
person_table['education_level_household_head'].value_counts()


education_level_household_head
Illiterate                    263157
Non-formal education          157724
Class 5                        45267
SLC or equivalent              42988
Intermediate or equivalent     29181
Class 4                        27836
Class 8                        27520
Class 3                        26217
Class 10                       24097
Class 2                        22331
Class 7                        21460
Class 6                        17099
Class 9                        12184
Bachelors or equivalent        10925
Class 1                         9996
Masters or equivalent           4507
Other                           2838
Nursery/K.G./Kindergarten       1741
Ph.D. or equivalent               69
Name: count, dtype: int64

In [96]:
def convert_income(income):
    mapping = {
        "Rs. 10 thousand": 10000,
        "Rs. 10-20 thousand": 15000,
        "Rs. 20-30 thousand": 25000,
        "Rs. 30-50 thousand": 40000,
        "Rs. 50 thousand or more": 50000
    }
    return mapping.get(income, None)  # Convert or return None if not found


def create_household_presets(data, n_clusters=10):
    data = data.drop(columns=["household_id", "district_id", "vdcmun_id", "ward_id"])
    data = data.dropna()
    data["income_level_household"] = data["income_level_household"].apply(convert_income)
    print("income Level avg :", data['income_level_household'].mean())
    #data = data[(data['education_level_household_head'] == 'Bachelors or equivalent') | 
    #        (data['education_level_household_head'] == 'Masters or equivalent')]

    categorical_features = ['gender_household_head', 'caste_household', 'education_level_household_head']
    numerical_features = ['income_level_household', 'size_household', 'age_household_head']
    km = KModes(n_clusters=n_clusters, init='random', n_init=10)
    clusters = km.fit_predict(data)
    #centroids = km.cluster_centroids_
    person_features = data.columns
    # Add cluster labels to the data
    data['cluster'] = clusters
    
    # Analyze each cluster
    cluster_stats = {}
    for i in range(n_clusters):
        cluster_i = data[data['cluster'] == i]
        
        # Get mode for categorical variables and mean for numerical
        cluster_profile = {}
        for feat in person_features:
            if feat in categorical_features:
                mode_value = cluster_i[feat].mode()
                cluster_profile[feat] = mode_value.iat[0] if not mode_value.empty else None  # Handle empty mode
            elif feat in numerical_features:
                cluster_profile[feat] = cluster_i[feat].mean()  # Use mean for numerical
            else:
                cluster_profile[feat] = "Unknown Type"  # Catch any unexpected cases


        # Also get the damage grade distribution (if in data)
        if 'damage_grade' in data.columns:
            damage_dist = data.loc[cluster_i.index, 'damage_grade'].value_counts(normalize=True)
            cluster_profile['damage_grade_distribution'] = damage_dist.to_dict()
            # Get the most common damage grade
            cluster_profile['most_common_damage'] = damage_dist.idxmax()
            
        cluster_stats[f"Person_type{i+1}"] = cluster_profile
    
    return cluster_stats

cluster_stats = create_household_presets(person_table.sample(2000).copy(), n_clusters=6)
print(pd.DataFrame(cluster_stats))

income Level avg : 14437.5
                                              Person_type1  Person_type2  \
gender_household_head                                 Male        Female   
age_household_head                               44.417981     45.751121   
caste_household                                   Chhetree        Tamang   
education_level_household_head        Non-formal education    Illiterate   
income_level_household                        13777.602524   13318.38565   
size_household                                    4.681388      4.295964   
is_bank_account_present_in_household          Unknown Type  Unknown Type   

                                      Person_type3          Person_type4  \
gender_household_head                         Male                  Male   
age_household_head                       51.136612             44.121339   
caste_household                             Tamang          Brahman-Hill   
education_level_household_head          Illiterate  Non-form

In [97]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

    

def create_building_presets(data, features, type_of_preset, n_clusters=5):
    """
    Create scientifically-based building presets using clustering on structural features
    """
    
    # Filter only needed columns and drop rows with missing values
    building_data = data[features].dropna()
    
    # Separate numerical and categorical features
    numerical_features = building_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = building_data.select_dtypes(include=['object']).columns.tolist()
    
    # Process categorical features with KModes
    if categorical_features:
        km = KModes(n_clusters=n_clusters, init='Huang', n_init=10, verbose=1)
        clusters = km.fit_predict(building_data[categorical_features])
        centroids = km.cluster_centroids_
    else:
        # If no categorical features, use regular KMeans
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(building_data[numerical_features])
    
    # Add cluster labels to the data
    building_data['cluster'] = clusters
    
    # Analyze each cluster
    cluster_stats = {}
    for i in range(n_clusters):
        cluster_i = building_data[building_data['cluster'] == i]
        
        # Get mode for categorical variables and mean for numerical
        cluster_profile = {}
        for feat in categorical_features:
            cluster_profile[feat] = cluster_i[feat].mode()[0]
        
        for feat in numerical_features:
            cluster_profile[feat] = cluster_i[feat].mean()
            
        # Also get the damage grade distribution (if in data)
        if 'damage_grade' in data.columns:
            damage_dist = data.loc[cluster_i.index, 'damage_grade'].value_counts(normalize=True)
            cluster_profile['damage_grade_distribution'] = damage_dist.to_dict()
            # Get the most common damage grade
            cluster_profile['most_common_damage'] = damage_dist.idxmax()
            
        cluster_stats[f"Building_Type_{i+1}"] = cluster_profile
    

    if type_of_preset == 'building_type':
        # Calculate price tiers based on materials and size
        # This can be refined based on your exact requirements
        for building_type, profile in cluster_stats.items():
            # Simple pricing model based on:
            # 1. Floor area
            # 2. Number of floors
            # 3. Higher quality materials (RC, cement vs mud, etc.)
            
            base_price = profile.get('plinth_area_sq_ft', 300) * 100  # Base price per sq ft
            
            # Adjust for number of floors
            floor_multiplier = profile.get('count_floors_pre_eq', 1) * 0.3 + 1
            
            # Adjust for material quality
            material_multiplier = 1.0
            if profile.get('has_superstructure_rc_engineered', 0) > 0.5:
                material_multiplier = 2.0
            elif profile.get('has_superstructure_rc_non_engineered', 0) > 0.5:
                material_multiplier = 1.8
            elif profile.get('has_superstructure_cement_mortar_brick', 0) > 0.5:
                material_multiplier = 1.5
            
            # Calculate final price
            profile['estimated_cost'] = base_price * floor_multiplier * material_multiplier
            
            # Create a budget tier based on cost
            if profile['estimated_cost'] < 50000:
                profile['budget_tier'] = 'Low'
            elif profile['estimated_cost'] < 100000:
                profile['budget_tier'] = 'Medium'
            else:
                profile['budget_tier'] = 'High'

    elif type_of_preset == 'person_type':
        pass


    return cluster_stats

def create_household_budget_tiers(data):
    """
    Create household profiles that determine budget constraints
    """
    # Features related to household economics
    household_features = [
        'income_level_household', 'education_level_household_head',
        'is_bank_account_present_in_household'
    ]
    
    # Create budget profiles based on household characteristics
    budget_profiles = {
        "Low_Income_Household": {
            "income_level_household": "Low income",
            "budget_multiplier": 0.7,
            "description": "Limited resources, may need to prioritize basic construction"
        },
        "Middle_Income_Household": {
            "income_level_household": "Medium income",
            "is_bank_account_present_in_household": 1,
            "budget_multiplier": 1.0,
            "description": "Can afford standard construction techniques"
        },
        "High_Income_Household": {
            "income_level_household": "High income",
            "education_level_household_head": "Higher education",
            "budget_multiplier": 1.5,
            "description": "Can afford premium materials and engineering"
        }
    }
    
    return budget_profiles

def analyze_damage_by_features(data):
    """
    Analyze which features most strongly correlate with damage outcomes
    """
    if 'damage_grade' not in data.columns:
        return "Damage grade data not available"
    
    # Convert damage_grade to numeric if it's categorical
    if data['damage_grade'].dtype == 'object':
        damage_map = {'Grade 1': 1, 'Grade 2': 2, 'Grade 3': 3, 
                      1: 1, 2: 2, 3: 3,
                      'Slight': 1, 'Moderate': 2, 'Heavy': 3}
        data['damage_numeric'] = data['damage_grade'].map(damage_map)
    else:
        data['damage_numeric'] = data['damage_grade']
    
    # Analyze numerical features
    numerical_features = [
        'count_floors_pre_eq', 'plinth_area_sq_ft', 'height_ft_pre_eq'
    ]
    
    numerical_impact = {}
    for feature in numerical_features:
        if feature in data.columns:
            # Calculate mean damage by feature bins
            data[f'{feature}_bin'] = pd.qcut(data[feature], q=5, duplicates='drop')
            damage_by_bin = data.groupby(f'{feature}_bin')['damage_numeric'].mean()
            numerical_impact[feature] = damage_by_bin.to_dict()
    
    # Analyze categorical features
    categorical_features = [
        'foundation_type', 'roof_type', 'ground_floor_type',
        'has_superstructure_mud_mortar_stone', 'has_superstructure_rc_engineered'
    ]
    
    categorical_impact = {}
    for feature in categorical_features:
        if feature in data.columns:
            damage_by_category = data.groupby(feature)['damage_numeric'].mean()
            categorical_impact[feature] = damage_by_category.to_dict()
    
    return {
        'numerical_impact': numerical_impact,
        'categorical_impact': categorical_impact
    }

def simulate_game(data, building_presets, budget_profiles):
    """
    Simulate the game flow with the presets
    """
    # Step 1: Player selects household profile (determines budget)
    print("Step 1: Select your household type (determines your budget)")
    for i, (profile_name, profile) in enumerate(budget_profiles.items()):
        print(f"{i+1}. {profile_name.replace('_', ' ')}: {profile['description']}")
    
    # In a real game, player would select here
    selected_profile = list(budget_profiles.keys())[1]  # Middle income for simulation
    player_budget = 100000 * budget_profiles[selected_profile]['budget_multiplier']
    print(f"\nSelected: {selected_profile.replace('_', ' ')}")
    print(f"Your budget: ${player_budget:,.2f}")
    
    # Step 2: Show building options that fit within budget
    print("\nStep 2: Select your building type")
    affordable_options = []
    
    for building_name, building in building_presets.items():
        cost = building['estimated_cost']
        if cost <= player_budget:
            affordable_options.append((building_name, building))
            damage_risk = building.get('most_common_damage', 'Unknown')
            print(f"{len(affordable_options)}. {building_name.replace('_', ' ')} - Cost: ${cost:,.2f}")
            print(f"   Features: {building.get('count_floors_pre_eq', 1):.0f} floors, "
                  f"{building.get('plinth_area_sq_ft', 0):.0f} sq ft")
            
            # Show the primary construction material
            material = "Traditional"
            if building.get('has_superstructure_rc_engineered', 0) > 0.5:
                material = "Engineered Reinforced Concrete"
            elif building.get('has_superstructure_rc_non_engineered', 0) > 0.5:
                material = "Reinforced Concrete"
            elif building.get('has_superstructure_cement_mortar_brick', 0) > 0.5:
                material = "Cement-Mortar Brick"
            print(f"   Primary Material: {material}")
    
    # Player selects a building
    selected_building_idx = 0  # In real game, player would choose
    selected_building = affordable_options[selected_building_idx][0]
    building_details = affordable_options[selected_building_idx][1]
    
    print(f"\nSelected: {selected_building.replace('_', ' ')}")
    
    # Step 3: Simulate earthquake and show outcome
    if 'damage_grade_distribution' in building_details:
        damage_dist = building_details['damage_grade_distribution']
        most_likely_damage = max(damage_dist.items(), key=lambda x: x[1])
        
        print("\nEarthquake Simulation Result:")
        print(f"Your building experienced {most_likely_damage[0]} damage")
        print("Damage probability distribution:")
        for grade, prob in damage_dist.items():
            print(f"  {grade}: {prob*100:.1f}%")
    
    return {
        'household': selected_profile,
        'building': selected_building,
        'budget': player_budget,
        'cost': building_details['estimated_cost'],
        'outcome': building_details.get('most_common_damage', 'Unknown')
    }

# Example usage:
# building_presets = create_building_presets(data)
# budget_profiles = create_household_budget_tiers(data)
# feature_impact = analyze_damage_by_features(data)
# game_result = simulate_game(data, building_presets, budget_profiles)

In [98]:
building_features = [
        'count_floors_pre_eq', 'plinth_area_sq_ft_bins', 'height_ft_pre_eq_bins', 
        'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type',
        'has_superstructure_mud_mortar_stone', 'has_superstructure_cement_mortar_stone',
        'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
        'has_superstructure_timber', 'has_superstructure_bamboo',
        'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered']
person_features = []
building_presets = create_building_presets(data, building_features, "building_type")
budget_profiles = create_household_budget_tiers(data)
feature_impact = analyze_damage_by_features(data)
game_result = simulate_game(data, building_presets, budget_profiles)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 13619, cost: 371168.0
Run 1, iteration: 2/100, moves: 8556, cost: 371168.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 162851, cost: 424583.0
Run 2, iteration: 2/100, moves: 123745, cost: 424583.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 47289, cost: 495574.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 110197, cost: 335407.0
Run 4, iteration: 2/100, moves: 13021, cost: 335407.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 98320, cost: 431763.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 6, iteration: 1/100, moves: 4943, cost: 425758.0
Init: initializing centroids
Init

In [99]:
# Use KModes to cluster the data
from kmodes.kmodes import KModes

km = KModes(n_clusters=5, init='random', n_init=5, verbose=1, n_jobs=2)
clusters = km.fit_predict(data.dropna())
clusters


Init: initializing centroids
Init: initializing centroids
Init: initializing clusters
Init: initializing clusters
Starting iterations...
Starting iterations...
Run 2, iteration: 1/100, moves: 82336, cost: 2493969.0
Init: initializing centroids
Init: initializing clusters
Run 1, iteration: 1/100, moves: 210154, cost: 2209713.0
Starting iterations...
Run 1, iteration: 2/100, moves: 16360, cost: 2209713.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 147055, cost: 2266358.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 253271, cost: 2193156.0
Run 4, iteration: 2/100, moves: 68906, cost: 2193156.0
Run 5, iteration: 1/100, moves: 234263, cost: 2158837.0
Run 5, iteration: 2/100, moves: 34440, cost: 2137493.0
Run 5, iteration: 3/100, moves: 3796, cost: 2137493.0
Best run was number 5


array([3, 3, 3, ..., 2, 1, 1], dtype=uint16)

In [100]:

# need to find typical feature values for each cluster
cluster_data = data.dropna().copy()
cluster_data["cluster"] = clusters
cluster_data = cluster_data.set_index("cluster")
cluster_data

# get the mode of each cluster
cluster_modes = cluster_data.groupby('cluster').agg(lambda x: x.mode().iloc[0])
cluster_modes.T


cluster,0,1,2,3,4
age_building,Recent (5-19 years),Recent (5-19 years),Established (20-49 years),Recent (5-19 years),Recent (5-19 years)
count_floors_pre_eq,1,2,2,1,2
land_surface_condition,Flat,Flat,Flat,Flat,Flat
foundation_type,RC,Mud mortar-Stone/Brick,Mud mortar-Stone/Brick,Mud mortar-Stone/Brick,Bamboo/Timber
roof_type,RCC/RB/RBC,Bamboo/Timber-Light roof,Bamboo/Timber-Light roof,Bamboo/Timber-Light roof,Bamboo/Timber-Heavy roof
ground_floor_type,RC,Mud,Mud,Mud,Mud
other_floor_type,Not applicable,TImber/Bamboo-Mud,TImber/Bamboo-Mud,Not applicable,Timber-Planck
plan_configuration,Rectangular,Rectangular,Rectangular,Rectangular,Rectangular
has_superstructure_adobe_mud,0,0,0,0,0
has_superstructure_mud_mortar_stone,0,1,1,1,0


In [101]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your DataFrame is named df
plt.figure(figsize=(10, 6))
sns.histplot(df_one_hot['plinth_area_sq_ft'], kde=False, bins=300)  # adjust bins as needed
plt.title('Distribution of Plinth Area (sq ft)')
plt.xlabel('Plinth Area (sq ft)')
plt.ylabel('Frequency')
plt.show()

NameError: name 'df_one_hot' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
k = 0
b = 0
for i in df_one_hot["plinth_area_sq_ft"]:
    if i < 10:
        if i > b:
            b = i
        k += 1
        print(i)

print("Count:", k)
print("Biggest:", b)

Count: 0
Biggest: 0


In [None]:
# Conclustion, restrict the plinth area to 0-1500 sq ft
df_one_hot.query("plinth_area_sq_ft <= 1500", inplace=True)

In [None]:
df_one_hot["damage_grade"].value_counts()

damage_grade
Grade 5    274079
Grade 4    182711
Grade 3    135183
Grade 2     86137
Grade 1     77153
Name: count, dtype: int64

In [None]:
# Transform the plinth area column so that the values are normilzed
# divide it by 1500
df_one_hot["plinth_area_sq_ft"] = df_one_hot["plinth_area_sq_ft"] / 1500

In [None]:
df_one_hot["plinth_area_sq_ft"]

0         0.192000
1         0.242667
2         0.256000
3         0.208000
4         0.205333
            ...   
762101    0.110000
762102    0.228000
762103    0.228000
762104    0.204000
762105    0.560000
Name: plinth_area_sq_ft, Length: 755274, dtype: float64

In [None]:
# plot the distribution of the building age
plt.figure(figsize=(10, 6))
sns.histplot(df_one_hot['age_building'], kde=False, bins=200)  # adjust bins as needed
plt.title('Distribution of Building Age')
plt.xlabel('Building Age')
plt.ylabel('Frequency')
plt.show()

Unnamed: 0,household_id,district_id,vdcmun_id,ward_id,has_asset_land_pre_eq,has_asset_tv_pre_eq,has_asset_cable_pre_eq,has_asset_computer_pre_eq,has_asset_internet_pre_eq,has_asset_telephone_pre_eq,...,has_asset_computer_post_eq,has_asset_internet_post_eq,has_asset_telephone_post_eq,has_asset_mobile_phone_post_eq,has_asset_fridge_post_eq,has_asset_motorcycle_post_eq,has_asset_four_wheeler_family_use_post_eq,has_asset_four_wheeler_commercial_post_eq,has_asset_none_post_eq,cluster_id,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41
0,28687230000000.0,29.439222,2949.940137,294999.450752,0.953159,0.308153,0.261499,0.045002,0.05538233,0.022911,...,0.042799,0.05549,0.02169,0.841983,0.04744,0.062951,0.002590833,0.006655,0.006146462,0,1.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False,,,
1,23841300000000.0,22.652925,2271.09116,227114.526169,0.954278,0.305976,0.257898,0.04509,0.05493436,0.022918,...,0.04288,0.055026,0.021725,0.841144,0.046498,0.062885,0.002590067,0.006659,0.006130443,1,15.0,0.242667,0.0,1.0,0.0,0.0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2,27545410000000.0,27.255595,2730.858355,273091.780828,0.927349,0.767712,0.896338,0.632332,0.489785,0.316154,...,0.621508,0.485153,0.309742,0.983934,0.643157,0.674265,0.0467112,0.082793,-1.905594e-15,2,,,,,,,,,,,,,,,,,,,,
3,26406050000000.0,26.091795,2614.524777,261458.437652,0.009748,0.014013,0.010154,0.001828,-1.228184e-15,0.002234,...,0.000203,0.001015,0.000609,0.007717,0.000203,0.000203,6.245005e-17,0.000203,0.9900487,3,,,,,,,,,,,,,,,,,,,,


In [None]:
features = []

Unnamed: 0,household_id,district_id,vdcmun_id,ward_id,has_asset_land_pre_eq,has_asset_tv_pre_eq,has_asset_cable_pre_eq,has_asset_computer_pre_eq,has_asset_internet_pre_eq,has_asset_telephone_pre_eq,...,has_asset_cable_post_eq,has_asset_computer_post_eq,has_asset_internet_post_eq,has_asset_telephone_post_eq,has_asset_mobile_phone_post_eq,has_asset_fridge_post_eq,has_asset_motorcycle_post_eq,has_asset_four_wheeler_family_use_post_eq,has_asset_four_wheeler_commercial_post_eq,has_asset_none_post_eq
0,1.201010e+13,12.0,1207.0,120703.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.201010e+13,12.0,1207.0,120703.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,1.201010e+13,12.0,1207.0,120703.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,1.201010e+13,12.0,1207.0,120703.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,1.201010e+13,12.0,1207.0,120703.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747360,3.667090e+13,36.0,3603.0,360302.0,0.953285,0.308438,0.261637,0.047059,0.056616,0.023919,...,0.242623,0.044823,0.056699,0.022694,0.841673,0.048967,0.064999,0.002742,0.006918,0.006524
747361,3.667090e+13,36.0,3603.0,360302.0,0.953285,0.308438,0.261637,0.047059,0.056616,0.023919,...,0.242623,0.044823,0.056699,0.022694,0.841673,0.048967,0.064999,0.002742,0.006918,0.006524
747362,3.667090e+13,36.0,3603.0,360302.0,0.953285,0.308438,0.261637,0.047059,0.056616,0.023919,...,0.242623,0.044823,0.056699,0.022694,0.841673,0.048967,0.064999,0.002742,0.006918,0.006524
747363,3.667090e+13,36.0,3603.0,360302.0,0.953285,0.308438,0.261637,0.047059,0.056616,0.023919,...,0.242623,0.044823,0.056699,0.022694,0.841673,0.048967,0.064999,0.002742,0.006918,0.006524
