In [1]:
import pandas as pd
import numpy as np
import os
import prince

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Column groups referenced by the (currently disabled) ColumnTransformer further down.
# The three lists are mutually exclusive: each feature belongs to exactly one group,
# otherwise the transformer would emit the same column twice.

# numerical: integer-typed columns that are not 0/1 flags
int_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
               'count_floors_pre_eq', 'age', 'area_percentage',
               'height_percentage', 'count_families']

# categorical: object-typed columns (these need an encoder)
categ_columns = ['land_surface_condition', 'foundation_type', 'roof_type',
                 'ground_floor_type', 'other_floor_type', 'position',
                 'plan_configuration', 'legal_ownership_status',
                 ]

# binary: integer-typed has_* flag columns
# NOTE: 'legal_ownership_status' is object-typed and therefore lives only in
# categ_columns; 'has_superstructure_rc_non_engineered' was previously missing here.
binary_columns = ['has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
                  'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone',
                  'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
                  'has_superstructure_timber', 'has_superstructure_bamboo',
                  'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered',
                  'has_superstructure_other',
                  'has_secondary_use', 'has_secondary_use_agriculture',
                  'has_secondary_use_hotel', 'has_secondary_use_rental',
                  'has_secondary_use_institution', 'has_secondary_use_school',
                  'has_secondary_use_industry', 'has_secondary_use_health_post',
                  'has_secondary_use_gov_office', 'has_secondary_use_use_police',
                  'has_secondary_use_other'
                  ]

In [21]:
# Read the training features and their labels from the data directory that
# sits one level above this notebook.
data_dir = "../data/"
features_df, labels_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ("train_values.csv", "train_labels.csv")
)

In [4]:
# Keep only the int64-typed feature columns (this covers both the numeric
# measurements and the 0/1 flags), print a banner, and leave the Index of
# retained column names as the cell's displayed value.
numeric_values = features_df.select_dtypes(include="int64")
print("Numeric Variables", "-------------------------------------------------", sep="\n")
numeric_values.columns

Numeric Variables
-------------------------------------------------


Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other'],
      dtype='object')

In [5]:
# Keep only the object-typed feature columns, print a banner, and leave the
# Index of retained column names as the cell's displayed value.
categorical_values = features_df.select_dtypes(include="object")
print("Categorical Variables", "-------------------------------------------------", sep="\n")
categorical_values.columns

Categorical Variables
-------------------------------------------------


Index(['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status'],
      dtype='object')

In [6]:
# Encoding the categorical variables
# categorical_encoded = pd.get_dummies(categorical_values, drop_first=True)

In [22]:
# Create pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature matrix: the int64-typed columns followed by the object-typed columns,
# joined side by side. The target is the labels frame as loaded.
X = pd.concat(
    [numeric_values, categorical_values],
    axis="columns",
)
y = labels_df

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# print("Size information:")
# print("\t Number of Training Samples: {}".format(X_train.size))
# print("\t Number of Validation Samples: {}".format(X_val.size))

# numeric_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])

# categ_transformer = Pipeline(steps=[
#     ('encoder', OneHotEncoder())
# ])


# preprocessor = ColumnTransformer(
#     transformers=[
#         ('numeric', numeric_transformer, int_columns),
#         ('categorical', categ_transformer, categ_columns),
#         ('passthrough', 'passthrough', binary_columns)
#     ]
# )

# # preprocess the data
# X_train = preprocessor.fit_transform(X_train)
# X_val = preprocessor.transform(X_val)  # transform only: reuse what was fitted on X_train, do not re-fit on validation data

In [23]:
# Set aside the building identifier and the three geo-level ids; the remaining
# columns are what the FAMD below is fitted on.
X_dropped_geo = X.drop(
    columns=['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'],
)


In [36]:
import prince 
# Fit a 10-component FAMD (3 iterations, fixed seed) on the non-geo features,
# then project those same rows onto the fitted components.
famd = prince.FAMD(
    n_components=10,
    n_iter=3,
    random_state=42,
).fit(X_dropped_geo)
reduced_dim = famd.transform(X_dropped_geo)

In [47]:
# Explained inertia of each fitted component, then the total across all of them.
print(
    famd.explained_inertia_,
    np.sum(famd.explained_inertia_),
    sep="\n",
)

[0.08442846 0.05618977 0.0442284  0.04046081 0.03294954 0.02506262
 0.0242953  0.02415883 0.02327745 0.02288433]
0.37793549962784134


In [42]:
# Put the identifier / geo-level columns back next to the FAMD components so
# every row keeps its building_id and location ids.
X_geo = X[[
    'building_id',
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]]
X_reduced = pd.concat([X_geo, reduced_dim], axis="columns")

In [43]:
# Display the combined frame: the four identifier/geo columns followed by the
# FAMD component columns (labelled 0-9).
X_reduced

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,0,1,2,3,4,5,6,7,8,9
0,802906,6,487,12198,-0.485633,0.635320,-0.371859,-0.082445,-0.911475,-0.001641,0.358178,-0.331238,-0.244699,0.038499
1,28830,8,900,2812,-0.375071,0.232567,0.080790,-0.632932,0.154144,-0.276581,-0.130276,-0.003495,0.211666,-0.039807
2,94947,21,363,8973,-0.544199,0.004228,0.028401,-0.506286,-0.073712,-0.135051,0.119371,-0.168377,-0.050177,0.022746
3,590882,22,418,10694,-0.817362,-0.427850,-0.423014,1.215211,1.239559,0.507262,-0.182084,0.100870,-0.011949,0.014509
4,201944,11,131,1488,0.377090,1.529580,-0.855466,0.359871,-0.695013,-0.097116,0.200330,-0.181648,0.013576,0.039707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,25,1335,1621,-0.718045,-1.366810,-0.030507,-0.519403,-0.624889,-0.129805,0.395077,-0.266429,-0.425341,-0.063564
260597,669485,17,715,2060,-0.656637,-0.021517,0.095249,-0.637509,-0.040087,0.081829,-0.038595,-0.039132,-0.004104,-0.045574
260598,602512,17,51,8163,-0.446217,0.876170,0.031373,-0.610406,0.376977,0.183334,-0.198848,0.063673,0.074096,0.043824
260599,151409,26,39,1851,2.968436,-0.440105,-0.829106,-0.665267,-0.387914,0.108498,-0.287910,0.230592,0.293853,0.004812
