In [2]:
import pandas as pd
import numpy as np
import os

### Description of Data (from DrivenData)
There are 39 columns in the dataset. The first column is the `building_id`, and other 38 columns are features about the buildings. All the categorical type variables represented by lowercase characters.



- `geo_level_1_id`, `geo_level_2_id`, `geo_level_3_id` **(type: int)**: geographic region in which building exists, from largest (level 1) to most specific sub-region (level 3). Possible values: level 1: 0-30, level 2: 0-1427, level 3: 0-12567.
- `count_floors_pre_eq` **(type: int)**: number of floors in the building before the earthquake.
- `age` **(type: int)**: age of the building in years.
- `area_percentage` **(type: int)**: normalized area of the building footprint.
- `height_percentage` **(type: int)**: normalized height of the building footprint.
- `land_surface_condition` **(type: categorical)**: surface condition of the land where the building was built. Possible values: n, o, t.
- `foundation_type` **(type: categorical)**: type of foundation used while building. Possible values: h, i, r, u, w.
- `roof_type` **(type: categorical)**: type of roof used while building. Possible values: n, q, x.
- `ground_floor_type` **(type: categorical)**: type of the ground floor. Possible values: f, m, v, x, z.
- `other_floor_type` **(type: categorical)**: type of constructions used in higher than the ground floors (except of roof). Possible values: j, q, s, x.
- `position` **(type: categorical)**: position of the building. Possible values: j, o, s, t.
- `plan_configuration` **(type: categorical)**: building plan configuration. Possible values: a, c, d, f, m, n, o, q, s, u.
- `has_superstructure_adobe_mud` **(type:  binary)**: flag variable that indicates if the superstructure was made of Adobe/Mud.
- `has_superstructure_mud_mortar_stone` **(type:  binary)**: flag variable that indicates if the superstructure was made of Mud Mortar - Stone.
- `has_superstructure_stone_flag` **(type:  binary)**: flag variable that indicates if the superstructure was made of Stone.
- `has_superstructure_cement_mortar_stone` **(type:  binary)**: flag variable that indicates if the superstructure was made of Cement Mortar - Stone.
- `has_superstructure_mud_mortar_brick` **(type:  binary)**: flag variable that indicates if the superstructure was made of Mud Mortar - Brick.
- `has_superstructure_cement_mortar_brick` **(type:  binary)**: flag variable that indicates if the superstructure was made of Cement Mortar - Brick.
- `has_superstructure_timber` **(type:  binary)**: flag variable that indicates if the superstructure was made of Timber.
- `has_superstructure_bamboo` **(type:  binary)**: flag variable that indicates if the superstructure was made of Bamboo.
- `has_superstructure_rc_non_engineered` **(type:  binary)**: flag variable that indicates if the superstructure was made of non-engineered reinforced concrete.
- `has_superstructure_rc_engineered` **(type:  binary)**: flag variable that indicates if the superstructure was made of engineered reinforced concrete.
- `has_superstructure_other` **(type:  binary)**: flag variable that indicates if the superstructure was made of any other material.
- `legal_ownership_status` **(type: categorical)**: legal ownership status of the land where building was built. Possible values: a, r, v, w.
- `count_families` **(type: int)**: number of families that live in the building.
- `has_secondary_use` **(type:  binary)**: flag variable that indicates if the building was used for any secondary purpose.
- `has_secondary_use_agriculture` **(type:  binary)**: flag variable that indicates if the building was used for agricultural purposes.
- `has_secondary_use_hotel` **(type:  binary)**: flag variable that indicates if the building was used as a hotel.
- `has_secondary_use_rental` **(type:  binary)**: flag variable that indicates if the building was used for rental purposes.
- `has_secondary_use_institution` **(type:  binary)**: flag variable that indicates if the building was used as a location of any institution.
- `has_secondary_use_school` **(type:  binary)**: flag variable that indicates if the building was used as a school.
- `has_secondary_use_industry` **(type:  binary)**: flag variable that indicates if the building was used for industrial purposes.
- `has_secondary_use_health_post` **(type:  binary)**: flag variable that indicates if the building was used as a health post.
- `has_secondary_use_gov_office` **(type:  binary)**: flag variable that indicates if the building was used fas a government office.
- `has_secondary_use_use_police` **(type:  binary)**: flag variable that indicates if the building was used as a police station.
- `has_secondary_use_other` **(type:  binary)**: flag variable that indicates if the building was secondarily used for other purposes.

In [9]:
data_dir = "../data/"
features_df = pd.read_csv(os.path.join(data_dir, "train_values.csv"))
labels_df = pd.read_csv(os.path.join(data_dir, "train_labels.csv"))

In [36]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# I am starting out not using all variables
int_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
               'count_floors_pre_eq', 'age', 'area_percentage', 
               'height_percentage']
data_dir = "../data/"
features_df = pd.read_csv(os.path.join(data_dir, "train_values.csv"))
labels_df = pd.read_csv(os.path.join(data_dir, "train_labels.csv"))
categ_columns = ['land_surface_condition', 'foundation_type', 'roof_type',
                 'ground_floor_type', 'other_floor_type', 'position',
                 'plan_configuration',
                 ]
binary_columns = ['has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
                  'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone',
                  'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
                  'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_engineered',
                  'has_superstructure_other'
                  ]



In [39]:
# Create pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# select only the columns we want
X = features_df[ int_columns + categ_columns + binary_columns]
y = labels_df
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Size information:")
print("\t Number of Training Samples: {}".format(X_train.size))
print("\t Number of Validation Samples: {}".format(X_val.size))

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categ_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])


preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, int_columns),
        ('categorical', categ_transformer, categ_columns),
        ('passthrough', 'passthrough', binary_columns)
    ]
)

# preprocess the data
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.fit_transform(X_val)



Size information:
	 Number of Training Samples: 5003520
	 Number of Validation Samples: 1250904
(52121, 51)
[-0.48883051  1.26838563  1.33425667 -1.55599329 -0.08965615 -0.23745165
 -1.26606033  0.          0.          1.          0.          0.
  1.          0.          0.          1.          0.          0.
  1.          0.          0.          0.          0.          1.
  0.          0.          0.          0.          0.          1.
  0.          0.          0.          1.          0.          0.
  0.          0.          0.          0.          0.          0.
  1.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
