In [3]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [4]:
train_values = pd.read_csv('train_values.csv', index_col='building_id')
train_labels = pd.read_csv('train_labels.csv', index_col='building_id')

In [5]:
test_values = pd.read_csv('test_values.csv', index_col='building_id')

In [6]:
train_values.dtypes

geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_cement_mortar_brick     int64
has_superstructure_t

In [7]:
columnas = ['land_surface_condition',
            'foundation_type',
            'roof_type',
            'ground_floor_type',
            'other_floor_type',
            'position',
            'plan_configuration',
            'legal_ownership_status',
            'geo_level_1_id',
            'geo_level_2_id',
            'geo_level_3_id'];

In [8]:
for row in columnas:
  # Calculamos la frecuencia por cada grado de cada valor de la variable
  Auxiliar = test_values.groupby([row]).size()/len(test_values);

  # Reemplazamos la columna por la columna de frecuencias
  test_values.loc[:,row] = test_values[row].map(Auxiliar);

In [9]:
for row in columnas:
  # Calculamos la frecuencia por cada grado de cada valor de la variable
  Auxiliar = train_values.groupby([row]).size()/len(train_values);

  # Reemplazamos la columna por la columna de frecuencias
  train_values.loc[:,row] = train_values[row].map(Auxiliar);

In [10]:
train_values.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 38 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   geo_level_1_id                          260601 non-null  float64
 1   geo_level_2_id                          260601 non-null  float64
 2   geo_level_3_id                          260601 non-null  float64
 3   count_floors_pre_eq                     260601 non-null  int64  
 4   age                                     260601 non-null  int64  
 5   area_percentage                         260601 non-null  int64  
 6   height_percentage                       260601 non-null  int64  
 7   land_surface_condition                  260601 non-null  float64
 8   foundation_type                         260601 non-null  float64
 9   roof_type                               260601 non-null  float64
 10  ground_floor_type                      

In [11]:
selected_features = ['land_surface_condition',
            'foundation_type',
            'roof_type',
            'ground_floor_type',
            'other_floor_type',
            'position',
            'plan_configuration',
            'legal_ownership_status',
            'geo_level_1_id',
            'geo_level_2_id',
            'geo_level_3_id'];
train_values_subset = train_values[selected_features]

In [12]:
train_values_subset

Unnamed: 0_level_0,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,legal_ownership_status,geo_level_1_id,geo_level_2_id,geo_level_3_id
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
802906,0.831758,0.841117,0.701617,0.804368,0.634234,0.164604,0.959597,0.962924,0.093557,0.001036,0.000142
28830,0.031911,0.841117,0.701617,0.095460,0.634234,0.775477,0.959597,0.962924,0.073215,0.000764,0.000061
94947,0.831758,0.841117,0.701617,0.804368,0.166722,0.164604,0.959597,0.962924,0.057133,0.006754,0.000522
590882,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,0.962924,0.023991,0.000787,0.000119
201944,0.831758,0.841117,0.701617,0.804368,0.166722,0.775477,0.959597,0.962924,0.031542,0.003983,0.000468
...,...,...,...,...,...,...,...,...,...,...,...
688636,0.136331,0.841117,0.701617,0.804368,0.152889,0.775477,0.021842,0.962924,0.021581,0.000445,0.000054
669485,0.831758,0.841117,0.701617,0.804368,0.634234,0.775477,0.959597,0.962924,0.083703,0.001002,0.000188
602512,0.831758,0.841117,0.236285,0.804368,0.634234,0.775477,0.959597,0.962924,0.083703,0.001278,0.000084
151409,0.831758,0.841117,0.062099,0.094370,0.046155,0.050967,0.959597,0.962924,0.086780,0.015495,0.000906


In [13]:
X_train, X_test, y_train, y_test = train_test_split(train_values_subset,train_labels,test_size = 0.2);

In [14]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [15]:
pipe = make_pipeline(StandardScaler(), 
                     RandomForestClassifier(random_state=2018))
pipe

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=2018))])

In [16]:
param_grid = {'randomforestclassifier__n_estimators': [50, 100],
              'randomforestclassifier__min_samples_leaf': [1, 5]}
gs = GridSearchCV(pipe, param_grid, cv=5)

In [None]:
gs.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=2018))]),
             param_grid={'randomforestclassifier__min_samples_leaf': [1, 5],
                         'randomforestclassifier__n_estimators': [50, 100]})

In [24]:
from sklearn.metrics import f1_score

In [25]:
in_sample_preds = gs.predict(X_train)
f1_score(y_train, in_sample_preds, average='micro')

0.7617709132770529

In [26]:
in_sample_preds = gs.predict(X_test)
f1_score(y_test, in_sample_preds, average='micro')

0.7279407532472516

In [27]:
#test_values = pd.read_csv('test_values.csv', index_col='building_id')

In [28]:
test_values_subset = test_values[selected_features]
#test_values_subset = pd.get_dummies(test_values_subset)

In [29]:
predictions = gs.predict(test_values_subset)

In [30]:
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

In [31]:
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [32]:
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,2
890251,2
745817,2
421793,2


In [33]:
my_submission.to_csv('submission.csv')

In [34]:
!head submission.csv

'head' is not recognized as an internal or external command,
operable program or batch file.


In [35]:
my_submission.damage_grade.value_counts()

2    66861
3    16363
1     3644
Name: damage_grade, dtype: int64