<a href="https://colab.research.google.com/github/mohanpartha/ML_preprocessing/blob/master/EarthQuake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np

In [0]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
df_labels = pd.read_csv('/content/drive/My Drive/data/EarthQuake/train_labels.csv',sep=',')
df_train = pd.read_csv('/content/drive/My Drive/data/EarthQuake/train_values.csv',sep=',')
df_submission = pd.read_csv('/content/drive/My Drive/data/EarthQuake/submission_format.csv',sep=',')

In [0]:
#df_train = df_train.set_index('building_id',inplace=True)

In [4]:
df_train.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i

In [67]:
df_train.dtypes

age                                        int64
area_percentage                            int64
count_families                             int64
count_floors_pre_eq                        int64
foundation_type                           object
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
ground_floor_type                         object
has_secondary_use                          int64
has_secondary_use_agriculture              int64
has_secondary_use_gov_office               int64
has_secondary_use_health_post              int64
has_secondary_use_hotel                    int64
has_secondary_use_industry                 int64
has_secondary_use_institution              int64
has_secondary_use_other                    int64
has_secondary_use_rental                   int64
has_secondary_use_school                   int64
has_secondary_use_use_police               int64
has_superstructure_a

In [5]:
df_train['land_surface_condition'].value_counts()

t    216757
n     35528
o      8316
Name: land_surface_condition, dtype: int64

In [0]:
features = df_train.columns.difference(["building_id"])

In [0]:
df_train = df_train[features]

In [69]:
col_for_dummy = df_train.columns[df_train.dtypes == "object"].values
print(col_for_dummy)

['foundation_type' 'ground_floor_type' 'land_surface_condition'
 'legal_ownership_status' 'other_floor_type' 'plan_configuration'
 'position' 'roof_type']


In [0]:
df_X = pd.concat([df_train.drop(columns=col_for_dummy), 
                pd.get_dummies(df_train[col_for_dummy]),], axis=1)

In [0]:
features = df_X.columns.difference(['building_id'])

In [0]:
df_X = df_X[features]

In [0]:
df_y = df_labels['damage_grade']

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(df_X, df_y, test_size=0.2)

In [0]:
np.testing.assert_array_equal(df_X.index.values, df_y.index.values)

In [0]:
logreg_earth = LogisticRegression(solver='liblinear', random_state=1)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [53]:
logreg_earth.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
def compute_accuracy(model, X, y):
    y_pred = model.predict(X)
    return accuracy_score(y, y_pred)

In [55]:
print("Earthquack Training", 
      compute_accuracy(logreg_earth, X_train, y_train))

print("Earthquack Valid", 
      compute_accuracy(logreg_earth, X_valid, y_valid))

Earthquack Training 0.5848618572524943
Earthquack Valid 0.5851959862627348


In [0]:
eq_rand_class = RandomForestClassifier(min_samples_leaf=5)

In [61]:
eq_rand_class.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [62]:
print("Earthquack Training", 
      compute_accuracy(eq_rand_class, X_train, y_train))

print("Earthquack Valid", 
      compute_accuracy(eq_rand_class, X_valid, y_valid))

Earthquack Training 0.7584084804297775
Earthquack Valid 0.7080255559179601


In [0]:
# Prepare test data
df_test = pd.read_csv('/content/drive/My Drive/data/EarthQuake/test_values.csv',sep=',')


In [0]:
df_test_feature = pd.concat([df_test.drop(columns=col_for_dummy),
           pd.get_dummies(df_test[col_for_dummy]),
          ], axis=1)

In [0]:
df_test_feature = df_test_feature[features]

In [81]:
df_test_feature.head()

Unnamed: 0,age,area_percentage,count_families,count_floors_pre_eq,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,geo_level_1_id,geo_level_2_id,geo_level_3_id,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_gov_office,has_secondary_use_health_post,has_secondary_use_hotel,has_secondary_use_industry,has_secondary_use_institution,has_secondary_use_other,has_secondary_use_rental,has_secondary_use_school,has_secondary_use_use_police,has_superstructure_adobe_mud,has_superstructure_bamboo,has_superstructure_cement_mortar_brick,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_mud_mortar_stone,has_superstructure_other,has_superstructure_rc_engineered,has_superstructure_rc_non_engineered,has_superstructure_stone_flag,has_superstructure_timber,height_percentage,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,position_j,position_o,position_s,position_t,roof_type_n,roof_type_q,roof_type_x
0,20,7,1,3,0,0,1,0,0,17,596,11307,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,6,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,25,13,1,2,0,0,1,0,0,6,141,11987,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,5,4,1,2,0,0,1,0,0,22,19,10044,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
3,0,19,2,1,0,0,1,0,0,26,39,633,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,15,8,1,3,0,0,1,0,0,17,289,7970,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [83]:
df_submission

Unnamed: 0,building_id,damage_grade
0,300051,1
1,99355,1
2,890251,1
3,745817,1
4,421793,1
...,...,...
86863,310028,1
86864,663567,1
86865,1049160,1
86866,442785,1


In [0]:
y_pred = eq_rand_class.predict(df_test_feature)

In [85]:
y_pred.shape

(86868,)

In [86]:
df_test_feature.shape

(86868, 68)

In [0]:
results_df = pd.DataFrame({
    "building_id": df_test["building_id"],
    "damage_grade": y_pred
})

In [0]:
results_df.to_csv("/content/drive/My Drive/data/EarthQuake/submission_v1_eq.csv", index=False)