<a href="https://colab.research.google.com/github/mohanpartha/ML_preprocessing/blob/master/EarthQuake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Importing some of the packages used
# Pandas used for Machine learning
# numpy used for processing array based data
# sklearn has bunch of standard m/c learning models...We can use it and train it with our data
# we will use 3 models here LogisticRegression,RandomForestClassifier,kfold
# in sklearn.metrics package, we wil use these metrics accuracy_score, confusion_matrix, f1_score 
# to evaluate our trained model

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [112]:
# Since we can't store the data  in Google colab, we store the data in our Googe drive and 
# use it for training the models. In this cell establishes a temperory secured link to Google colab and
# your Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# I have uploaded the csv files and uploaded to my google drive in My Drive/data/EarthQuake folder
# Since all the 3 files are csv files, i and reading all the 3 files into google Colab from my 
# Google drive. We use Pandas packages (pd) to load the csv files into pandas Dataframe
df_labels = pd.read_csv('/content/drive/My Drive/data/EarthQuake/train_labels.csv',sep=',')
df_train = pd.read_csv('/content/drive/My Drive/data/EarthQuake/train_values.csv',sep=',')
df_submission = pd.read_csv('/content/drive/My Drive/data/EarthQuake/submission_format.csv',sep=',')

In [135]:
# Looking at the shape of the dataset(X), we see it has 260601 rows and 38 columns
df_train.shape

(260601, 38)

In [136]:
# Looking at the shape of the dataset(y), we see it has 260601 rows and 2 columns
df_labels.shape

(260601, 2)

In [137]:
# our submission dataset(y) should be of this shape 
df_submission.shape

(86868, 2)

In [138]:
# We look at the first 5 rows of training data like this 
df_train.head()

Unnamed: 0,age,area_percentage,count_families,count_floors_pre_eq,foundation_type,geo_level_1_id,geo_level_2_id,geo_level_3_id,ground_floor_type,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_gov_office,has_secondary_use_health_post,has_secondary_use_hotel,has_secondary_use_industry,has_secondary_use_institution,has_secondary_use_other,has_secondary_use_rental,has_secondary_use_school,has_secondary_use_use_police,has_superstructure_adobe_mud,has_superstructure_bamboo,has_superstructure_cement_mortar_brick,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_mud_mortar_stone,has_superstructure_other,has_superstructure_rc_engineered,has_superstructure_rc_non_engineered,has_superstructure_stone_flag,has_superstructure_timber,height_percentage,land_surface_condition,legal_ownership_status,other_floor_type,plan_configuration,position,roof_type
0,30,6,1,2,r,6,487,12198,f,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,5,t,v,q,d,t,n
1,10,8,1,2,r,8,900,2812,x,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7,o,v,q,d,s,n
2,10,5,1,2,r,21,363,8973,f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,t,v,x,d,t,n
3,10,6,1,2,r,22,418,10694,f,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,5,t,v,x,d,s,n
4,30,8,1,3,r,11,131,1488,f,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,9,t,v,x,d,s,n


In [139]:
# We look at the first 5 rows of training result data like this 
df_labels.head()

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [140]:
# We look at the first 5 rows of submission dataset like this 
df_submission.head()

Unnamed: 0,building_id,damage_grade
0,300051,1
1,99355,1
2,890251,1
3,745817,1
4,421793,1


In [114]:
# I just want to look at the column in the data we just loaded to colab
# df_labels, df_train, df_submission are pandas dataframe
df_train.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i

In [115]:
# 1 st step in examining the data 
# dtypes- Datatypes ...This will display the datatypes of each columns in the training dataset
df_train.dtypes

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [116]:
# Examining the y data, For any machine learning problem, we need 3 things
# 1 X - data to train (Capital X); y - result for each row (observation), 3 a model to train
# Value_counts method gives the distribution ofthe data ...this will tell us some insights about our data
# here we can see how the data is distributed
df_train['land_surface_condition'].value_counts()

t    216757
n     35528
o      8316
Name: land_surface_condition, dtype: int64

In [117]:
# .head method gives some insight about the data...I want to get rid of Building id and it is just a 
# sequence number
df_train.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# preparing the columns we need for training. I dont need Building_id
features = df_train.columns.difference(["building_id"])

In [119]:
# All features without the building_id
features

Index(['age', 'area_percentage', 'count_families', 'count_floors_pre_eq',
       'foundation_type', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'ground_floor_type', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_gov_office',
       'has_secondary_use_health_post', 'has_secondary_use_hotel',
       'has_secondary_use_industry', 'has_secondary_use_institution',
       'has_secondary_use_other', 'has_secondary_use_rental',
       'has_secondary_use_school', 'has_secondary_use_use_police',
       'has_superstructure_adobe_mud', 'has_superstructure_bamboo',
       'has_superstructure_cement_mortar_brick',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_other',
       'has_superstructure_rc_engineered',
       'has_superstructure_rc_non_engineered', 'has_superstructure_stone_flag',
       'has_superstructure_timber', 'height_percen

In [0]:
# Filtering that column from the original Data frame
df_train = df_train[features]

In [122]:
# Looking at the data types above in line 10, we see there are object data types and integer data types
# in machine Learning we should not have Text data(object datatypes)
# Here we are idendifiying the columns of Object data type
col_for_dummy = df_train.columns[df_train.dtypes == "object"].values
print(col_for_dummy)

['foundation_type' 'ground_floor_type' 'land_surface_condition'
 'legal_ownership_status' 'other_floor_type' 'plan_configuration'
 'position' 'roof_type']


In [0]:
# so our next step is to convert the data in the columns of datatype (object) to a numeric value, 
# here we use pd.get_dummies method to do it for us.
# Here we define our X, We are concatinating 2 sets, original set without the object data types
# and the object bject datatype translated by get_dummies 
df_X = pd.concat([df_train.drop(columns=col_for_dummy), 
                pd.get_dummies(df_train[col_for_dummy]),], axis=1)

In [0]:
features = df_X.columns.difference(['building_id'])

In [0]:
# dropping building_id from dataframe X
df_X = df_X[features]

In [0]:
# Here we create y df_y = dataframe y from the df_labels  given to us in the competetion
df_y = df_labels['damage_grade']

In [0]:
# Here we use the train_test_split method to split the df_X, df_y in to training, validation sets
# we will use the 2 sets to train the standard model to train to our data
X_train, X_valid, y_train, y_valid = train_test_split(df_X, df_y, test_size=0.2)

In [134]:
# Looking at the shape of df_X..Note the number of columns increased to 68 columns from 
df_X.shape

(260601, 68)

In [21]:
df_y.shape

(260601,)

In [0]:
# This check will ensure that the each row instance X has a y value (observed result) and it belongs to the 
# same index of the X
np.testing.assert_array_equal(df_X.index.values, df_y.index.values)

In [0]:
# Now that we have out X and y we are ready to train model the to suit our data
# Instanciating and instance of LogisticRegression model
logreg_earth = LogisticRegression(solver='liblinear', random_state=1)

In [145]:
# after instianciating the model, next step is to train it with the data...
#For this we call the fit method...Now we have a trained model
logreg_earth.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Here we define a function to do the prediction, and calcuation of accuracy as we will be reusing it
# several times; Note that we are calculating 2 accuracy scores Accuracy score and f1_score
def compute_accuracy(model, X, y):
    y_pred = model.predict(X)
    print(confusion_matrix(y, y_pred))
    return accuracy_score(y, y_pred), f1_score(y, y_pred,average='micro')

In [148]:
# Here we are are getting the accuracy of the training data
print("Earthquack Training", 
      compute_accuracy(logreg_earth, X_train, y_train))

# Here we are are getting the accuracy of the validation data
print("Earthquack Valid", 
      compute_accuracy(logreg_earth, X_valid, y_valid))

[[  4346  15279    436]
 [  2797 107730   8122]
 [   191  58893  10686]]
Earthquack Training (0.5888430544896393, 0.5888430544896393)
[[ 1104  3847   112]
 [  698 26943  1969]
 [   39 14653  2756]]
Earthquack Valid (0.5909901958903321, 0.5909901958903321)


In [0]:
#In the above cell result, we see that the accuracy is only mediocre so i decide to try 
# another model RandomForestClassifier..We will follow the same process fit, 
# predict and calculate accuracy
eq_rand_class = RandomForestClassifier(min_samples_leaf=3, n_estimators=100, n_jobs=-1)

In [150]:
eq_rand_class.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [151]:
print("Earthquack Training", 
      compute_accuracy(eq_rand_class, X_train, y_train))

print("Earthquack Valid", 
      compute_accuracy(eq_rand_class, X_valid, y_valid))

[[ 10528   9312    221]
 [  1618 110392   6639]
 [   251  24482  45037]]
Earthquack Training (0.7960331926323868, 0.7960331926323868)
[[ 2059  2946    58]
 [  836 25849  2925]
 [   62  8157  9229]]
Earthquack Valid (0.7125151090731183, 0.7125151090731183)


In [0]:
#In the above cell result, we get a validation score of 71% /..
#I am satisfied ..i will submit this score # Prepare test data Line 65
#...Lets try another model StratifiedKFold
y_pred = eq_rand_class.predict(X_valid)
X_valid[y_pred != y_valid]

In [0]:
# K fold mode splits the data in to 5 partations, and in each partation, we further split the 
# data into training and validation and 5 models also...we traing each model in each data partation
# and average the results
from sklearn.model_selection import StratifiedKFold

In [0]:
skf = StratifiedKFold(n_splits=5)

In [63]:
df_y.head()

0    3
1    2
2    3
3    2
4    3
Name: damage_grade, dtype: int64

In [65]:
models = []
for train_index, test_index in skf.split(df_X, df_y):
  print(train_index.shape, test_index.shape)
  X_train, X_test = df_X.iloc[train_index,:], df_X.iloc[test_index,:]
  y_train, y_test = df_y[train_index], df_y[test_index]
  model = RandomForestClassifier(min_samples_leaf=3, n_estimators=100, n_jobs=-1)
  model.fit(X_train,y_train)
  print(compute_accuracy(model, X_test, y_test))
  models.append(model)

(208480,) (52121,)
[[ 2082  2884    59]
 [  839 25845  2968]
 [   59  7988  9397]]
(0.716102914372326, 0.7161029143723261)
(208481,) (52120,)
[[ 1992  2970    62]
 [  867 25883  2902]
 [   75  8217  9152]]
(0.7104182655410591, 0.710418265541059)
(208481,) (52120,)
[[ 2014  2949    62]
 [  788 25950  2913]
 [   56  8009  9379]]
(0.7164811972371451, 0.7164811972371451)
(208481,) (52120,)
[[ 2067  2899    59]
 [  799 25970  2883]
 [   80  8026  9337]]
(0.7170759785111281, 0.7170759785111281)
(208481,) (52120,)
[[ 2052  2919    54]
 [  849 25935  2868]
 [   66  8018  9359]]
(0.7165387567152725, 0.7165387567152725)


In [0]:
y_presd = []
for model in models:
  y_prnd = model.predict(df_test_feature)
  y_presd.append(y_prnd)


In [0]:
y_pred = np.stack(y_presd)

In [0]:
from scipy.stats import mode

In [0]:
y_pred = mode(y_pred)[0]

In [0]:
y_pred = y_pred.flatten()

In [86]:
y_pred

array([3, 2, 2, ..., 2, 2, 2])

In [0]:
 y_predf  = eq_rand_class.predict(X_train)

In [0]:
temp = (y_predf != y_train)

In [0]:
X_filter = X_train[temp]

In [0]:
y_filter =  y_train[temp]

In [43]:
filter_rand_class.fit(X_filter, y_filter)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [44]:
print("Earthquack Training", 
      compute_accuracy(filter_rand_class, X_train, y_train))

print("Earthquack Valid", 
      compute_accuracy(filter_rand_class, X_valid, y_valid))

Earthquack Training (0.3156321949347659, 0.3156321949347659)
Earthquack Valid (0.3016250647531705, 0.3016250647531705)


In [0]:
#Random forest We can try other models like random forest,svc,linearSvc etc

In [0]:
import numpy as np
import cvxopt
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

In [0]:
from sklearn import svm

In [0]:
    y_pred = filter_rand_class.predict(X_valid)
    accuracy_score(y_valid, y_pred)

In [0]:
confusion_matrix(y_valid, y_pred)

In [0]:
confusion_matrix(y_valid, y_pred)

array([[  512,  1169,  3410],
       [  486,  7759, 21331],
       [   50,  4193, 13211]])

In [0]:
y_pred = lsvm.predict(X_test)
confusion_matrix(y_test, y_pred)

In [0]:
# Prepare test data
df_test = pd.read_csv('/content/drive/My Drive/data/EarthQuake/test_values.csv',sep=',')


In [0]:
df_test_feature = pd.concat([df_test.drop(columns=col_for_dummy),
           pd.get_dummies(df_test[col_for_dummy]),
          ], axis=1)

In [0]:
df_test_feature = df_test_feature[features]

In [69]:
df_test_feature.head()

Unnamed: 0,age,area_percentage,count_families,count_floors_pre_eq,foundation_type_h,foundation_type_i,foundation_type_r,foundation_type_u,foundation_type_w,geo_level_1_id,geo_level_2_id,geo_level_3_id,ground_floor_type_f,ground_floor_type_m,ground_floor_type_v,ground_floor_type_x,ground_floor_type_z,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_gov_office,has_secondary_use_health_post,has_secondary_use_hotel,has_secondary_use_industry,has_secondary_use_institution,has_secondary_use_other,has_secondary_use_rental,has_secondary_use_school,has_secondary_use_use_police,has_superstructure_adobe_mud,has_superstructure_bamboo,has_superstructure_cement_mortar_brick,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_mud_mortar_stone,has_superstructure_other,has_superstructure_rc_engineered,has_superstructure_rc_non_engineered,has_superstructure_stone_flag,has_superstructure_timber,height_percentage,land_surface_condition_n,land_surface_condition_o,land_surface_condition_t,legal_ownership_status_a,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w,other_floor_type_j,other_floor_type_q,other_floor_type_s,other_floor_type_x,plan_configuration_a,plan_configuration_c,plan_configuration_d,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,position_j,position_o,position_s,position_t,roof_type_n,roof_type_q,roof_type_x
0,20,7,1,3,0,0,1,0,0,17,596,11307,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,6,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,25,13,1,2,0,0,1,0,0,6,141,11987,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,5,4,1,2,0,0,1,0,0,22,19,10044,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
3,0,19,2,1,0,0,1,0,0,26,39,633,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,15,8,1,3,0,0,1,0,0,17,289,7970,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [70]:
df_submission

Unnamed: 0,building_id,damage_grade
0,300051,1
1,99355,1
2,890251,1
3,745817,1
4,421793,1
...,...,...
86863,310028,1
86864,663567,1
86865,1049160,1
86866,442785,1


In [0]:
y_pred = eq_rand_class.predict(df_test_feature)

In [0]:
y_pred_m2 = filter_rand_class.predict(df_test_feature)

In [0]:
temp_mask = (y_pred == y_pred_m2)

In [0]:
k= y_pred_m2[temp_mask]

In [0]:
k.shape

(10902,)

In [0]:
y_pred.shape

(86868,)

In [0]:
[temp3 = y_pred y_pred_m2[temp_mask]

SyntaxError: ignored

In [0]:
#Gradient Boost

In [0]:
pip install chefboost

In [0]:
from chefboost import Chefboost as chef
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
from chefboost.training import Preprocess
from chefboost.commons import functions

In [0]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

In [0]:
gb_model = GradientBoostingClassifier(max_leaf_nodes= 3)

In [0]:
gb_model.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=3,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [0]:
print("GB Earthquack Training", 
      compute_accuracy(gb_model, X_train, y_train))

print("GB Earthquack Valid", 
      compute_accuracy(gb_model, X_valid, y_valid))

GB Earthquack Training 0.6565665771297007
GB Earthquack Valid 0.6576044204831066


In [0]:
#HistGradientBoostingClassifier

In [0]:
from sklearn.ensemble import HistGradientBoostingRegressor

ImportError: ignored

In [0]:
from sklearn.ensemble import HistGradientBoostingClassifier

ImportError: ignored

In [0]:
pip install HistGradientBoostingClassifier

[31mERROR: Could not find a version that satisfies the requirement HistGradientBoostingClassifier (from versions: none)[0m
[31mERROR: No matching distribution found for HistGradientBoostingClassifier[0m


In [0]:
df_test_feature.shape

(86868, 68)

In [0]:
results_df = pd.DataFrame({
    "building_id": df_test["building_id"],
    "damage_grade": y_pred
})

In [0]:
results_df.to_csv("/content/drive/My Drive/data/EarthQuake/submission_v3_eq.csv", index=False)