In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
from ydata_profiling import ProfileReport

from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

df = pd.read_csv('train.csv')
df.head()

  df = pd.read_csv('train.csv')


Unnamed: 0.1,Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,0,floor two,1.0,256 ft^2,22.0,Flat,Bamboo or Timber,Bamboo/Timber Light roof,Clay,TImber/Bamboo-Mud,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,0.0,0.0,1.0
1,1,Floor 3,3.0,985 ft^2,18.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light Roof or Bamboo Heavy Roof,Clay,TImber/Bamboo-Mud,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
2,2,Two Floor,7.0,,14.0,Flat,Mud mortar-Stone/Brick,,Clay,Wood-Mud or Bamboo Mud,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0
3,3,two,18.0,185 ft^2,15.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light Roof or Bamboo Light Roof,Clay,TImber/Bamboo-Mud,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,4.0
4,4,just 2 floor,22.0,290 ft^2,17.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Bamboo or Timber Light roof,Clay,Timber Mud or Bamboo-Mud,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,1.0


---
# Data Cleaning
---

In [4]:
profile = ProfileReport(df, title='Joints Data Competition UGM')

In [None]:
profile.to_notebook_iframe()

In [2]:
df = df.rename(columns={'Unnamed: 0':'id'})

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722815 entries, 0 to 722814
Data columns (total 25 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              722815 non-null  int64  
 1   floors_before_eq (total)        390009 non-null  object 
 2   old_building                    483611 non-null  float64
 3   plinth_area (ft^2)              301607 non-null  object 
 4   height_before_eq (ft)           390009 non-null  float64
 5   land_surface_condition          421209 non-null  object 
 6   type_of_foundation              483611 non-null  object 
 7   type_of_roof                    301607 non-null  object 
 8   type_of_ground_floor            390009 non-null  object 
 9   type_of_other_floor             421209 non-null  object 
 10  position                        410809 non-null  object 
 11  building_plan_configuration     421209 non-null  object 
 12  technical_soluti

In [None]:
for i in df.columns[1:]:
    print(f"\t\t-------{i}-------")
    display(df[i].unique())

---
# Data Preprocessing
---

In [3]:
def prep(df):
    df['plinth_area (ft^2)'] = df['plinth_area (ft^2)'].str.replace("\s*ft\s*\^2", "", regex=True)
    df['plinth_area (ft^2)'] = df['plinth_area (ft^2)'].str.replace('\s*More\s*\ than ' , '', regex=True)
    # benerin lantai
    lantai = {'floor two':2, 'Two Floor':2, 'two':2, ' just 2 floor':2, 'There is 2 Floor/Story':2, 'Floor two':2, 'Floor 2':2, 'two story':2, 'floor second':2, '2':2, 
           '1':1, '3':3, '2 floor':2, '1st Story':1, 'floor one':1, 'Has 1 floor':1, 'one story':1, 'Floor 1':1, 'floor 1st':1, 'one':1, 'Just 1 floor':1, 'Floor 3':3, 
           'Three floor':3, 'floor third':3, '3.00':3, ' has 3 Floor':3, 'three Story':3, 'Floor-three':3, 'Three':3, 'four Floor':4, ' has Four fl':4, 'Fl four':4, 
           '4':4, 'Floor 4':4, 'floor four':4, 'Four Story':4, 'Four':4, 'fifth':5, 'Fl Five':5, '5':5, 'Floor Fifth':5, ' Has Five fl':5, 'Has Five fl':5, '6':6, '7':7, 
           '8':8, '9':9,'five Floor':5}
    df['floors_before_eq (total)'] = df['floors_before_eq (total)'].replace(lantai, regex=True)
    # benerin pondasi
    fondasi = {'Bamboo or Timber': 'b/t', 'Clay Sand Mixed mortar-Stone/Brick': 'csmms/b', 'Mud mortar-Stone/Brick': 'mms/b', 'Clay mortar-Stone/Brick': 'cms/b',
           'Reinforced Concrete': 'rc', 'Cement-Stone or Cement-Brick': 'cs/cb', 'Bamboo/TImber': 'b/t', 'Bamboo/Timber': 'b/t', 'RC': 'rc',
           'Others': 'o', 'Other': 'o', 'Cement-Stone/Brick': 'cs/b'}
    df['type_of_foundation'] = df['type_of_foundation'].replace(fondasi, regex=True)
    # benerin genteng
    roof = {'Bamboo/Timber Light roof': 'b/tlr', 'Wood Light Roof or Bamboo Heavy Roof': 'wlr/bhr', 'Wood Light Roof or Bamboo Light Roof': 'wlr/blr',
           'Bamboo or Timber Light roof': 'b/tlr', 'Bamboo/TImber-Light Roof': 'b/tlr', 'Reinforced Brick Slab/rcc/rbc': 'rbs/rcc/rbc',
           'Bamboo/Timber Heavy roof': 'b/thr', 'reinforced cement concrete/rb/rbc': 'rcc/rb/rbc', 'Bamboo or Timber Heavy roof': 'b/thr',
           'Bamboo/TImber-Heavy Roof': 'b/thr', 'Reinforced brick concrete/rcc/rbc': 'rbc/rcc/rbc'}
    df['type_of_roof'] = df['type_of_roof'].replace(roof, regex=True)
    # benerin ground floor
    ground_floor = {'Clay': 'c', 'Mud': 'm', 'Brick or Stone': 'b/s', 'Reinforced Concrete': 'rc', 'soil, water, loam Mixed': 's/w/lm', 
                    'Lumber': 'l', 'Other': 'o', 'TImber': 't', 'Timber': 't', 'Wood': 'w'}
    df['type_of_ground_floor'] = df['type_of_ground_floor'].replace(ground_floor, regex=True)
    # benerin other floor
    df['type_of_other_floor'] = df['type_of_other_floor'].replace({
    'Bamboo/Timber Light roof|Bamboo or Timber Light roof|Bamboo/TImber-Light Roof': 'b/tlr',
    'Bamboo/Timber Heavy roof|Bamboo or Timber Heavy roof|Bamboo/TImber-Heavy Roof': 'b/thr',
    'reinforced cement concrete/rb/rbc': 'rcc/rb/rbc',
    'Reinforced brick concrete/rcc/rbc': 'rbc/rcc/rbc',
    'Reinforced Brick Slab/rcc/rbc': 'rbs/rcc/rbc'
    }, regex=True)
    # benerin legal ownership
    df['legal_ownership_status'] = df['legal_ownership_status'].replace({'Private.*':'p', 'Public.*':'pub', 'Institutional.*':'i', 'Other':'o'}, regex=True)
    # benerin residential type
    df['residential_type'] = df['residential_type'].replace({'Non-residential':'nr', 'Hotel/Motel':'hm', 'Rental Residential':'rr', 'Housing':'h', 'Other Residential Type':'ort', 'Other':'o'}, regex=True)
    # benerin number family residing
    df['no_family_residing'] = df['no_family_residing'].replace({'None':'0.0'})
    # benerin 
    df['public_place_type'] = df['public_place_type'].replace({'Non-public':'np'}, regex=True)
    df['industrial_use_type'] = df['industrial_use_type'].str.replace('Non-industrial', 'ni').str.replace('Service/Tourism', 'st').str.replace('Forest-based', 'fb')
    df['govermental_use_type'] = df['govermental_use_type'].replace({'Non-govermental':'ng'}, regex=True)
#     df.drop(['id', 'no_family_residing', 'industrial_use_type'], axis=1, inplace=True)
    for col in df.columns:
       mode = df[col].mode()[0] #mencari nilai modus pada masing2 kolom 
       df[col].fillna(value=mode, inplace=True) #mengganti nilai null yang tipe datanya object dengan nilai modus pada masing2 kolom


In [4]:
df_copy = df.copy()

In [5]:
prep(df_copy)
df_copy.head()

Unnamed: 0,id,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,0,2.0,1.0,256,22.0,Flat,b/t,b/tlr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,0.0,0.0,1.0
1,1,3.0,3.0,985,18.0,Flat,csmms/b,wlr/bhr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,5.0
2,2,2.0,7.0,300,14.0,Flat,mms/b,b/tlr,c,Wood-Mud or Bamboo Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,5.0
3,3,2.0,18.0,185,15.0,Flat,csmms/b,wlr/blr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,4.0
4,4,2.0,22.0,290,17.0,Flat,csmms/b,b/tlr,c,Timber Mud or Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,1.0


In [6]:
df_copy.head(16)

Unnamed: 0,id,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,0,2.0,1.0,256,22.0,Flat,b/t,b/tlr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,0.0,0.0,1.0
1,1,3.0,3.0,985,18.0,Flat,csmms/b,wlr/bhr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,5.0
2,2,2.0,7.0,300,14.0,Flat,mms/b,b/tlr,c,Wood-Mud or Bamboo Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,5.0
3,3,2.0,18.0,185,15.0,Flat,csmms/b,wlr/blr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,4.0
4,4,2.0,22.0,290,17.0,Flat,csmms/b,b/tlr,c,Timber Mud or Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,1.0
5,5,2.0,15.0,300,18.0,Flat,csmms/b,b/tlr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,5.0
6,6,2.0,33.0,300,16.0,Flat,csmms/b,b/tlr,c,Wood-Mud or Bamboo Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,4.0
7,7,2.0,40.0,504,14.0,Flat,cms/b,b/tlr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,4.0
8,8,2.0,27.0,616,14.0,Flat,csmms/b,b/tlr,c,TImber/Bamboo-Mud,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,4.0
9,9,1.0,33.0,600,9.0,Flat,cms/b,b/tlr,m,Not applicable,...,0.0,nr,1.0,np,ni,ng,unavailable,5.0,2.0,5.0


In [19]:
for i in df_copy.columns[1:]:
    print(f"\t\t-------{i}-------")
    display(df_copy[i].unique())

		-------floors_before_eq (total)-------


array([2., 3., 1., 5., 4., 6., 8., 9., 7.])

		-------old_building-------


array([  1.,   3.,   7.,  18.,  22.,  15.,  33.,  40.,  27.,  68.,  25.,
        20.,  28.,  35.,  10.,   4.,   8.,   5.,  12.,  32.,  45.,  13.,
        50.,   6.,  36.,  70.,  30.,  14.,  75.,  90.,  34.,  24.,   2.,
         9.,  16.,  29., 999.,  67.,  60.,  26.,  38.,   0.,  17.,  37.,
        19.,  80.,  21.,  11.,  55.,  44.,  39.,  23.,  65.,  48.,  85.,
        52.,  31., 100.,  57.,  42.,  41.,  62., 120.,  64.,  43.,  49.,
        47.,  46.,  88.,  82.,  87.,  69., 105.,  99.,  54., 140.,  51.,
        72.,  95.,  63., 150., 176.,  59.,  81.,  58.,  61.,  53.,  56.,
        84., 110.,  66.,  74.,  98.,  86., 109., 111., 200.,  73., 103.,
        79., 112.,  78.,  97.,  92., 160.,  71., 180.,  76.,  91.,  96.,
       101., 166., 106., 118., 108.,  83.,  77., 145., 130., 117.,  93.,
       102., 135., 104., 170., 131., 115.,  89., 125., 128., 119.,  94.,
       190., 174., 126., 132., 113., 116., 144., 162., 138., 195., 141.,
       122., 175., 161., 196., 107., 121., 151., 14

		-------plinth_area (ft^2)-------


array([ 256,  985,  300,  185,  290,  504,  616,  600,  500, 1000,  366,
        150,  352,  358,  250,  360,  550,  310,  400,  735,  464,  380,
        342,  384,  432,  375,  490,  780,  494,  278,  228,  475,  440,
        280,  288,  368,  312,  450,  383,  425,  968,  200,  140,  410,
        264,  194,  230,  438,  405,  130,  297,  525,  392,  176,  420,
        242,  286,  364,  210,  445,  495,  240,  340,  660,  396,  233,
        260,  289,  508,  299,  320,  648,  222,  301,  265,  270,  544,
        220,  153,  700,  350,  285,  351,  174,  800,  315,  318,  173,
        478,  465,  324,  338,  462,  356,  483,  496,  325,  253,  155,
        246,  190,  435,  820,  225,  216,  272,  385,  180,  810,  382,
        620,  363,  590,  160,  434,  357,  531,  152,  430,  245,  263,
        416,  567,  594,  204,  576,  486,  309,  378,  284,  562,  177,
        255,  545,  305,  390,  171,  456,  588,  100,  348,  231,  850,
        729,  510,  125,  460,  437,  409,  247,  3

		-------height_before_eq (ft)-------


array([22., 18., 14., 15., 17., 16.,  9., 45., 19., 10., 12., 20., 21.,
       25.,  6., 24.,  7., 13.,  8., 11., 30., 23., 27., 36., 26., 40.,
       56., 32., 28., 29., 35., 38., 31., 74., 34., 50., 65., 99., 33.,
       60., 48., 75., 51., 54., 46., 44., 47., 70., 55., 80., 61., 37.,
       93., 52., 77., 96., 72., 81., 42., 64., 85., 39., 57., 63., 95.,
       67., 43., 78., 49., 41., 58., 68., 76., 90., 66., 71., 89.])

		-------land_surface_condition-------


array([0, 1, 2])

		-------type_of_foundation-------


array([0, 4, 5, 1, 7, 3, 6, 2])

		-------type_of_roof-------


array([1, 5, 6, 3, 0, 4, 2])

		-------type_of_ground_floor-------


array([ 4,  6,  2,  9,  7, 10,  1, 11,  3,  8,  5,  0, 12, 13])

		-------type_of_other_floor-------


array([2, 6, 3, 1, 5, 9, 7, 4, 0, 8])

		-------position-------


array([3, 0, 1, 2])

		-------building_plan_configuration-------


array([6, 7, 3, 8, 4, 9, 5, 1, 0, 2])

		-------technical_solution_proposed-------


array([3, 0, 2, 1])

		-------legal_ownership_status-------


array([6, 1, 7, 0, 2, 4, 3, 5])

		-------has_secondary_use-------


array([0., 1.])

		-------type_of_reinforcement_concrete-------


array([0., 2., 1., 3.])

		-------residential_type-------


array([2, 1, 5, 0, 3, 4])

		-------no_family_residing-------


array([ 1,  0,  3,  4,  5,  8,  6,  7, 10,  9,  2])

		-------public_place_type-------


array([12,  9,  6,  0,  1, 10,  3,  2, 11,  4,  7,  8,  5])

		-------industrial_use_type-------


array([7, 5, 8, 2, 4, 3, 0, 1, 6])

		-------govermental_use_type-------


array([2, 1, 0])

		-------flexible_superstructure-------


array([1, 0])

		-------wall_binding-------


array([0., 5., 2., 1., 7., 3.])

		-------wall_material-------


array([0., 2., 1., 3.])

		-------damage_grade-------


array([1., 5., 4., 2., 3.])

In [7]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722815 entries, 0 to 722814
Data columns (total 25 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              722815 non-null  int64  
 1   floors_before_eq (total)        722815 non-null  float64
 2   old_building                    722815 non-null  float64
 3   plinth_area (ft^2)              722815 non-null  object 
 4   height_before_eq (ft)           722815 non-null  float64
 5   land_surface_condition          722815 non-null  object 
 6   type_of_foundation              722815 non-null  object 
 7   type_of_roof                    722815 non-null  object 
 8   type_of_ground_floor            722815 non-null  object 
 9   type_of_other_floor             722815 non-null  object 
 10  position                        722815 non-null  object 
 11  building_plan_configuration     722815 non-null  object 
 12  technical_soluti

---
# Labelling Categorical Data
---

In [8]:
le = preprocessing.LabelEncoder()
le

In [9]:
df_copy[['plinth_area (ft^2)','land_surface_condition', 'type_of_foundation', 'type_of_roof', 'type_of_ground_floor', 'type_of_other_floor',
        'position', 'building_plan_configuration', 'technical_solution_proposed', 'legal_ownership_status', 'residential_type', 'no_family_residing',
        'public_place_type', 'industrial_use_type', 'govermental_use_type', 'flexible_superstructure']] = df_copy[['plinth_area (ft^2)','land_surface_condition', 'type_of_foundation', 'type_of_roof', 'type_of_ground_floor', 'type_of_other_floor',
        'position', 'building_plan_configuration', 'technical_solution_proposed', 'legal_ownership_status', 'residential_type', 'no_family_residing',
        'public_place_type', 'industrial_use_type', 'govermental_use_type', 'flexible_superstructure']].apply(lambda x: le.fit_transform(x))


In [None]:
df_copy.isna().sum()

In [None]:
df_copy.head()

---
# Decide Independent and Dependent Data
---
and train the data

In [10]:
x = df_copy.drop(['id','has_secondary_use', 'no_family_residing', 'public_place_type', 'industrial_use_type', 'govermental_use_type', 'damage_grade'], axis=1)
y = df_copy['damage_grade']

In [11]:
x.shape

(722815, 18)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

---
# RandomForestClassifier
---

In [16]:
rf = RandomForestClassifier(n_estimators=300)

In [17]:
rf.fit(x_train, y_train)

In [14]:
accuracy = rf.score(x_test, y_test)
accuracy

0.4715936996326861

In [14]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100))
])

In [15]:
pipeline.fit(x_train, y_train)

In [16]:
y_pred = pipeline.predict(x_test)

In [17]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.4710956468805988


---
# AdaBoostClassifier
---

In [None]:
ada = AdaBoostClassifier(algorithm="SAMME", n_estimators=200, random_state=42)

In [None]:
ada.fit(x_train, y_train)

In [None]:
accur = ada.score(x_test, y_test)
accur

---
# GradientBoostClassifier
---

In [None]:
gb = GradientBoostingClassifier(n_estimators=200)

In [None]:
gb.fit(x_train, y_train)

---
## Ensemble Voting
---

In [33]:
lr = LogisticRegression(random_state=42, max_iter=5000)
knn = KNeighborsClassifier(n_neighbors=3)
dtc = DecisionTreeClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)

In [35]:
voting_hard = VotingClassifier(estimators=[('lr', lr), ('knn', knn), ('dtc', dtc), ('gbc', gbc), ('rfc', rfc)], voting='hard')
voting_soft = VotingClassifier(estimators=[('lr', lr), ('knn', knn), ('dtc', dtc), ('gbc', gbc), ('rfc', rfc)], voting='soft')

In [32]:
voting_hard.fit(x_train, y_train)
voting_soft.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
pred_hard = voting_hard.predict(x_test)
score_hard = accuracy_score(y_test, pred_hard)
print(f'Voting Hard Accuracy : ', score_hard)

Voting Hard Accuracy :  0.4178178372059241


In [27]:
pred_soft = voting_soft.predict(x_test)
score_soft = accuracy_score(y_test, pred_soft)
print(f'Voting Hard Accuracy : ', score_soft)

Voting Hard Accuracy :  0.43399071685009305


---
# Evaluate
---

In [26]:
test_frame = pd.read_csv('test.csv')
test_copy = test_frame.copy()

In [27]:
prep(test_copy)

In [28]:
test_copy.drop(['id','has_secondary_use', 'no_family_residing', 'public_place_type', 'industrial_use_type', 'govermental_use_type'],axis=1, inplace=True)

In [30]:
test_copy[['plinth_area (ft^2)', 'land_surface_condition', 'type_of_foundation', 'type_of_roof', 'type_of_ground_floor', 'type_of_other_floor',
        'position', 'building_plan_configuration', 'technical_solution_proposed', 'legal_ownership_status', 'residential_type',
        'flexible_superstructure']] = test_copy[['plinth_area (ft^2)', 'land_surface_condition', 'type_of_foundation', 'type_of_roof', 'type_of_ground_floor', 'type_of_other_floor',
        'position', 'building_plan_configuration', 'technical_solution_proposed', 'legal_ownership_status', 'residential_type',
        'flexible_superstructure']].apply(lambda x: le.fit_transform(x))

In [31]:
test_copy.shape

(242082, 18)

In [32]:
prediction = pipeline.predict(test_copy)

In [None]:
pred = gb.predict(test_copy)

In [33]:
submission = pd.DataFrame([test_frame['id'], prediction]).transpose()
submission.columns = ['id', 'damage_grade']

In [34]:
submission.head()

Unnamed: 0,id,damage_grade
0,0.0,2.0
1,1.0,5.0
2,2.0,5.0
3,3.0,4.0
4,4.0,2.0


In [35]:
submission.to_csv('Submission Pipeline.csv', index=False)

In [36]:
sub2 = pd.read_csv('Submission Pipeline.csv')
sub2 = submission.astype(int)
sub2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242082 entries, 0 to 242081
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   id            242082 non-null  int32
 1   damage_grade  242082 non-null  int32
dtypes: int32(2)
memory usage: 1.8 MB


In [37]:
sub2.to_csv('Submission Pipeline Fix.csv', index=False)

In [None]:
sub = pd.read_csv('Submission Mageena.csv')
sub.head()

In [None]:
sub.info()

In [None]:
sub.to_csv('Submission Mageena.csv', index=False)