In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import os.path

%matplotlib inline

# Apply seaborn's "ticks" theme globally, with seaborn color codes enabled.
sns.set(color_codes=True, style="ticks")

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Build the merged train/test frames, reusing the cached CSVs when both exist.
cache_found = os.path.exists('Dataset/full_train.csv') and os.path.exists('Dataset/full_test.csv')

if cache_found:

    # Merged frames were written by an earlier run: load them directly.
    df_train = pd.read_csv('Dataset/full_train.csv')
    df_test = pd.read_csv('Dataset/full_test.csv')
else:

    # Load the raw train/test rows plus the two building-level tables.
    df_train = pd.read_csv('Dataset/train.csv')
    df_test = pd.read_csv('Dataset/test.csv')
    df_struct = pd.read_csv('Dataset/Building_Structure.csv')
    df_own = pd.read_csv('Dataset/Building_Ownership_Use.csv')

    # Join structure and ownership first, then attach the result to each split.
    df_merge = pd.merge(df_struct, df_own, on=['building_id', 'district_id', 'vdcmun_id', 'ward_id'])
    df_train = pd.merge(df_train, df_merge, on=['building_id', 'district_id', 'vdcmun_id'])
    df_test = pd.merge(df_test, df_merge, on=['building_id', 'district_id', 'vdcmun_id'])

    del df_struct, df_own

# Impute the two sparse columns on BOTH frames (previously count_families was
# only filled on the training frame). The result is assigned back instead of
# using a chained `df[col].fillna(..., inplace=True)`, which recent pandas
# warns about and which has no effect under Copy-on-Write.
# This also runs for cached frames: fillna is a no-op when nothing is missing,
# and it repairs caches written before the test-side imputation existed.
for frame in (df_train, df_test):
    frame["has_repair_started"] = frame["has_repair_started"].fillna(0.0)
    frame["count_families"] = frame["count_families"].fillna(1.0)

if not cache_found:
    # Persist the merged, imputed frames so later runs can skip the merge.
    df_train.to_csv('Dataset/full_train.csv', index=False)
    df_test.to_csv('Dataset/full_test.csv', index=False)


In [3]:
def clean_land_surface_condition(raw):
    """Collapse a land-surface label to ``'flat'`` or ``'slope'``.

    The comparison is case-insensitive; any value that is not exactly
    ``'flat'`` after lower-casing is reported as ``'slope'``.
    """
    surface = raw.lower()
    return surface if surface == 'flat' else "slope"

def clean_plan_configuration(raw):
    """Collapse a plan-configuration label to ``'rectangular'`` or ``'others'``.

    The comparison is case-insensitive; every non-rectangular value maps to
    ``'others'``.
    """
    plan = raw.lower()
    if plan == 'rectangular':
        return plan
    return "others"

def clean_position(raw):
    """Collapse a building-position label to ``'not attached'`` or ``'attached'``.

    The comparison is case-insensitive; anything other than ``'not attached'``
    is treated as ``'attached'``.
    """
    position = raw.lower()
    return position if position == 'not attached' else "attached"

def clean_ground_floor_type(raw):
    """Classify a ground-floor label as ``'mud'`` or ``'hard_floor'``.

    A value counts as mud when the substring ``'mud'`` appears anywhere in it
    (case-insensitive); everything else is ``'hard_floor'``.
    """
    contains_mud = 'mud' in raw.lower()
    return 'mud' if contains_mud else 'hard_floor'

def clean_area_assesed(raw):
    """Merge the ``exterior`` / ``interior`` assessment labels into ``'visible'``.

    Matching is case-insensitive. Any other value is returned lower-cased and
    otherwise unchanged.
    """
    area = raw.lower()
    if area in ('exterior', 'interior'):
        return 'visible'
    return area

def clean_foundation_type(raw):
    """Bucket a foundation label into ``'mud'``, ``'wooden'`` or ``'cemented'``.

    The keywords are searched case-insensitively, in order: a value containing
    ``'mud'`` wins over one containing ``'bamboo'``; anything matching neither
    falls back to ``'cemented'``.
    """
    text = raw.lower()
    for keyword, bucket in (('mud', 'mud'), ('bamboo', 'wooden')):
        if keyword in text:
            return bucket
    return 'cemented'

def clean_roof_type(raw):
    """Bucket a roof label into ``'light roof'``, ``'heavy roof'`` or ``'rcc'``.

    The phrases are searched case-insensitively, with ``'light roof'`` checked
    before ``'heavy roof'``; anything matching neither is reported as ``'rcc'``.
    """
    text = raw.lower()
    for roof_kind in ('light roof', 'heavy roof'):
        if roof_kind in text:
            return roof_kind
    return 'rcc'

In [4]:
datasets = [df_train, df_test]

# Categorical column -> function that collapses its raw labels.
column_cleaners = {
    'land_surface_condition': clean_land_surface_condition,
    'plan_configuration': clean_plan_configuration,
    'position': clean_position,
    'ground_floor_type': clean_ground_floor_type,
    'area_assesed': clean_area_assesed,
    'foundation_type': clean_foundation_type,
    'roof_type': clean_roof_type,
}

for dataset in datasets:

    # Apply the same label clean-up to the train and test frames.
    for column, cleaner in column_cleaners.items():
        dataset[column] = dataset[column].apply(cleaner)

    # Binary flag: building age above 30.
    dataset['isBuildingOld'] = dataset['age_building'].gt(30).astype('int')

    # Binary flags: floor count / height is lower after the earthquake than before.
    floors_lost = dataset['count_floors_pre_eq'].sub(dataset['count_floors_post_eq'])
    dataset['isFloorRemoved'] = floors_lost.gt(0).astype('int')

    height_lost = dataset['height_ft_pre_eq'].sub(dataset['height_ft_post_eq'])
    dataset['isHeightChanged'] = height_lost.gt(0).astype('int')

In [5]:
# Columns removed from both frames before modelling.
cols_to_drop = [
    # geotechnical-risk flags
    'has_geotechnical_risk_fault_crack',
    'has_geotechnical_risk_flood',
    'has_geotechnical_risk_land_settlement',
    'has_geotechnical_risk_landslide',
    'has_geotechnical_risk_liquefaction',
    'has_geotechnical_risk_other',
    'has_geotechnical_risk_rock_fall',
    # secondary-use flags
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
    # ownership, plus the raw columns the isBuildingOld / isFloorRemoved /
    # isHeightChanged flags were computed from
    'legal_ownership_status',
    'age_building',
    'count_floors_pre_eq',
    'count_floors_post_eq',
    'height_ft_pre_eq',
    'height_ft_post_eq',
    # superstructure flags
    'has_superstructure_adobe_mud',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_other',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
]

# Drop in place, train first and then test.
for frame in (df_train, df_test):
    frame.drop(columns=cols_to_drop, inplace=True)

In [None]:
# One plot per column: count plot for object (string) columns, histogram otherwise.
for col in df_train.columns.tolist():
    if col == 'building_id':
        # The identifier column is not plotted.
        continue

    if df_train[col].dtype == 'object':
        sns.countplot(y=col, data=df_train)
    else:
        plt.xlabel(col)
        plt.hist(x=col, data=df_train)
    plt.show()

In [6]:
# Categorical columns to expand into one-hot indicator columns.
features_to_encode = [
    "area_assesed",
    "foundation_type",
    "land_surface_condition",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "condition_post_eq",
]

# Encode each frame independently.
df_train, df_test = [
    pd.get_dummies(frame, columns=features_to_encode)
    for frame in (df_train, df_test)
]

In [7]:
# Remove the 'Covered by landslide' indicator column from both frames, in place.
for frame in (df_train, df_test):
    frame.drop(columns=['condition_post_eq_Covered by landslide'], inplace=True)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode the damage-grade target as integer class codes; `le` stays fitted so
# the codes can be mapped back to labels later.
target = df_train['damage_grade']
le = LabelEncoder()

Y = le.fit_transform(target)

In [10]:
# Feature matrix: every training column except the identifier and the target.
X = df_train.drop(columns=['building_id', 'damage_grade'])

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2
)

In [None]:
# Baseline random forest with library-default hyper-parameters.
# The forest is seeded (same seed as the train/test split) so the scores below
# are reproducible across kernel restarts; it was previously unseeded.
rf = RandomForestClassifier(random_state=42).fit(X_train, Y_train)

In [None]:
# Weighted F1 of the baseline model on the rows it was trained on.
f1_score(y_true=Y_train, y_pred=rf.predict(X_train), average='weighted')

In [None]:
# Baseline model's predicted class codes for the held-out split.
preds = rf.predict(X=X_test)

In [None]:
# Weighted F1 of the baseline model on the held-out split.
f1_score(y_true=Y_test, y_pred=preds, average='weighted')

In [11]:
from imblearn.combine import SMOTETomek

# Rebalance the classes with SMOTE over-sampling followed by Tomek-link cleaning.
# - `fit_resample` replaces `fit_sample`, which current imbalanced-learn
#   releases no longer provide.
# - The former `m=10` argument is dropped: SMOTETomek no longer accepts it
#   (SMOTE settings are passed via the `smote=` estimator instead).
# NOTE(review): X and Y are overwritten here, so re-running this cell resamples
# the already-resampled data; re-run the cell that builds X first.
# NOTE(review): resampling happens before the train/test split in the next
# cell, so synthetic samples end up in the hold-out set and the validation
# score is likely optimistic. Consider splitting first and resampling only the
# training part.
smotet = SMOTETomek(random_state=42)
X, Y = smotet.fit_resample(X, Y)
X = pd.DataFrame(X)
# NOTE(review): lowercase `y` is not read by any cell visible here (later cells
# use `Y`); it is kept only so existing references keep working.
y = pd.DataFrame(Y)



In [12]:
# Re-split the resampled data: 20% held out, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2
)

In [15]:
# 100-tree random forest trained on the resampled training split.
# The forest is seeded (same seed as the split and the resampler) so the scores
# and the submission are reproducible; it was previously unseeded.
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, Y_train)

In [16]:
# Weighted F1 of the final model on the rows it was trained on.
# classification_report and f1_score are already imported in the notebook's
# first cell, so the duplicate import that used to sit here is removed.
f1_score(Y_train, rfc.predict(X_train), average='weighted')

0.9877230872383114

In [17]:
# Final model's predicted class codes for the held-out split.
preds = rfc.predict(X=X_test)

In [18]:
# Weighted F1 of the final model on the held-out split.
f1_score(y_true=Y_test, y_pred=preds, average='weighted')

0.8095201454106639

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=Y_test, y_pred=preds))

In [None]:
# Peek at the first five rows of the encoded test frame.
df_test.head(n=5)

In [None]:
# Peek at the first five rows of the encoded training frame.
df_train.head(n=5)

In [19]:
# Feature matrix for the competition test set.
# train and test were one-hot encoded separately, so their dummy columns are not
# guaranteed to match. Reindex to the training feature columns (same order the
# model was fitted on): indicator columns absent from the test frame are filled
# with 0 and columns unseen in training are discarded, instead of erroring or
# silently feeding features in shifted positions.
train_feature_cols = df_train.columns.drop(['building_id', 'damage_grade'])
XX_test = (
    df_test
    .drop(columns=['building_id'])
    .reindex(columns=train_feature_cols, fill_value=0)
)

In [20]:
# Predicted class codes for the competition test set.
test_preds = rfc.predict(X=XX_test)

In [21]:
# Submission frame: building identifier plus the predicted class code.
submission = df_test[['building_id']].assign(damage_grade=test_preds)

In [22]:
# Map the integer class codes back to the original damage-grade labels using
# the encoder fitted on the training target, rather than re-deriving the
# mapping by hand as "Grade " + str(code + 1).
# Reading from `test_preds` (not from the column being overwritten) keeps this
# cell safe to re-run.
# NOTE(review): assumes the training labels are the "Grade N" strings the
# submission expects - confirm against df_train.damage_grade / le.classes_.
submission['damage_grade'] = le.inverse_transform(test_preds)

In [23]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf='final_submission.csv', index=False)

In [24]:
# Show the first five submission rows as a sanity check.
submission.head(n=5)

Unnamed: 0,building_id,damage_grade
0,a3380c4f75,Grade 3
1,a338a4e653,Grade 5
2,a338a4e6b7,Grade 5
3,a33a6eaa3a,Grade 3
4,a33b073ff6,Grade 5
