In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import os.path

%matplotlib inline

sns.set(style="ticks", color_codes=True)

In [2]:
PATH = '../data/datasets/rahul110/building-dataset-hackerearth-ml-6/'

In [3]:
if  os.path.exists(PATH + 'full_train.csv') and os.path.exists(PATH + 'full_test.csv'):
    
    #check for the full train and test set
    df_train = pd.read_csv(PATH + 'full_train.csv')
    df_test = pd.read_csv(PATH + 'full_test.csv')
    
    df_train["has_repair_started"].fillna(0.0,inplace=True)
    df_test["has_repair_started"].fillna(0.0,inplace=True)
    df_train["count_families"].fillna(1.0,inplace=True)
else:
    
    # Load train and test data
    df_train = pd.read_csv(PATH + 'train.csv')
    df_test = pd.read_csv(PATH + 'test.csv')
    df_struct = pd.read_csv(PATH + 'Building_Structure.csv')
    df_own = pd.read_csv(PATH + 'Building_Ownership_Use.csv')

    df_merge = pd.merge(df_struct,df_own,on=['building_id', 'district_id', 'vdcmun_id', 'ward_id'])
    df_train = pd.merge(df_train,df_merge,on=['building_id', 'district_id', 'vdcmun_id'])
    df_test  = pd.merge(df_test, df_merge, on =['building_id', 'district_id', 'vdcmun_id'])

    del df_struct, df_own, df_merge

    df_train.to_csv(PATH + 'full_train.csv',index=False)
    df_test.to_csv(PATH + 'full_test.csv',index=False)
    
    df_train["has_repair_started"].fillna(0.0,inplace=True)
    df_test["has_repair_started"].fillna(0.0,inplace=True)
    df_train["count_families"].fillna(1.0,inplace=True)


In [4]:
def clean_land_surface_condition(raw):
    if raw.lower() != 'flat':
        return "slope"
    else:
        return raw.lower()

def clean_plan_configuration(raw):
    if raw.lower() != 'rectangular':
        return "others"
    else:
        return raw.lower()

def clean_position(raw):
    if raw.lower() != 'not attached':
        return "attached"
    else:
        return raw.lower()

def clean_ground_floor_type(raw):
    if 'mud' in raw.lower():
        return 'mud'
    else:
        return 'hard_floor'

def clean_area_assesed(raw):
    if raw.lower() == 'exterior' or raw.lower() == 'interior':
        return 'visible'
    else:
        return raw.lower()

def clean_foundation_type(raw):
    if 'mud' in raw.lower():
        return 'mud'
    elif 'bamboo' in raw.lower():
        return 'wooden'
    else:
        return 'cemented'

def clean_roof_type(raw):
    if 'light roof' in raw.lower():
        return 'light roof'
    elif 'heavy roof' in raw.lower():
        return 'heavy roof'
    else:
        return 'rcc'

In [5]:
datasets = [df_train, df_test]

for dataset in datasets:
    
    dataset.land_surface_condition = dataset.land_surface_condition.apply(clean_land_surface_condition)

    dataset.plan_configuration = dataset.plan_configuration.apply(clean_plan_configuration)

    dataset.position = dataset.position.apply(clean_position)

    dataset.ground_floor_type = dataset.ground_floor_type.apply(clean_ground_floor_type)

    dataset.area_assesed = dataset.area_assesed.apply(clean_area_assesed)

    dataset.foundation_type = dataset.foundation_type.apply(clean_foundation_type)

    dataset.roof_type = dataset.roof_type.apply(clean_roof_type)

    dataset['isBuildingOld'] = (dataset.age_building > 30).astype('int')

    dataset['isFloorRemoved'] = ((dataset['count_floors_pre_eq']-dataset['count_floors_post_eq']) > 0 ).astype('int')

    dataset['isHeightChanged'] = ((dataset.height_ft_pre_eq - dataset.height_ft_post_eq) > 0 ).astype('int')
    
    dataset['isBuildingRemoved'] = (dataset.count_floors_post_eq == 0).astype('int')
        
    #dataset['height_per_floor_pre_eq'] = dataset.height_ft_pre_eq / dataset.count_floors_pre_eq
    
    dataset['volume_per_floor_pre_eq'] = np.log(dataset.plinth_area_sq_ft * (dataset.height_ft_pre_eq / dataset.count_floors_pre_eq))
    
    dataset['volume_per_floor_post_eq'] = ((dataset.height_ft_post_eq + 1) / (dataset.count_floors_post_eq + 1) * dataset.plinth_area_sq_ft).apply(np.log)
    #dataset.plinth_area_sq_ft = dataset.plinth_area_sq_ft.apply(lambda x : np.log(x))
     
    dataset['isResidential'] = (dataset.count_families > 0).astype('int')
    
    dataset.age_building = dataset.age_building.apply(lambda x : np.log(x + 1))

In [None]:
#cols_to_drop = ['has_geotechnical_risk_fault_crack', 'has_geotechnical_risk_flood',
#                'has_geotechnical_risk_land_settlement', 'has_geotechnical_risk_landslide',
#                'has_geotechnical_risk_liquefaction', 'has_geotechnical_risk_other', 
#                'has_geotechnical_risk_rock_fall','has_secondary_use_agriculture',
#                'has_secondary_use_hotel', 'has_secondary_use_rental',
#                'has_secondary_use_institution', 'has_secondary_use_school',
#                'has_secondary_use_industry', 'has_secondary_use_health_post',
#                'has_secondary_use_gov_office', 'has_secondary_use_use_police',
#                'has_secondary_use_other','legal_ownership_status','age_building',
#                'count_floors_pre_eq','count_floors_post_eq','height_ft_pre_eq','height_ft_post_eq',
#                'has_superstructure_adobe_mud', 'has_superstructure_stone_flag', 
#                'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 
#                'has_superstructure_cement_mortar_brick', 'has_superstructure_other',
#                'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
#                'has_superstructure_rc_engineered']

#df_train.drop(columns=cols_to_drop,inplace=True,axis=1)
#df_test.drop(columns=cols_to_drop,inplace=True,axis=1)

In [None]:
cols_to_drop = ['count_floors_pre_eq','count_floors_post_eq','height_ft_pre_eq','height_ft_post_eq', 'plinth_area_sq_ft']

df_train.drop(columns=cols_to_drop,inplace=True,axis=1)
df_test.drop(columns=cols_to_drop,inplace=True,axis=1)

In [None]:
for col in df_train.columns.tolist():
    if col != 'building_id':
        if df_train[col].dtype == 'object':
            sns.countplot(y=col,data=df_train)
            plt.show()
        else:
            plt.xlabel(col)
            plt.hist(x=col,data=df_train)
            plt.show()

In [6]:
features_to_encode = ["area_assesed","foundation_type", "land_surface_condition","roof_type",'legal_ownership_status',
                      "ground_floor_type","other_floor_type","position","plan_configuration","condition_post_eq"]

from sklearn.preprocessing import LabelEncoder

for dataset in datasets:
    for feature in features_to_encode:
        #df_train[feature] = df_train[feature].astype('category').cat.codes
        le = LabelEncoder().fit(dataset[feature])
        dataset[feature] = le.transform(dataset[feature])                  

In [None]:
df_train.drop(columns=['condition_post_eq_Covered by landslide'],inplace=True,axis=1)
df_test.drop(columns=['condition_post_eq_Covered by landslide'],inplace=True,axis=1)

In [7]:
from sklearn.preprocessing import LabelEncoder

target = df_train.damage_grade
le = LabelEncoder().fit(target)

Y = le.transform(target)

In [8]:
X = df_train.drop(columns=['building_id','damage_grade'],axis=1)

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [9]:
import xgboost as xgb

In [10]:
def compute_f1(preds,dtrain):
    
    labels = dtrain.get_label()
    
    pred_label = np.argmax(preds, axis=1)
    
    f1 = f1_score(labels,pred_label,average='weighted')
    
    return 'f1_score',f1

In [16]:
param = {
    'max_depth': 10,  # the maximum depth of each tree
    'eta': 0.05,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 5,
    'subsample' : 0.7,
    'colsample_bytree': 0.7,
    'seed':3,
    #'predictor':'gpu_predictor',
    'lambda':10}  # the number of classes that exist in this datset
num_round = 10  # the number of training iterations

In [31]:
def train_xgb(X_train, y_train, X_val, y_val):
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    
    dtest = xgb.DMatrix(X_val, label=y_val)
    
    watchlist  = [(dtest,'test'), (dtrain,'train')]
    bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=2,feval=compute_f1)
    
    return bst

In [32]:
def train_and_predict(X_train, y_train, X_val, y_val, X_test):

    bst = train_xgb(X.iloc[train_index].values, Y[train_index], X.iloc[test_index].values, Y[test_index])
    
    bst.save_model('mymodel')
    bst = xgb.Booster(param)
    bst.load_model('mymodel')
    
    #xgboost issue : https://github.com/dmlc/xgboost/issues/1238
    
    dpredict = xgb.DMatrix(X_test)
    pred = bst.predict(dpredict)
    
    return pred

In [29]:
#from sklearn.model_selection import KFold
#kf = KFold(n_splits=3, shuffle=True, random_state=3)

#models= []
#for train_index, test_index in kf.split(df_train):    
#    print("Split")
#    model = train_xgb(X.iloc[train_index].values, Y[train_index], X.iloc[test_index].values, Y[test_index])
#    models.append(model)`

In [14]:
X_test = df_test.drop(columns=['building_id'],axis=1)

In [33]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=3)

preds = []

for train_index, test_index in kf.split(df_train):    
    print("Split")
    pred = train_and_predict(X.iloc[train_index].values, Y[train_index], X.iloc[test_index].values, Y[test_index],X_test)
    preds.append(pred)

Split
[0]	test-merror:0.311866	train-merror:0.306386	test-f1_score:0.684526	train-f1_score:0.689858
Multiple eval metrics have been passed: 'train-f1_score' will be used for early stopping.

Will train until train-f1_score hasn't improved in 2 rounds.
[1]	test-merror:0.274585	train-merror:0.270748	test-f1_score:0.722167	train-f1_score:0.725852
[2]	test-merror:0.27515	train-merror:0.270518	test-f1_score:0.719045	train-f1_score:0.723741
Stopping. Best iteration:
[0]	test-merror:0.311866	train-merror:0.306386	test-f1_score:0.684526	train-f1_score:0.689858

Split
[0]	test-merror:0.308015	train-merror:0.304461	test-f1_score:0.691131	train-f1_score:0.694471
Multiple eval metrics have been passed: 'train-f1_score' will be used for early stopping.

Will train until train-f1_score hasn't improved in 2 rounds.
[1]	test-merror:0.275924	train-merror:0.27227	test-f1_score:0.721153	train-f1_score:0.724781
[2]	test-merror:0.275634	train-merror:0.272006	test-f1_score:0.719294	train-f1_score:0.722882
S

In [None]:
submission = pd.DataFrame({
    'building_id' : df_test['building_id'],
    'damage_grade' : test_preds.astype(int)
})

submission.damage_grade = submission.damage_grade.apply(lambda x : "Grade " + str(x+1) )

submission.to_csv('final_submission.csv',index=False)

In [None]:
test_preds.astype(int)

In [None]:
submission.damage_grade.unique()

In [None]:
submission.head()

In [None]:
df_test.head()

In [25]:
avg = (preds[0] + preds[1] + preds[2])/3

In [26]:
pred_label = np.argmax(avg, axis=1)

In [27]:
pred_label

array([1, 4, 4, ..., 0, 4, 4])