In [1]:
import pandas as pd
import numpy as np
import pandasql as ps

from sklearn.impute import SimpleImputer
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

In [2]:
# raw
train_val = pd.read_csv("/Users/jiamin/Documents/869 team project/Training_set_values.csv")
train_lbl = pd.read_csv("/Users/jiamin/Documents/869 team project/Training_set_labels.csv")
test_val = pd.read_csv("/Users/jiamin/Documents/869 team project/Test_set_values.csv")

# ordinal encoded raw
train_prep = pd.read_csv("/Users/jiamin/Documents/869 team project/preprocessed data/train_data.csv")
test_prep = pd.read_csv("/Users/jiamin/Documents/869 team project/preprocessed data/test.csv")

In [3]:
train_set = train_val.merge(train_lbl, on='id',how='left')

In [4]:
################ payment 
# bin 'other' category to 'unknown' group

train_set.loc[train_set.payment_type == 'other', 'payment_type'] = 'unknown'

Ord_enc = OrdinalEncoder()
Ord_enc = Ord_enc.fit(train_set[['payment_type']])
train_set['payment_type_ord'] = Ord_enc.transform(train_set[['payment_type']])

In [5]:
############# quality
# 'good' has the highest number of all target -> create individual dummy

train_set['quality_ind'] = np.where(train_set['water_quality'] == 'good',1,0)

In [6]:
############ construction year
# bin years into new and old and create unknown for all the year = 0

def construction(df):
    if df['construction_year'] >= 1986:
        val = 'New'
    elif df['construction_year'] == 0:
        val = 'Unknown'
    else:
        val = 'Old'
    return val

train_set['construction_cat'] = train_set.apply(construction, axis=1)
train_set.drop('construction_year',axis=1)

Ord_enc_2 = OrdinalEncoder()
Ord_enc_2 = Ord_enc_2.fit(train_set[['construction_cat']])
train_set['construction_cat'] = Ord_enc_2.transform(train_set[['construction_cat']])

In [7]:
############ permit 
# missing imputation and create dummy to indicate whether the value was imputed or not

imp = SimpleImputer(missing_values = np.nan, strategy='median') 
train_set['permit_imp'] = imp.fit_transform(train_set[['permit']]).ravel()

train_set['permit_imp_ind'] = np.where(train_set['permit'].isna(),1,0)

In [8]:
############ longitude
#impute missing data to median of the same location (subvillage, ward, lga)

train_set.loc[train_set.longitude == 0, 'longitude'] = np.nan
train_set.longitude.fillna(train_set.groupby(['subvillage'])['longitude'].transform('median'), inplace = True)
train_set.longitude.fillna(train_set.groupby(['ward'])['longitude'].transform('median'), inplace = True)
train_set.longitude.fillna(train_set.groupby(['lga'])['longitude'].transform('median'), inplace = True)

In [9]:
############ latitude
#impute missing data to median of the same location (subvillage, ward, lga)

train_set_latitude = train_set[train_set['latitude'].round(0) == -0]
train_set.loc[train_set.latitude.round(0) == -0.0, 'latitude'] = np.nan
train_set.latitude.fillna(train_set.groupby(['subvillage'])['latitude'].transform('median'), inplace = True)
train_set.latitude.fillna(train_set.groupby(['ward'])['latitude'].transform('median'), inplace = True)
train_set.latitude.fillna(train_set.groupby(['lga'])['latitude'].transform('median'), inplace = True)

In [10]:
############ gps_height
#impute missing data to mean of the same location (subvillage, ward, lga,region_code)

train_set.loc[train_set.gps_height == 0, 'gps_height'] = np.nan
train_set.gps_height.fillna(train_set.groupby(['subvillage'])['gps_height'].transform('mean'), inplace = True)
train_set.gps_height.fillna(train_set.groupby(['ward'])['gps_height'].transform('mean'), inplace = True)
train_set.gps_height.fillna(train_set.groupby(['lga'])['gps_height'].transform('mean'), inplace = True)
train_set.gps_height.fillna(train_set.groupby(['region_code'])['gps_height'].transform('mean'), inplace=True)

In [11]:
############ population
#impute missing data to median of the same location (subvillage, ward, lga,region_code)
train_set['popu_ind'] = np.where(train_set['population'] == 0,1,0)

train_set.loc[train_set.population == 0, 'population'] = np.nan
train_set.population.fillna(train_set.groupby(['subvillage'])['population'].transform('median'), inplace = True)
train_set.population.fillna(train_set.groupby(['ward'])['population'].transform('median'), inplace = True)
train_set.population.fillna(train_set.groupby(['lga'])['population'].transform('median'), inplace = True)
train_set.population.fillna(train_set.groupby(['region_code'])['population'].transform('median'), inplace=True)



In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_hist(feature, title):
    plt.figure(figsize=(8, 5));
    plt.hist(feature, bins=200, edgecolor='black', linewidth=1.2);
    plt.title(title, fontsize=20);
    #ax.tick_params(axis='both', which='major', labelsize=18);
    plt.grid(True);
    

In [None]:
plot_hist(train_set['population'], "dist")

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_set[['population']])

train_set['popu_Std'] = scaler.transform(train_set[['population']])   


In [None]:
plot_hist(train_set['popu_Std'], "Standard");

In [13]:
from sklearn.preprocessing import FunctionTransformer

scaler_log = FunctionTransformer(np.log1p, validate=True)
scaler_log.fit(train_set[['popu_Std']])
train_set['popu_StdLog'] = scaler_log.transform(train_set[['popu_Std']]) 

In [None]:
plot_hist(train_set['popu_StdLog'], "StdLog");

In [14]:
############ fix typo installer

def fix_typo(df):
    df['installer'].replace(to_replace = ('ADRA /Government') , value ='ADRA/Government' , inplace=True)
    df['installer'].replace(to_replace = ('British colonial government') , value ='British government' , inplace=True)
    df['installer'].replace(to_replace = ('Central Government','Tanzania Government', 'central government','Cental Government', 'Cebtral Government', 'Tanzanian Government','Tanzania government', 'Centra Government', 'CENTRAL GOVERNMENT', 'TANZANIAN GOVERNMENT','Central govt', 'Centr', 'Centra govt') , value ='Central government' , inplace=True)
    df['installer'].replace(to_replace =('Commu','Communit','commu','COMMU', 'COMMUNITY'), value ='Community' , inplace=True)
    df['installer'].replace(to_replace = ('Concern /government') , value ='Concern/Government' , inplace=True)
    df['installer'].replace(to_replace = ('Colonial Government') , value ='Colonial government' , inplace=True)
    df['installer'].replace(to_replace = ('COUN', 'District COUNCIL', 'DISTRICT COUNCIL','District Counci', 'District Council','Council','Counc','District Council','Distri'), value ='District council' , inplace=True)
    df['installer'].replace(to_replace = ('District Water Department', 'District water depar','Distric Water Department'), value ='District water department' , inplace=True)
    df['installer'].replace(to_replace = 'DANID', value ='DANIDA' , inplace=True)
    df['installer'].replace(to_replace = ('FinW','Fini water','FINI WATER'), value ='Fini Water' , inplace=True)
    df['installer'].replace(to_replace = ('GOVERNMENT', 'GOVER', 'GOVERNME', 'GOVERM','GOVERN','Gover','Gove','Governme','Governmen' ) ,value ='Government' , inplace=True)
    df['installer'].replace(to_replace = ('Government of Misri') , value ='Misri Government' , inplace=True)
    df['installer'].replace(to_replace = ('Government and Community') , value ='Government /Community' , inplace=True)
    df['installer'].replace(to_replace = ('Government /TCRS','Government/TCRS') , value ='TCRS /Government' , inplace=True)
    df['installer'].replace(to_replace = 'Hesawa', value ='HESAWA' , inplace=True)
    df['installer'].replace(to_replace = ('Italy government') , value ='Italian government' , inplace=True)
    df['installer'].replace(to_replace = 'JAICA', value ='Jaica' , inplace=True)
    df['installer'].replace(to_replace = ('RC CHURCH', 'RC Churc', 'RC','RC Ch','RC C', 'RC CH','RC church', 'RC CATHORIC',) , value ='RC Church' , inplace=True)
    df['installer'].replace(to_replace = ('World vision', 'World Division','World Vision'), value ='world vision' , inplace=True)
    df['installer'].replace(to_replace = ('Unisef','UNICEF'),value ='Unicef' , inplace=True)
    df['installer'].replace(to_replace = ('villigers', 'villager', 'Villagers', 'Villa', 'Village', 'Villi', 'Village Council','Village Counil', 'Villages', 'Vill', 'Village community', 'Villaers', 'Village Community', 'Villag','Villege Council', 'Village council', 'Village Council','Villagerd', 'Villager', 'Village Technician', 'Village Office','Village community members'), value ='villagers' , inplace=True)
    df['installer'].replace(to_replace = ('Village Government') , value ='Village government' , inplace=True)
    df['installer'].replace(to_replace = ('Cetral government /RC') , value ='RC church/Central Gover' , inplace=True)
    return df

train_set= fix_typo(train_set)

In [15]:
############ installer
train_set.installer.fillna('other', inplace=True)
train_set.loc[train_set.installer.str.len() == 1, 'installer'] = 'other'

Ord_enc_installer = OrdinalEncoder()
Ord_enc_installer = Ord_enc_installer.fit(train_set[['installer']])
train_set['installer_ord'] = Ord_enc_installer.transform(train_set[['installer']])

In [16]:
############## funder
train_set['funder'].replace(to_replace = '0', value ='None' , inplace=True)
train_set['funder'].replace(to_replace = 'Unknown', value ='None' , inplace=True)

columns_to_keep = ['Government Of Tanzania','Unknown','Danida','Hesawa','Rwssp','World Bank','Kkkt', 'World Vision',
         'Unicef','Tasaf','District Council', 'Dhv', 'Private Individual', 'Dwsp','Norad','Germany Republi',
         'Tcrs','Ministry Of Water','Water','Dwe']

train_set.loc[~train_set["funder"].isin(columns_to_keep), "funder"] = "Others"
train_set['funder'] = train_set['funder'].astype("category").cat.remove_unused_categories()


Ord_enc_funder = OrdinalEncoder()
Ord_enc_funder = Ord_enc_funder.fit(train_set[['funder']])
train_set['funder_ord'] = Ord_enc_funder.transform(train_set[['funder']])

In [None]:
############# subvillage
# train_set.loc[train_set.subvillage.isnull(), 'subvillage'] = 'unknown'
# train_set.subvillage.value_counts()[:20]


# print(ps.sqldf("select subvillage,count(*) as num from train_set \
# group by subvillage"))

In [17]:
############# scheme_management

train_set.loc[train_set.scheme_management == 'None', 'scheme_management'] = 'Other'
imp2 = SimpleImputer(missing_values = np.nan, strategy='most_frequent') 
train_set['scheme_management'] = imp2.fit_transform(train_set[['scheme_management']]).ravel()

Ord_enc_3 = OrdinalEncoder()
Ord_enc_3 = Ord_enc.fit(train_set[['scheme_management']])
train_set['scheme_management_ord'] = Ord_enc_3.transform(train_set[['scheme_management']])

# print(ps.sqldf("select scheme_management,count(*) as num from train_set \
#  group by scheme_management"))

In [18]:
############# public_meeting

train_set['public_meeting'].astype('float')
train_set.loc[train_set.public_meeting.isnull(), 'public_meeting'] = 2
# print(ps.sqldf("select public_meeting,count(*) as num from train_set \
#  group by public_meeting"))

train_set['public_meeting']= train_set['public_meeting'].astype('float64')

In [None]:
train_set.info()

numerical: lga, ward, region, longitude, latitude,gps_height,population
categorcial: basin,installer, extraction_type_class, management_group, source, waterpoint_type, quality_group,public_meeting

In [None]:
train_prep = pd.read_csv("/Users/jiamin/Documents/869 team project/preprocessed data/train_data.csv")

In [19]:
# combine datasets
train_set_pre = train_set[['quality_ind',
                           'construction_cat',
                           'permit_imp',
                           'permit_imp_ind',
                           'id',
                           'payment_type_ord',
                           'longitude',
                           'latitude',
                           'gps_height',
                           'population',
                           'popu_ind',                 
                           'popu_Std',               
                            'popu_StdLog',           
                            'installer_ord',           
                            'funder_ord',              
                            'scheme_management_ord', 
                           'public_meeting',
                           'status_group']]



train_set_ord = train_prep.drop(['construction_year',
                                 'permit',
                                 'amount_tsh',
                                 'gps_height',
                                 'longitude',
                                 'latitude',
                                 'population',
                                 'funder',
                                 'installer',
                                 'wpt_name',
                                 'num_private',
                                 'date_recorded','recorded_by',
                                 'subvillage',
                                 'region_code','district_code',
                                 'extraction_type','extraction_type_group',
                                 'scheme_management','scheme_name',
                                 'public_meeting',
                                 'management',
                                 'water_quality',
                                 'payment_type','payment',
                                 'quantity_group',
                                 'source_type','source_class',
                                 'waterpoint_type_group',
                                 'status_group'],axis=1)

train_full = train_set_pre.merge(train_set_ord, on='id',how='left')

In [20]:
# test control split
train_full_y = train_full['status_group']
train_full_x = train_full.drop(['status_group'], axis=1)
train_full_x_train, train_full_x_test, train_full_y_train, train_full_y_test = train_test_split(train_full_x, train_full_y, test_size=0.2, random_state=42)

In [None]:
train_full_x_train.info()

# Modelling

In [21]:
params = {
      "learning_rate": 0.03,
      "max_depth": 6,
      "min_child_samples": 63,
      "min_data_per_group": 100,
      "n_estimators": 200,
      "num_leaves": 63,
      "reg_alpha":  0.05,
      "reg_lambda": 0.05,
      "subsample": 0.8,
      "boosting_type": 'gbtree',
      "n_jobs": 1,
      "verbosity": 0,
      "seed": 77,
    "is_unbalance": True,
}

estimator_v1 = XGBClassifier(**params)
estimator_v1 = estimator_v1.fit(train_full_x_train, train_full_y_train)



In [38]:
estimator_v2 = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree', nrounds = 'min.error.idx',
                        num_class = 3, maximize = False, eval_metric = 'merror', eta = .1,
                        max_depth = 10, colsample_bytree = .4, n_jobs = -1,learning_rate = 0.05, verbose=2,
                             is_unbalance = True)

estimator_v2 = estimator_v2.fit(train_full_x_train, train_full_y_train)




In [40]:
estimator_v3 = XGBClassifier(nthread=2, num_class=3, 
                        min_child_weight=3, max_depth=15,
                        gamma=0.5, scale_pos_weight=0.8,
                        subsample=0.7, colsample_bytree = 0.8,
                        objective='multi:softmax',
                            is_unbalance = True)

estimator_v3 = estimator_v3.fit(train_full_x_train, train_full_y_train)



In [41]:
# cross validation
inner_cv_scores = cross_validate(estimator_v3, train_full_x_train, train_full_y_train, 
                                 cv=5, 
                                 scoring="accuracy", 
                                 n_jobs=15, 
                                 verbose=0, 
                                 return_train_score=True)

cv_scores=inner_cv_scores['test_score'].tolist()
fit_times=inner_cv_scores['fit_time'].tolist()

print("CV Scores:")
print(["{:0.4f}".format(cv_score) for cv_score in cv_scores])
print("CV Score mean: {:.4f} ".format(np.mean(cv_scores)))
print("CV Score range: {:0.4f} -- {:0.4f}".format(np.mean(cv_scores) - np.std(cv_scores), np.mean(cv_scores) + np.std(cv_scores)))

#v2
# CV Scores:
# ['0.8018', '0.7980', '0.7984', '0.7993', '0.7947']
# CV Score mean: 0.7984 
# CV Score range: 0.7962 -- 0.8007

CV Scores:
['0.7979', '0.7911', '0.7975', '0.7989', '0.7955']
CV Score mean: 0.7962 
CV Score range: 0.7934 -- 0.7989


In [27]:
#########rf

rf_clf_v1 = RandomForestClassifier(n_estimators=400, max_depth=None,
                                    min_samples_split = 4,
                                    max_features = 'auto',
                                    max_leaf_nodes= None,
                                    class_weight = 'balanced',
                                    random_state = 42)
rf_clf_v1.fit(train_full_x_train, train_full_y_train)



rf_pred_v1 = rf_clf_v1.predict(train_full_x_test)
class_names = [str(x) for x in rf_clf_v1.classes_]



print(classification_report(train_full_y_test, rf_pred_v1, target_names = class_names))

                         precision    recall  f1-score   support

             functional       0.81      0.88      0.84      6457
functional needs repair       0.50      0.37      0.42       851
         non functional       0.84      0.77      0.80      4572

               accuracy                           0.80     11880
              macro avg       0.71      0.67      0.69     11880
           weighted avg       0.80      0.80      0.80     11880



# Create test features

In [26]:
test_val.loc[test_val.payment_type == 'other', 'payment_type'] = 'unknown'
Ord_enc = Ord_enc.fit(test_val[['payment_type']])
test_val['payment_type_ord'] = Ord_enc.transform(test_val[['payment_type']])

test_val['quality_ind'] = np.where(test_val['quality_group'] == 'good',1,0)

test_val['construction_cat'] = test_val.apply(construction, axis=1)
Ord_enc_2 = Ord_enc_2.fit(test_val[['construction_cat']])
test_val['construction_cat'] = Ord_enc_2.transform(test_val[['construction_cat']])

test_val['permit_imp'] = imp.fit_transform(test_val[['permit']]).ravel()
test_val['permit_imp_ind'] = np.where(test_val['permit'].isna(),1,0)

test_val.loc[test_val.longitude == 0, 'longitude'] = np.nan
test_val.longitude.fillna(test_val.groupby(['subvillage'])['longitude'].transform('median'), inplace = True)
test_val.longitude.fillna(test_val.groupby(['ward'])['longitude'].transform('median'), inplace = True)
test_val.longitude.fillna(test_val.groupby(['lga'])['longitude'].transform('median'), inplace = True)

test_val_latitude = test_val[test_val['latitude'].round(0) == -0]
test_val.loc[test_val.latitude.round(0) == -0.0, 'latitude'] = np.nan
test_val.latitude.fillna(test_val.groupby(['subvillage'])['latitude'].transform('median'), inplace = True)
test_val.latitude.fillna(test_val.groupby(['ward'])['latitude'].transform('median'), inplace = True)
test_val.latitude.fillna(test_val.groupby(['lga'])['latitude'].transform('median'), inplace = True)


In [23]:
test_val.loc[test_val.gps_height == 0, 'gps_height'] = np.nan
test_val.gps_height.fillna(test_val.groupby(['subvillage'])['gps_height'].transform('mean'), inplace = True)
test_val.gps_height.fillna(test_val.groupby(['ward'])['gps_height'].transform('mean'), inplace = True)
test_val.gps_height.fillna(test_val.groupby(['lga'])['gps_height'].transform('mean'), inplace = True)
test_val.gps_height.fillna(test_val.groupby(['region_code'])['gps_height'].transform('mean'), inplace=True)

test_val['popu_ind'] = np.where(test_val['population'] == 0,1,0)
test_val.loc[test_val.population == 0, 'population'] = np.nan
test_val.population.fillna(test_val.groupby(['subvillage'])['population'].transform('median'), inplace = True)
test_val.population.fillna(test_val.groupby(['ward'])['population'].transform('median'), inplace = True)
test_val.population.fillna(test_val.groupby(['lga'])['population'].transform('median'), inplace = True)
test_val.population.fillna(test_val.groupby(['region_code'])['population'].transform('median'), inplace=True)

scaler.fit(test_val[['population']])
test_val['popu_Std'] = scaler.transform(test_val[['population']])
scaler_log.fit(test_val[['popu_Std']])
test_val['popu_StdLog'] = scaler_log.transform(test_val[['popu_Std']])

test_val= fix_typo(test_val)
test_val.installer.fillna('other', inplace=True)
test_val.loc[test_val.installer.str.len() == 1, 'installer'] = 'other'
Ord_enc_installer = Ord_enc_installer.fit(test_val[['installer']])
test_val['installer_ord'] = Ord_enc_installer.transform(test_val[['installer']])

### funder
test_val['funder'].replace(to_replace = '0', value ='None' , inplace=True)
test_val['funder'].replace(to_replace = 'Unknown', value ='None' , inplace=True)
columns_to_keep = ['Government Of Tanzania','Unknown','Danida','Hesawa','Rwssp','World Bank','Kkkt', 'World Vision',
                    'Unicef','Tasaf','District Council', 'Dhv', 'Private Individual', 'Dwsp','Norad','Germany Republi',
                    'Tcrs','Ministry Of Water','Water','Dwe']
test_val.loc[~test_val["funder"].isin(columns_to_keep), "funder"] = "Others"
test_val['funder'] = test_val['funder'].astype("category").cat.remove_unused_categories()
Ord_enc_funder = Ord_enc_funder.fit(test_val[['funder']])
test_val['funder_ord'] = Ord_enc_funder.transform(test_val[['funder']])

###### scheme_management
test_val.loc[test_val.scheme_management == 'None', 'scheme_management'] = 'Other'
imp2 = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
test_val['scheme_management'] = imp2.fit_transform(test_val[['scheme_management']]).ravel()
Ord_enc_3 = Ord_enc.fit(test_val[['scheme_management']])
test_val['scheme_management_ord'] = Ord_enc_3.transform(test_val[['scheme_management']])


test_val['public_meeting'].astype('float')
test_val.loc[test_val.public_meeting.isnull(), 'public_meeting'] = 2
test_val['public_meeting']= test_val['public_meeting'].astype('float64')

In [29]:
test_set_pre = test_val[['quality_ind',
                           'construction_cat',
                           'permit_imp',
                           'permit_imp_ind',
                           'id',
                           'payment_type_ord',
                           'longitude',
                           'latitude',
                           'gps_height',
                           'population',
                           'popu_ind',                 
                           'popu_Std',               
                            'popu_StdLog',           
                            'installer_ord',           
                            'funder_ord',              
                            'scheme_management_ord', 
                           'public_meeting']]

test_set_ord = test_prep.drop(['construction_year',
                                 'permit',
                                 'amount_tsh',
                                 'gps_height',
                                 'longitude',
                                 'latitude',
                                 'population',
                                 'funder',
                                 'installer',
                                 'wpt_name',
                                 'num_private',
                                 'date_recorded','recorded_by',
                                 'subvillage',
                                 'region_code','district_code',
                                 'extraction_type','extraction_type_group',
                                 'scheme_management','scheme_name',
                                 'public_meeting',
                                 'management',
                                 'water_quality',
                                 'payment_type','payment',
                                 'quantity_group',
                                 'source_type','source_class',
                                 'waterpoint_type_group'
                              ],axis=1)

test_full = test_set_pre.merge(test_set_ord, on='id',how='left')
test_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14850 entries, 0 to 14849
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   quality_ind            14850 non-null  int64  
 1   construction_cat       14850 non-null  float64
 2   permit_imp             14850 non-null  float64
 3   permit_imp_ind         14850 non-null  int64  
 4   id                     14850 non-null  int64  
 5   payment_type_ord       14850 non-null  float64
 6   longitude              14850 non-null  float64
 7   latitude               14850 non-null  float64
 8   gps_height             14850 non-null  float64
 9   population             14850 non-null  float64
 10  popu_ind               14850 non-null  int64  
 11  popu_Std               14850 non-null  float64
 12  popu_StdLog            14850 non-null  float64
 13  installer_ord          14850 non-null  float64
 14  funder_ord             14850 non-null  float64
 15  sc

In [30]:
test_full['status_group'] = ''
full_set = pd.concat([train_full,test_full])

full_prep_train = full_set[full_set.status_group != '']
full_prep_test = full_set[full_set.status_group == ''] 

full_prep_train_x = full_prep_train.drop(['status_group'], axis=1)
full_prep_train_y = full_prep_train['status_group']
full_prep_test_x = full_prep_test.drop(['status_group'], axis=1)

In [42]:
estimator_v2 = estimator_v2.predict(full_prep_test_x )
pred_v1 = pd.DataFrame(estimator_v2,columns = ['status_group'])

pred_v1['id'] = test_val['id']
pred_v1 = pred_v1[['id','status_group']]
print(pred_v1.groupby('status_group').size())

# status_group
# functional                 10359
# functional needs repair      171
# non functional              4320
# dtype: int64

status_group
functional                 9916
functional needs repair     183
non functional             4751
dtype: int64


In [32]:
rf_clf_v7 = RandomForestClassifier(n_estimators=800, max_depth=None, 
                                   min_samples_split = 4, 
                                   max_features = 'auto',
                                   max_leaf_nodes= None,
                                   class_weight = 'balanced',
                                   random_state = 42)

rf_clf_v7.fit(full_prep_train_x, full_prep_train_y)
rf_clf_v7 = rf_clf_v7.predict(full_prep_test_x)

#n_estimators=800: 0.8061
#n_estimators=400: 0.8054


In [33]:
pred_v7 = pd.DataFrame(rf_clf_v7,columns = ['status_group'])

pred_v7['id'] = test_val['id']
pred_v7 = pred_v7[['id','status_group']]
print(pred_v7.groupby('status_group').size())

status_group
functional                 9180
functional needs repair     460
non functional             5210
dtype: int64


In [44]:
pred_v1.to_csv("/Users/jiamin/Desktop/output_v2nov20_xgb.csv",index=False)