In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

+## Merging and Cleaning

In [70]:
train_vals = pd.read_csv(r"MMA 869/Training features.csv")

In [None]:
train_vals.shape

In [None]:
train_labels = pd.read_csv(r"MMA 869/Training labels.csv")

In [None]:
train_data = pd.merge(train_vals, train_labels, on='id', how='left')

In [None]:
test_data = pd.read_csv(r"MMA 869/Test set values.csv")

In [None]:
def display_dataframe_without_trim():
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 100)
    
display_dataframe_without_trim()

In [None]:
train_data.head(50)

In [None]:
train_data.shape

In [None]:
test_data.shape

### Missing values

In [None]:
train_data.isna().sum()

In [None]:
# columns with NAs are funder, installer, subvillage, public_meeting, scheme_management, scheme_name, permit

In [None]:
# dropping columns based on intuition

In [None]:
train_data.recorded_by.unique()

In [None]:
# How many waterpoints are permitted
train_data.permit.value_counts()

In [None]:
'''
drop these columns: wpt_name, region, recorded_by,extraction_type_group,extraction_type_class,payment,quality_group,
quantity_group,source_type,water_point_type_group
'''
train_data.drop(columns=['wpt_name','region','recorded_by','extraction_type_group','extraction_type_class',
                        'payment','quality_group','quantity_group','source_type','waterpoint_type_group'], inplace=True)

In [None]:
test_data.drop(columns=['wpt_name','region','recorded_by','extraction_type_group','extraction_type_class',
                        'payment','quality_group','quantity_group','source_type','waterpoint_type_group'], inplace=True)

In [None]:
train_data.shape

In [None]:
train_data.head()

In [None]:
train_data.source.nunique()

In [None]:
train_data.funder.value_counts()

In [None]:
# handling NAs for funder
df = train_data.funder.value_counts().to_frame().reset_index()

# replace with mode
train_data.loc[train_data.funder.isna(), 'funder'] = train_data.funder.mode().values[0]

In [None]:
train_data.installer.value_counts()

In [None]:
train_data.installer.isna().sum()

In [None]:
# Installer NAs
df = train_data.installer.value_counts().to_frame().reset_index()
# few = df.loc[df.installer < 5, 'index']

# replace with mode
train_data.loc[train_data.installer.isna(), 'installer'] = train_data.installer.mode().values[0]

In [None]:
# subvillage NAs - replace with mode
train_data.loc[train_data.subvillage.isna(), 'subvillage'] = train_data.subvillage.mode().values[0]

In [None]:
# public meeting NAs
train_data.public_meeting.value_counts()

In [None]:
# replace with mode
train_data.loc[train_data.public_meeting.isna(), 'public_meeting'] = train_data.public_meeting.mode().values[0]

In [None]:
# scheme_management NAs
train_data.scheme_management.value_counts()

In [None]:
# replace with mode
train_data.loc[train_data.scheme_management.isna(), 
               'scheme_management'] = train_data.scheme_management.mode().values[0]
# correct 'None' record
train_data.loc[train_data.scheme_management == 'None', 'scheme_management'] = 'Other'

In [None]:
# scheme_name NAs
train_data.scheme_name.nunique()

In [None]:
train_data.scheme_name.isna().sum()

In [None]:
def impute_scheme_name(row):
    if row.scheme_name == 0:
        ward = row.ward
        df = train_data[train_data.ward == ward]
        try:
            m = df.scheme_name.unique()[1]
            return(m)
        except:
            return (train_data.loc[train_data.scheme_name != 0, 'scheme_name'].mode().values[0])
    else:
        return (row.scheme_name)

In [None]:
train_data.loc[train_data.scheme_name.isna(), 'scheme_name'] = train_data.scheme_name.mode().values[0]

In [None]:
# Permit NAs
train_data.permit.value_counts()

In [None]:
# replace with mode
train_data.loc[train_data.permit.isna(), 'permit'] = train_data.permit.mode().values[0]

In [None]:
train_data.shape

#### Missing values - Test Data

In [None]:
test_data.isna().sum()

In [None]:
# handle funder, installer, public_meeting,scheme_management, permit cols

In [None]:
# handling NAs for funder
# replace with mode
test_data.loc[test_data.funder.isna(), 'funder'] = test_data.funder.mode().values[0]

In [None]:
# Installer NAs
# replace with mode
test_data.loc[test_data.installer.isna(), 'installer'] = test_data.installer.mode().values[0]

In [None]:
# replace with mode
test_data.loc[test_data.subvillage.isna(), 'subvillage'] = test_data.subvillage.mode().values[0]

In [None]:
# public meeting NAs
test_data.public_meeting.value_counts()

In [None]:
# replace with mode
test_data.loc[test_data.public_meeting.isna(), 'public_meeting'] = test_data.public_meeting.mode().values[0]

In [None]:
# scheme_name NAs
# replace with mode
test_data.loc[test_data.scheme_name.isna(), 'scheme_name'] = test_data.scheme_name.mode().values[0]

In [None]:
test_data.isna().sum()

In [None]:
# scheme_management NAs
test_data.scheme_management.value_counts()

In [None]:
# replace with mode
test_data.loc[test_data.scheme_management.isna(), 
               'scheme_management'] = test_data.scheme_management.mode().values[0]

In [None]:
# replace with mode
test_data.loc[test_data.permit.isna(), 'permit'] = test_data.permit.mode().values[0]

### More cleaning and EDA

In [None]:
train_data.head(50)

In [None]:
# amount_tsh column
# how many records are zeros?
train_data[train_data.amount_tsh==0.00].shape[0]

In [None]:
# date recorded -- ensure it's date format
train_data.date_recorded = pd.to_datetime(train_data.date_recorded)
test_data.date_recorded = pd.to_datetime(test_data.date_recorded)

In [None]:
# gps_height column
# how many records are zeros or neg?
train_data[train_data.gps_height<1].shape[0]

In [None]:
def impute_gps_height(row):
    if row.gps_height <= 0:
        basin = row.basin
        return (train_data.loc[(train_data.basin==basin)&(train_data.gps_height >0), 'gps_height'].median())
    else:
        return(row.gps_height)

In [None]:
# use median imputation
train_data.loc[train_data.gps_height < 1, 'gps_height'] = train_data.loc[train_data.gps_height > 0,
                                                                         'gps_height'].median()

test_data.loc[test_data.gps_height < 1, 'gps_height'] = test_data.loc[test_data.gps_height > 0,
                                                                         'gps_height'].median()

In [None]:
# installer column
x = train_data.installer.value_counts().to_frame().reset_index()

In [None]:
top = x['index'][:8]

In [None]:
# test data
x = test_data.installer.value_counts().to_frame().reset_index()
top = x['index'][:8]

In [None]:
# num_private column
# how many records are zeros or neg?
train_data[train_data.num_private < 1].shape[0]

In [None]:
# drop the column
train_data.drop(columns=['num_private'], inplace=True)
test_data.drop(columns=['num_private'], inplace=True)

In [None]:
# basin column
train_data.basin.nunique()

In [None]:
train_data.basin.value_counts()

In [None]:
test_data.basin.value_counts()

In [None]:
# lga column
train_data.lga.nunique()

In [None]:
# ward column
train_data.ward.nunique()

In [None]:
# population column
# how many records are zero?
train_data[train_data.population == 0].shape[0]

In [None]:
# mean imputation
train_data.loc[train_data.population == 0, 'population'] = train_data.loc[train_data.population != 0, 
                                                                          'population'].mean()
test_data.loc[test_data.population == 0, 'population'] = test_data.loc[test_data.population != 0, 
                                                                          'population'].mean()

In [None]:
def pop_imputer(row):
    if row.population <= 0:
        row_district = row.district_code
        impute = train_data.loc[train_data.district_code == row_district, 'population'].mean()
        if impute == 0:
            return (train_data.loc[train_data.population != 0, 'population'].median())
        else:
            return (impute)
    else:
        return (row.population)

In [None]:
# scheme_management column
train_data.scheme_management.value_counts()

In [None]:
test_data.scheme_management.value_counts()

In [None]:
# construction_year column
# how many zeros?
train_data.loc[train_data.construction_year == 0, 'construction_year'].shape[0]

In [None]:
# use mean imputation
train_data.loc[train_data.construction_year == 0, 'construction_year'] = int(train_data.loc[train_data.construction_year != 0, 
                                                                          'construction_year'].mean())

test_data.loc[test_data.construction_year == 0, 'construction_year'] = int(test_data.loc[test_data.construction_year != 0, 
                                                                          'construction_year'].mean())

In [None]:
# extraction_type column
x = train_data.extraction_type.value_counts().to_frame().reset_index()

In [None]:
top = x['index'][:9]

In [None]:
# management column
x = train_data.management.value_counts().to_frame().reset_index()
top = x['index'][:6]

In [None]:
# management_group column
train_data.management_group.value_counts()

In [None]:
# drop this column, management is informative enough
train_data.drop(columns=['management_group'], inplace=True)
test_data.drop(columns=['management_group'], inplace=True)

In [None]:
# payment_type column
train_data.payment_type.value_counts()

In [None]:
# water quality column
train_data.water_quality.value_counts()

In [None]:
# quantity column
train_data.quantity.value_counts()

In [None]:
# source column
train_data.source.value_counts()

In [None]:
# source class column
train_data.source_class.value_counts()

In [None]:
# waterpoint_type column
train_data.waterpoint_type.value_counts()

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
train_data.head()

### Feature engineering

In [None]:
# convert date_recorded to record_age
today = dt.datetime.today()
today

In [None]:
train_data['record_age'] = train_data['date_recorded'].apply(lambda row: (today -  row).days)
test_data['record_age'] = test_data['date_recorded'].apply(lambda row: (today -  row).days)

In [None]:
# we can drop date_recorded
train_data.drop(columns = ['date_recorded'], inplace=True)
test_data.drop(columns = ['date_recorded'], inplace=True)

In [None]:
# train_data.funder.value_counts()

In [None]:
# permit and public_meeting booleans converted to strings
train_data.permit = train_data.permit.astype('category')
train_data.public_meeting = train_data.public_meeting.astype('category')

test_data.permit = test_data.permit.astype('category')
test_data.public_meeting = test_data.public_meeting.astype('category')

In [None]:
x_train = train_data.loc[:, ~train_data.columns.isin(['status_group'])]
y_train = train_data.status_group

<class 'pandas.core.series.Series'>
RangeIndex: 59400 entries, 0 to 59399
Series name: c_na
Non-Null Count  Dtype
--------------  -----
59400 non-null  bool 
dtypes: bool(1)
memory usage: 58.1 KB


In [None]:
# add amount_tsh per pop as new feature
x_train['tsh_per_head'] = x_train['amount_tsh'] / x_train['population']
test_data['tsh_per_head'] = test_data['amount_tsh'] / test_data['population']

In [None]:
x_train.to_csv(r"Train_data_final.csv", index=False)

In [None]:
test_data.to_csv(r"Test_data_final.csv", index=False)

In [78]:
x_train = pd.read_csv("Train_data_final_1.csv")

In [79]:
test_data = pd.read_csv("Test_data_final_1.csv")

In [80]:
y_train_2 = pd.read_csv('MMA 869/Training labels.csv')

In [81]:
y_train = y_train_2['status_group']

In [82]:
# categorical features
s = (x_train.dtypes == 'object')
cat_cols = list(s[s].index)

In [83]:
cat_cols

['funder',
 'installer',
 'basin',
 'subvillage',
 'lga',
 'ward',
 'scheme_management',
 'scheme_name',
 'extraction_type',
 'management',
 'payment_type',
 'water_quality',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type']

In [84]:
one_hot_cols = ['basin', 'scheme_management','extraction_type','management','payment_type','water_quality','quantity',
               'source','source_class','waterpoint_type']
cat_cols = list(set(cat_cols).difference(one_hot_cols))

In [62]:
# numerical features
s = (x_train.dtypes != 'object')
num_cols = list(s[s].index)

In [63]:
# remove id, longitude, latitude
to_remove = ['id','public_meeting', 'permit']
for j in to_remove:
    num_cols.remove(j)
num_cols

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'record_age',
 'tsh_per_head',
 'c_na']

In [64]:
x_train.quantity.value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity, dtype: int64

In [65]:
# quantity is ordinal, remove it from cat_cols
ord_cols = ['quantity']
one_hot_cols.remove('quantity')

In [66]:
# create mapping for quantity column
mapper = {'unknown':0, 'dry':1, 'seasonal':2, 'insufficient':3, 'enough':4}

In [67]:
x_train['quantity'] = x_train['quantity'].replace(mapper)
test_data['quantity'] = test_data['quantity'].replace(mapper)

In [68]:
# encode one_hot_cols with OnehotEncoder
from sklearn.preprocessing import OneHotEncoder
oneHotEnc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [69]:
fit = oneHotEnc.fit_transform(x_train[one_hot_cols])
x_train_oneH = pd.DataFrame(fit, columns=oneHotEnc.get_feature_names())
fit_ = oneHotEnc.transform(test_data[one_hot_cols])
test_data_oneH = pd.DataFrame(fit_, columns=oneHotEnc.get_feature_names())



In [70]:
# Trying Label encoding
from sklearn.preprocessing import LabelEncoder

In [71]:
encoder = LabelEncoder()

ValueError: invalid literal for int() with base 10: 'functional'

In [72]:
for col in cat_cols:
    encoder.fit(pd.concat([x_train[col], test_data[col]], axis=0, sort=False))
    x_train[col] = encoder.transform(x_train[col])
    test_data[col] = encoder.transform(test_data[col])

In [73]:
x_train_enc = x_train[cat_cols]
test_data_enc = test_data[cat_cols]

In [22]:
# Trying count encoding
import category_encoders as ce

In [None]:
# count_enc = ce.CountEncoder()
# x_train_enc = count_enc.fit_transform(x_train[cat_cols])
# x_train_enc.head()



In [74]:
x_train_enc.head()

Unnamed: 0,ward,scheme_name,funder,installer,subvillage,lga
0,1429,2388,1548,1706,13116,51
1,1581,644,522,610,17596,103
2,1629,2261,924,2296,10096,108
3,1576,644,1961,2078,9998,87
4,1692,644,20,133,8583,26


In [75]:
# scale numerical attributes
from sklearn.preprocessing import StandardScaler

In [76]:
scaler = StandardScaler()
num_cols = num_cols + ord_cols

In [77]:
num_cols, cat_cols

(['amount_tsh',
  'gps_height',
  'longitude',
  'latitude',
  'region_code',
  'district_code',
  'population',
  'construction_year',
  'record_age',
  'tsh_per_head',
  'c_na',
  'quantity'],
 ['ward', 'scheme_name', 'funder', 'installer', 'subvillage', 'lga'])

In [25]:
x_train_scaled = scaler.fit_transform(x_train[num_cols])
x_train_scaled = pd.DataFrame(x_train_scaled, columns=num_cols)
test_data_scaled = scaler.transform(test_data[num_cols])
test_data_scaled = pd.DataFrame(test_data_scaled, columns=num_cols)

x_train_enc_scaled = scaler.fit_transform(x_train_enc)
x_train_enc_scaled = pd.DataFrame(x_train_enc_scaled, columns=x_train_enc.columns)
test_data_enc_scaled = scaler.transform(test_data_enc)
test_data_enc_scaled = pd.DataFrame(test_data_enc_scaled, columns=test_data_enc.columns)

In [26]:
x_train_enc.head()

Unnamed: 0,ward,scheme_name,funder,installer,subvillage,lga
0,1429,2388,1548,1706,13116,51
1,1581,644,522,610,17596,103
2,1629,2261,924,2296,10096,108
3,1576,644,1961,2078,9998,87
4,1692,644,20,133,8583,26


In [27]:
x_train_trans = pd.concat([x_train_scaled, x_train_enc_scaled, x_train_oneH], axis=1)
test_data_trans = pd.concat([test_data_scaled, test_data_enc_scaled, test_data_oneH], axis=1)

In [28]:
x_train_trans.shape

(59400, 103)

In [29]:
test_data_trans.shape

(14850, 103)

In [30]:
x_train_trans.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,record_age,tsh_per_head,...,x7_groundwater,x7_surface,x7_unknown,x8_cattle trough,x8_communal standpipe,x8_communal standpipe multiple,x8_dam,x8_hand pump,x8_improved spring,x8_other
0,1.895665,0.594894,0.131052,-1.408791,-0.244325,-0.06537,-0.3809244,0.24514,1.141136,0.037539,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.10597,0.613995,0.09461,1.207934,0.267409,-0.376781,-0.002406504,1.337149,-1.022152,-0.091202,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.09763,-0.899219,0.515158,0.639751,0.324269,-0.169174,-0.06881315,1.237876,-0.995223,-0.090968,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.10597,-1.79696,0.671308,-1.84972,4.247564,5.955245,-0.4938157,-1.045415,-0.911444,-0.091202,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.10597,0.178919,-0.448669,1.317271,0.153691,-0.480585,-6.070124e-11,-0.05268,0.779092,-0.091202,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [31]:
test_data_trans.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,region_code,district_code,population,construction_year,record_age,tsh_per_head,...,x7_groundwater,x7_surface,x7_unknown,x8_cattle trough,x8_communal standpipe,x8_communal standpipe multiple,x8_dam,x8_hand pump,x8_improved spring,x8_other
0,-0.10597,1.88102,0.184758,0.558839,0.324269,-0.272978,0.088349,1.535696,-0.932389,-0.091202,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.10597,0.974789,0.392742,0.813586,-0.756059,-0.376781,0.041865,0.344414,-0.932389,-0.091202,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.10597,0.970545,0.105131,0.238184,-0.130606,-0.376781,0.484576,1.337149,-0.923412,-0.091202,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.10597,-1.788471,0.60612,-1.260233,3.67897,3.879171,-0.068813,-0.946142,-0.893491,-0.091202,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.060833,0.318992,0.141411,-1.780173,-0.301184,-0.272978,-0.489389,0.344414,-1.084986,-0.071712,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Scheme_name doesn't add predictive power

In [33]:
x_train_trans.drop(columns=['scheme_name'], inplace=True)

In [34]:
test_data_trans.drop(columns=['scheme_name'], inplace=True)

### Feature selection

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

In [37]:
logistic = LogisticRegression(C=0.5, penalty="l1", random_state=42, solver='liblinear')

In [None]:
# log_fit = logistic.fit(x_train_trans, y_train)

In [None]:
# model = SelectFromModel(log_fit, prefit=True)

In [None]:
# x_new = model.transform(x_train_trans)

In [None]:
# Get back the kept features as a DataFrame with dropped columns as all 0s
# selected_features = pd.DataFrame(model.inverse_transform(x_new), index=x_train_trans.index, 
#                                  columns=x_train_trans.columns)

In [None]:
# Dropped columns have values of all 0s, keep other columns 
# selected_columns = selected_features.columns[selected_features.var() != 0]

In [None]:
# selected_columns.shape

## Model

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [97]:
from sklearn.ensemble import ExtraTreesClassifier

In [40]:
x_train_trans.to_csv(r'xtrainfinal.csv',index = 'id')

#### Random forest

In [37]:
forest = RandomForestClassifier(random_state=42, n_estimators=1250, min_samples_split=5,min_samples_leaf = 1,
                                oob_score=True, n_jobs=-1, max_features='sqrt', bootstrap=True,max_depth= 80)

In [98]:
Extra = ExtraTreesClassifier(random_state=42, n_estimators=1250, min_samples_split=5,min_samples_leaf = 1,
                                oob_score=True, n_jobs=-1, max_features='auto', bootstrap=True,max_depth= 80)

In [99]:
Extra.fit(x_train_trans, y_train)

  warn(


In [38]:
forest.fit(x_train_trans, y_train)

  warn(


In [39]:
forest.oob_score_

0.8194949494949495

In [100]:
Extra.oob_score_

0.8142087542087542

In [45]:
# cross-validation
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest, x_train_trans, y_train,
                                scoring="accuracy", cv=5)
forest_scores.mean()

  warn(
  warn(
  warn(
  warn(
  warn(


0.8147643097643098

In [135]:
# feature importance
for name, score in zip(x_train_trans[x_train_trans.columns], forest.feature_importances_):
    print(name, score)

amount_tsh 0.020129127041875843
gps_height 0.040797092885783315
longitude 0.0702450886815711
latitude 0.06780536299602344
region_code 0.01858444920814591
district_code 0.0189288295076374
population 0.02844416206352324
construction_year 0.04760570002015468
record_age 0.04552852606881736
tsh_per_head 0.02396466356108322
qty_per_head 0.06081977335940892
quantity 0.10908732630135785
ward 0.034559657213019526
funder 0.030959123931908434
subvillage 0.041042567403580955
lga 0.028239022392559706
installer 0.026584917497703257
x0_Internal 0.003337963831260443
x0_Lake Nyasa 0.003442902748606464
x0_Lake Rukwa 0.00292435953498927
x0_Lake Tanganyika 0.00333581878906718
x0_Lake Victoria 0.003016816052277962
x0_Pangani 0.0027028365266674507
x0_Rufiji 0.002374102559530653
x0_Ruvuma / Southern Coast 0.0021374529220785934
x0_Wami / Ruvu 0.002368023620064309
x1_Company 0.0012317036265858995
x1_Other 0.0005616165587115804
x1_Parastatal 0.001035155308595023
x1_Private operator 0.0007570942658583177
x1_SWC 

In [None]:
# Randomized search
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [106]:
def objective_dt(trial, X, y):

  # Now, define all the hyperparams we want to vary, and what values they are allowed
  # to take.
  #
  # Each trial, optuna will automatically choose values for each hyperparam.
  hyper_params = {
        
        "n_estimators":trial.suggest_int("n_estimators", 900, 1700, step=10),
        "max_depth": trial.suggest_int("max_depth", 50,100 , step=5),

        "min_samples_split": trial.suggest_int("min_samples_split", 5,10, step=5),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1,5, step=1),
        #"max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 1010, step=100),

        'class_weight': trial.suggest_categorical('class_weight', [None]),
         'oob_score' : True,
         
        "random_state": 42,
        "bootstrap": True,
      
  }
    # Use the hyperparams that optuna has chosen for this trial to create a DecisionTreeClassifier
  clf = ExtraTreesClassifier(**hyper_params)

  # Run CV to see how well these hyper_params do
  cv_scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [60]:
from sklearn.model_selection import cross_val_score

In [76]:
import optuna

In [107]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-07-23 11:07:55,882][0m A new study created in memory with name: no-name-2bdb8100-45ac-44d5-a09b-aea6845379a2[0m


In [None]:
study.optimize(lambda trial: objective_dt(trial, x_train_trans, y_train), n_trials=30,  gc_after_trial=True)

[32m[I 2022-07-23 11:12:59,558][0m Trial 0 finished with value: 0.7865488215488216 and parameters: {'n_estimators': 970, 'max_depth': 55, 'min_samples_split': 10, 'min_samples_leaf': 4, 'class_weight': None}. Best is trial 0 with value: 0.7865488215488216.[0m
[32m[I 2022-07-23 11:18:51,644][0m Trial 1 finished with value: 0.7955218855218855 and parameters: {'n_estimators': 940, 'max_depth': 70, 'min_samples_split': 10, 'min_samples_leaf': 2, 'class_weight': None}. Best is trial 1 with value: 0.7955218855218855.[0m
[32m[I 2022-07-23 11:28:15,925][0m Trial 2 finished with value: 0.7865488215488216 and parameters: {'n_estimators': 1620, 'max_depth': 75, 'min_samples_split': 10, 'min_samples_leaf': 4, 'class_weight': None}. Best is trial 1 with value: 0.7955218855218855.[0m
[32m[I 2022-07-23 11:35:35,123][0m Trial 3 finished with value: 0.7827946127946128 and parameters: {'n_estimators': 1290, 'max_depth': 95, 'min_samples_split': 5, 'min_samples_leaf': 5, 'class_weight': None}.

In [None]:
rf = RandomForestClassifier()

rf_search = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_search.fit(x_train_trans, y_train)
rf_search.best_params_S

#### Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
bagging = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=500, max_samples=0.7,
                           bootstrap=True, n_jobs=-1)
bagging.fit(x_train_trans, y_train)

In [None]:
bagging_scores = cross_val_score(bagging, x_train_trans, y_train,
                                scoring="accuracy", cv=5)
bagging_scores.mean()

#### Ensemble of RForest, Neural net and XGB

In [68]:
from sklearn.ensemble import VotingClassifier
forest = RandomForestClassifier(random_state=42, n_estimators=600, min_samples_split=10, oob_score=True,
                                n_jobs=-1, max_features='sqrt', bootstrap=True,max_depth=110)
neighbor = KNeighborsClassifier(n_neighbors=3)
lgb = LGBMClassifier()

voting_clf = VotingClassifier(estimators = [('rf',forest), ('gb',lgb), ('neigh', neighbor)],  voting='soft',weights=[3,1,1])

In [69]:
voting_clf.fit(x_train_trans, y_train)
scores = cross_val_score(voting_clf, x_train_trans, y_train,
                                scoring="accuracy", cv=5)
scores.mean()

0.8114309764309764

#### SVM classifier

In [None]:
svc.fit(x_train_trans, y_train)

In [None]:
# cross-validation
svc_scores = cross_val_score(svc, x_train_trans, y_train, scoring="accuracy", cv=2)
svc_scores

#### K-NN

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
neighbor = KNeighborsClassifier(n_neighbors=3)

In [None]:
neighbor.fit(x_train_trans, y_train)

In [None]:
# cross-validation
knn_scores = cross_val_score(neighbor, x_train_trans, y_train, scoring="accuracy", cv=3)
knn_scores

#### lgbm classifier

In [55]:
from lightgbm import LGBMClassifier

In [None]:
lgb = lightgbm.LGBMClassifier(random_state=42, n_estimators=600, min_samples_split=2, oob_score=True,
                                n_jobs=-1, max_features='sqrt', bootstrap=False,max_depth=60,minimum_sample_leaf = 4)

In [None]:
lgb.fit(x_train_trans, y_train)



In [None]:
# cross-validation
lgb_scores = cross_val_score(lgb, x_train_trans, y_train, scoring="accuracy", cv=5)
lgb_scores.mean()

In [None]:


lgb_search = RandomizedSearchCV(estimator = lgb, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

lgb_search.fit(x_train_trans, y_train)
lgb_search.best_params_

#### Neural net

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
net = MLPClassifier(random_state=42)

In [None]:
net.fit(x_train_trans, y_train)

In [None]:
# cross-validation
net_scores = cross_val_score(net, x_train_trans, y_train, scoring="accuracy", cv=3)
net_scores

### Test set predictions

In [40]:
pred_output = forest.predict(test_data_trans)

In [41]:
pred_output = pd.DataFrame(pred_output, columns=['status_group'])

In [42]:
output = pd.concat([test_data['id'], pred_output.loc[:,'status_group']], axis=1)

In [43]:
output.head()

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [44]:
output.to_csv(r"submission23.csv", index=False)