In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
values=pd.read_csv('training_values.csv')
labels=pd.read_csv('training_labels.csv')
df = values.merge(labels, on='id')

In [3]:
df['gps_height'].replace(0.0, np.nan, inplace=True)
df['population'].replace(0.0, np.nan, inplace=True)
df['amount_tsh'].replace(0.0, np.nan, inplace=True)
df['longitude'].replace(0.0, np.nan, inplace=True)
df['latitude'].replace(0.0, np.nan, inplace=True)
df['construction_year'].replace(0.0, np.nan, inplace=True)

In [4]:
import datetime
current_year = datetime.datetime.now().year
df['waterpoint_age'] = current_year - df['construction_year']
current_date = datetime.datetime.now()
df['date_recorded'] = pd.to_datetime(df['date_recorded'])
df['days_since_recorded'] = (current_date - df['date_recorded']).dt.days
df.drop('date_recorded',axis=1,inplace=True)

In [5]:
# fill na values in true / false columns as false
df['permit'] = df['permit'].fillna(False)
df['public_meeting'] = df['public_meeting'].fillna(False)

In [6]:
# turn all string values lowercase for consistency 
string_columns = df.select_dtypes(include='object').columns
df[string_columns] = df[string_columns].apply(lambda x: x.str.lower())
df["funder"].fillna("other", inplace=True)
df["subvillage"].fillna("other", inplace=True)
df["wpt_name"].fillna("other", inplace=True)
df["scheme_management"].fillna("other", inplace=True)
df['installer'].fillna("other", inplace=True)
df["scheme_name"].fillna("other", inplace=True)

In [7]:
# drop uneccesary features (some have same info as other columns)
df.drop('management_group',axis=1,inplace=True)
df.drop('extraction_type_group',axis=1,inplace=True)
df.drop('scheme_name',axis=1,inplace=True)
df.drop('payment',axis=1,inplace=True)
df.drop('quality_group',axis=1,inplace=True)
df.drop('quantity_group',axis=1,inplace=True)
df.drop('source_type',axis=1,inplace=True)
df.drop('waterpoint_type_group',axis=1,inplace=True)
df.drop('ward',axis=1,inplace=True)
df.drop('installer',axis=1,inplace=True)
df.drop('permit', axis=1, inplace=True)
df.drop('construction_year', axis=1, inplace=True)

In [8]:
train = df.copy()
label = train.pop('status_group')

In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in train.columns:
    if train[column_name].dtype == object:
        train[column_name] = le.fit_transform(train[column_name])
    else:
        pass

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.3, random_state=42)

In [11]:
X_train["gps_height"].fillna(X_train.groupby(['region'])["gps_height"].transform("mean"), inplace=True)
X_train["gps_height"].fillna(X_train["gps_height"].mean(), inplace=True)

X_train["population"].fillna(X_train.groupby(['region'])["population"].transform("mean"), inplace=True)
X_train["population"].fillna(X_train["population"].mean(), inplace=True)

X_train["amount_tsh"].fillna(X_train.groupby(['region'])["amount_tsh"].transform("mean"), inplace=True)
X_train["amount_tsh"].fillna(X_train["amount_tsh"].mean(), inplace=True)


X_train["waterpoint_age"].fillna(X_train.groupby(['region'])["waterpoint_age"].transform("mean"), inplace=True)
X_train["waterpoint_age"].fillna(X_train["waterpoint_age"].mean(), inplace=True)

X_train["longitude"].fillna(X_train.groupby(['region'])["longitude"].transform("mean"), inplace=True)

In [12]:
X_test["gps_height"].fillna(X_test.groupby(['region'])["gps_height"].transform("mean"), inplace=True)
X_test["gps_height"].fillna(X_test["gps_height"].mean(), inplace=True)

X_test["population"].fillna(X_test.groupby(['region'])["population"].transform("mean"), inplace=True)
X_test["population"].fillna(X_test["population"].mean(), inplace=True)

X_test["amount_tsh"].fillna(X_test.groupby(['region'])["amount_tsh"].transform("mean"), inplace=True)
X_test["amount_tsh"].fillna(X_test["amount_tsh"].mean(), inplace=True)


X_test["waterpoint_age"].fillna(X_test.groupby(['region'])["waterpoint_age"].transform("mean"), inplace=True)
X_test["waterpoint_age"].fillna(X_test["waterpoint_age"].mean(), inplace=True)

X_test["longitude"].fillna(X_test.groupby(['region'])["longitude"].transform("mean"), inplace=True)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV 
from sklearn.metrics import accuracy_score


In [24]:
model_rfc = RandomForestClassifier()

param_grid = { 
    'n_estimators': [50, 250, 500], 
    'max_features': ['sqrt', 'log2', None], 
    'n_jobs': [-1] , 
} 

In [None]:
grid_search = GridSearchCV(RandomForestClassifier(), 
                           param_grid=param_grid) 
grid_search.fit(X_train, y_train) 
print(grid_search.best_estimator_) 

In [None]:
results = pd.DataFrame(grid_search.cv_results_)
columns = ['param_n_estimators', 'param_max_features', 'mean_test_score', 'std_test_score']
results_sorted = results[columns].sort_values(by='mean_test_score', ascending=False)
results_sorted.head(10)




In [15]:
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators=1000,max_features='log2', n_jobs = -1)
scores = cross_val_score(rf, X_train, y_train, cv=5)
# Print the cross-validation scores
print("Cross-validation scores:", scores)

# Calculate and print the mean score
print("Mean accuracy:", scores.mean())

Cross-validation scores: [0.80627706 0.80519481 0.80784031 0.80796056 0.80459355]
Mean accuracy: 0.8063732563732564


In [23]:
from sklearn.metrics import accuracy_score
best = RandomForestClassifier(n_estimators=250,max_features='log2', n_jobs = -1)
best.fit(X_train, y_train)
best_y_pred = best.predict(X_test)
accuracy = accuracy_score(y_test, best_y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8117845117845118


In [22]:
grid = {'max_depth': [3,4,5,6],'n_estimators':[100, 200, 300, 400, 500, 1000], 'verbose':[False]}
gscv = GridSearchCV (estimator = catboost, param_grid = grid, scoring ='accuracy', cv = 5)
gscv.fit(X_train, y_train)
#returns the estimator with the best performance
print(gscv.best_estimator_)

#returns the best score
print(gscv.best_score_)

#returns the best parameters
print(gscv.best_params_)

<catboost.core.CatBoostClassifier object at 0x00000202003C5640>
0.7894901394901395
{'max_depth': 6, 'n_estimators': 1000, 'verbose': False}


In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=500).fit(X_train, y_train)
y_test_mlp = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_test_mlp)
print("Accuracy:", accuracy)

In [18]:
from catboost import CatBoostClassifier
catboost = CatBoostClassifier()
catboost.fit(X_train, y_train,verbose=False)
y_pred_cb = catboost.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_cb)
print("Accuracy:", accuracy)

Accuracy: 0.7930415263748597


In [38]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators = 1000)
gb_model = gb.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_gb)
print("Accuracy:", accuracy)


Accuracy: 0.7883838383838384


In [32]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.fit_transform(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=20)
pca_X_train = pca.fit_transform(scaled_X_train)
pca_X_test = pca.fit_transform(scaled_X_test)

In [33]:
# PCA TESTING
model_rfc_pca = RandomForestClassifier(n_estimators=250, n_jobs = -1)
model_rfc_pca.fit(pca_X_train,y_train)
y_pred = model_rfc_pca.predict(pca_X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.44753086419753085


In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41580 entries, 43069 to 56422
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     41580 non-null  int64  
 1   amount_tsh             41580 non-null  float64
 2   funder                 41580 non-null  int32  
 3   gps_height             41580 non-null  float64
 4   longitude              41580 non-null  float64
 5   latitude               41580 non-null  float64
 6   wpt_name               41580 non-null  int32  
 7   num_private            41580 non-null  int64  
 8   basin                  41580 non-null  int32  
 9   subvillage             41580 non-null  int32  
 10  region                 41580 non-null  int32  
 11  region_code            41580 non-null  int64  
 12  district_code          41580 non-null  int64  
 13  lga                    41580 non-null  int32  
 14  population             41580 non-null  float64
 15

In [37]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred_gnb = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_gnb)
print("Accuracy:", accuracy)

Accuracy: 0.5299102132435466


In [36]:
catboost_pca = CatBoostClassifier()
catboost_pca.fit(pca_X_train, y_train,verbose=False)
y_pred_cb = catboost_pca.predict(pca_X_test)
accuracy = accuracy_score(y_test, y_pred_cb)
print("Accuracy:", accuracy)

Accuracy: 0.4628507295173962


In [39]:
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(train, label, test_size=0.3, random_state=42)

In [42]:
X_train_m["gps_height"].fillna(X_train_m.groupby(['region'])["gps_height"].transform("median"), inplace=True)
X_train_m["gps_height"].fillna(X_train_m["gps_height"].mean(), inplace=True)

X_train_m["population"].fillna(X_train_m.groupby(['region'])["population"].transform("median"), inplace=True)
X_train_m["population"].fillna(X_train_m["population"].median(), inplace=True)

X_train_m["amount_tsh"].fillna(X_train_m.groupby(['region'])["amount_tsh"].transform("median"), inplace=True)
X_train_m["amount_tsh"].fillna(X_train_m["amount_tsh"].median(), inplace=True)


X_train_m["waterpoint_age"].fillna(X_train_m.groupby(['region'])["waterpoint_age"].transform("median"), inplace=True)
X_train_m["waterpoint_age"].fillna(X_train_m["waterpoint_age"].median(), inplace=True)

X_train_m["longitude"].fillna(X_train_m.groupby(['region'])["longitude"].transform("median"), inplace=True)

In [44]:
X_test_m["gps_height"].fillna(X_test_m.groupby(['region'])["gps_height"].transform("median"), inplace=True)
X_test_m["gps_height"].fillna(X_test_m["gps_height"].median(), inplace=True)

X_test_m["population"].fillna(X_test_m.groupby(['region'])["population"].transform("median"), inplace=True)
X_test_m["population"].fillna(X_test_m["population"].median(), inplace=True)

X_test_m["amount_tsh"].fillna(X_test_m.groupby(['region'])["amount_tsh"].transform("median"), inplace=True)
X_test_m["amount_tsh"].fillna(X_test_m["amount_tsh"].median(), inplace=True)


X_test_m["waterpoint_age"].fillna(X_test_m.groupby(['region'])["waterpoint_age"].transform("median"), inplace=True)
X_test_m["waterpoint_age"].fillna(X_test_m["waterpoint_age"].median(), inplace=True)

X_test_m["longitude"].fillna(X_test_m.groupby(['region'])["longitude"].transform("median"), inplace=True)

In [46]:
rf_median = RandomForestClassifier(n_estimators=250,max_features='log2', n_jobs = -1)
rf_median.fit(X_train_m,y_train_m)
y_pred_median = rf_median.predict(X_test_m)
accuracy = accuracy_score(y_test_m, y_pred_median)
print("Accuracy:", accuracy)

Accuracy: 0.8109988776655443


In [47]:
rf = RandomForestClassifier(n_estimators=250,max_features='log2', n_jobs = -1)
rf.fit(X_train,y_train)
y_pred = rf_median.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_median)
print("Accuracy:", accuracy)

Accuracy: 0.8109988776655443
