# Business Understanding

Tanzania is a developing country in eastern Africa. It is geographically diverse, with both mountainous terrain and flat plains. The country borders the Indian Ocean to the east and the Great Rift Valley to the west. A good portion of the country is below sea level, and much of it is 900 ft above sea level. Water is a difficult resource to manage in the country, and many organizations have installed water pumps in villages around the country in an effort to provide clean drinking water to the people. These pumps vary in how they extract water, in the quality of the water, and in which basin the water comes from. Pumps break down and require maintenance, which can be quite a task considering there are over 57,000 pumps in the country.

We have been asked by the Tanzanian Government to create a prediction model that will predict the condition of a water pump: functional or non-functional. This will allow them to strategically mobilize repair teams and to react efficiently when resources are needed in a particular area, such as supplying potable water to a village whose pump is non-functional and needs repair.

#### Our null hypothesis is that we are not able to predict whether a well is functional or not.

#### Our alternative hypothesis is that we can predict the condition of a well.

> A false positive would be to predict that a well is functional when it is actually non-functional.

> A false negative would be to predict that a well is non-functional when it is actually functional.

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import yellowbrick as yb
import folium
sns.set(style="whitegrid")
pd.set_option('display.max_columns',None)

from scipy import stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_selection import SelectFromModel
from time import time
pd.set_option("mode.chained_assignment", None)

In [None]:
# Load the feature table (V) and the label table (y) from the local Data/ folder.
V = pd.read_csv('Data/values.csv')
y = pd.read_csv('Data/labels.csv')

In [None]:
# Collapse the three status labels into a binary target:
#   0 = functional, 1 = non functional OR functional needs repair.
# NOTE(review): sklearn's precision/recall/F1 default to pos_label=1, so the
# "positive" class for the metrics below is the non-functional group. The
# introduction words false positives/negatives the other way round -- confirm
# which convention is intended.
y = y.replace({'status_group': {'functional' : 0, 'non functional' : 1, 'functional needs repair' : 1}})

In [None]:
# Class balance of the binarised target (displayed as the cell output).
y['status_group'].value_counts()

In [None]:
# Coordinates of every waterpoint joined (by row index) with the label table,
# plus the mean [latitude, longitude] used as the centre of the map.
df_lat_long = V[['latitude', 'longitude']].join(y)
map_center = [df_lat_long[axis].mean() for axis in ('latitude', 'longitude')]

In [None]:
# map1 = folium.Map(location = map_center, tiles='Openstreetmap', zoom_start = 5, control_scale=True, prefer_canvas=True)
# for index, loc in df_lat_long.iterrows():
#     if loc['status_group']==0:
#         color = 'green'
#     elif loc['status_group']==1:
#         color = "red"
#     elif loc['status_group']==2:
#         color = 'blue'
#     else:
#         color = 'black'
#     folium.CircleMarker([loc['latitude'], loc['longitude']], radius=2, weight=1, popup=loc['id'], fill_color=color, stroke=False).add_to(map1)
# folium.LayerControl().add_to(map1)

# map1

In [None]:
# Attach the label columns to the feature table by matching on the `id` column.
df_all = V.join(y.set_index('id'), on='id')

In [None]:
# Remove the record stored at row label 49651.
# The previous version passed that row's `id` *value* to `drop(labels=...)`,
# which pandas interprets as an index label: it removed whichever row happened
# to carry that number as its index (or raised KeyError when none did) instead
# of the intended record.
# NOTE(review): the reason this particular record is excluded is not recorded
# in the notebook -- confirm with the author.
df_all.drop(index=49651, inplace=True)

In [None]:
# Build the modelling frame by dropping the columns the author judged to have
# no influence on the target or to repeat information held in another column.
# Every column not listed here (including `status_group`) is kept.
df = df_all.drop(columns=['id', 'amount_tsh', 'date_recorded', 'funder',
       'longitude', 'latitude', 'wpt_name', 'num_private',
       'subvillage', 'region', 'region_code', 'district_code',
       'ward', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 
       'extraction_type_group', 'extraction_type_class',
       'management_group', 'payment', 'payment_type',
       'quality_group', 'quantity', 'quantity_group',
       'source_type', 'source_class',
       'waterpoint_type_group'])

In [None]:
# Count fully duplicated rows (True) versus unique rows (False).
df.duplicated().value_counts()

In [None]:
# df.drop_duplicates(inplace=True)

In [None]:
# Total number of missing cells across the whole frame.
df.isna().sum().sum()

In [None]:
# Discard every row that has at least one missing value (modifies df in place).
df.dropna(inplace=True)

In [None]:
# Pairwise scatter/histogram grid of df's columns (seaborn picks which columns
# to plot), rendered immediately.
sns.pairplot(df)
plt.show()

In [None]:
# Separate the predictors from the binary target column.
X = df.drop(columns='status_group')
y = df['status_group']

In [None]:
def process_scale (X, y):
    """One-hot encode, split, impute and standardise the feature matrix.

    Steps, in order:

    1. One-hot encode every ``object`` column of ``X`` (the encoder is fit on
       all rows, before the split).
    2. Keep the ``int64`` columns as numeric features and append the dummies.
    3. Hold out 10% of the rows as a test set (``random_state=42``).
    4. Replace ``construction_year == 0`` in both partitions with the median
       construction year of the *training* rows.
    5. Standardise the numeric columns with a single ``StandardScaler`` that
       is fit on the *training* rows and applied unchanged to the test rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw predictors. Must include ``construction_year`` among its
        ``int64`` columns.
    y : pandas.Series or array-like
        Target values aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_train_clean, X_test_clean, y_train, y_test)``; both feature
        frames hold the scaled numeric columns followed by the dummy columns.
    """
    X_cat = X.select_dtypes('object')
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
    dums = ohe.fit_transform(X_cat)
    dums_df = pd.DataFrame(dums, columns=ohe.get_feature_names(), index=X_cat.index)

    X_nums = X.select_dtypes('int64')
    # Remember the column groups by name: after imputation a numeric column may
    # be upcast to float, so dtype-based selection would no longer be reliable.
    num_cols = list(X_nums.columns)
    cat_cols = list(dums_df.columns)
    X = pd.concat([X_nums, dums_df], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)
    # Work on explicit copies so the assignments below never write through to X.
    X_train, X_test = X_train.copy(), X_test.copy()

    # A construction year of 0 is treated as "unknown". The fill value comes
    # from the training rows only, so nothing about the test set leaks into
    # preprocessing. Plain column assignment is used instead of a chained
    # `inplace=True` call on a column, which pandas does not guarantee will
    # update the parent frame.
    # NOTE(review): as before, the median is taken over all training values,
    # zeros included. Population zeros are left untouched (that imputation was
    # already disabled).
    year_median = X_train['construction_year'].median()
    X_train['construction_year'] = X_train['construction_year'].where(
        X_train['construction_year'] != 0, year_median)
    X_test['construction_year'] = X_test['construction_year'].where(
        X_test['construction_year'] != 0, year_median)

    # Fit the scaler once, on training data, and reuse it for the test data so
    # both partitions live in the same feature space.
    ss = StandardScaler()
    ss.fit(X_train[num_cols])

    # Keep the original column names so the result has uniformly string labels.
    nums_df = pd.DataFrame(ss.transform(X_train[num_cols]),
                      columns=num_cols, index=X_train.index)
    X_train_clean = pd.concat([nums_df, X_train[cat_cols]], axis=1)

    test_nums_df = pd.DataFrame(ss.transform(X_test[num_cols]),
                      columns=num_cols, index=X_test.index)
    X_test_clean = pd.concat([test_nums_df, X_test[cat_cols]], axis=1)

    return X_train_clean, X_test_clean, y_train, y_test


def process_no_scale (X, y):
    """One-hot encode, split and impute the feature matrix without scaling.

    Steps, in order:

    1. One-hot encode every ``object`` column of ``X`` (the encoder is fit on
       all rows, before the split).
    2. Keep the ``int64`` columns as numeric features and append the dummies.
    3. Hold out 10% of the rows as a test set (``random_state=42``).
    4. Replace ``construction_year == 0`` in both partitions with the median
       construction year of the *training* rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw predictors. Must include ``construction_year`` among its
        ``int64`` columns.
    y : pandas.Series or array-like
        Target values aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with unscaled numeric columns
        followed by the dummy columns.
    """
    X_cat = X.select_dtypes('object')
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
    dums = ohe.fit_transform(X_cat)
    dums_df = pd.DataFrame(dums, columns=ohe.get_feature_names(), index=X_cat.index)

    X_nums = X.select_dtypes('int64')
    X = pd.concat([X_nums, dums_df], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)
    # Work on explicit copies so the assignments below never write through to X.
    X_train, X_test = X_train.copy(), X_test.copy()

    # A construction year of 0 is treated as "unknown". The fill value comes
    # from the training rows only so the test set does not influence
    # preprocessing, and plain column assignment replaces the chained
    # `inplace=True` call, which pandas does not guarantee will update the
    # parent frame.
    # NOTE(review): as before, the median is taken over all training values,
    # zeros included. Population zeros are left untouched (that imputation was
    # already disabled).
    year_median = X_train['construction_year'].median()
    X_train['construction_year'] = X_train['construction_year'].where(
        X_train['construction_year'] != 0, year_median)
    X_test['construction_year'] = X_test['construction_year'].where(
        X_test['construction_year'] != 0, year_median)

    return X_train, X_test, y_train, y_test

In [None]:
# Scaled design matrices.
X_train, X_test, y_train, y_test = process_scale(X, y)

# Unscaled design matrices. Both helpers split with test_size=.1 and
# random_state=42 on the same X and y.
X_train_ns, X_test_ns, y_train_ns, y_test_ns = process_no_scale(X, y)

# Logistic Regression

In [None]:
# Class-weighted logistic regression without an intercept term, trained on the
# scaled features.
logreg = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    fit_intercept=False,
)
model_log = logreg.fit(X_train, y_train)

In [None]:
# Mean accuracy of the logistic regression on the training and held-out splits.
print(f"training accuracy: {model_log.score(X_train, y_train)}")
print(f"testing accuracy: {model_log.score(X_test, y_test)}")

In [None]:
# Hard class predictions for the held-out split.
y_hat_test = model_log.predict(X_test)

# Rows are true labels, columns are predicted labels (sklearn convention).
print(confusion_matrix(y_test, y_hat_test))

In [None]:
# Held-out metrics for the logistic regression.
precision = precision_score(y_test, y_hat_test)
accuracy = accuracy_score(y_test, y_hat_test)
recall = recall_score(y_test, y_hat_test)
F1 = f1_score(y_test, y_hat_test)

# One value per line, in the order: precision, accuracy, recall, F1.
print(precision, accuracy, recall, F1, sep='\n')

# K Nearest Neighbors

In [None]:
# Distance-weighted 30-nearest-neighbours classifier on the scaled features.
knn = KNeighborsClassifier(weights='distance', n_neighbors=30, leaf_size=30)
knn.fit(X_train, y_train)

In [None]:
# Mean accuracy of the KNN model on the training and held-out splits.
# NOTE(review): with weights='distance' each training point is presumably its
# own zero-distance neighbour, so the training figure is likely inflated.
print(f"training accuracy: {knn.score(X_train, y_train)}")
print(f"testing accuracy: {knn.score(X_test, y_test)}")

In [None]:
# Hard class predictions of the KNN model for the held-out split.
knn_preds = knn.predict(X_test)

In [None]:
# Rows are true labels, columns are predicted labels (sklearn convention).
print(confusion_matrix(y_test, knn_preds))

In [None]:
# Held-out metrics for the KNN model.
precision = precision_score(y_test, knn_preds)
accuracy = accuracy_score(y_test, knn_preds)
recall = recall_score(y_test, knn_preds)
F1 = f1_score(y_test, knn_preds)

# One value per line, in the order: precision, accuracy, recall, F1.
print(precision, accuracy, recall, F1, sep='\n')

# Naive Bayes

In [None]:
# Gaussian Naive Bayes baseline.
gnb = GaussianNB()

# Train on the same (scaled) design matrix that this cell scores on and that
# the following cells predict on. The model was previously fit on the unscaled
# X_train_ns but evaluated on the scaled X_train / X_test, so the reported
# numbers described a model applied to differently-scaled inputs.
gnb.fit(X_train, y_train)

print(f"training accuracy: {gnb.score(X_train, y_train)}")
print(f"testing accuracy: {gnb.score(X_test, y_test)}")

In [None]:
# Naive Bayes predictions for the scaled held-out features, and their confusion
# matrix (rows are true labels, columns are predicted labels).
gnb_preds = gnb.predict(X_test)
print(confusion_matrix(y_test, gnb_preds))

In [None]:
# Held-out metrics for the Naive Bayes model.
precision = precision_score(y_test, gnb_preds)
accuracy = accuracy_score(y_test, gnb_preds)
recall = recall_score(y_test, gnb_preds)
F1 = f1_score(y_test, gnb_preds)

# One value per line, in the order: precision, accuracy, recall, F1.
print(precision, accuracy, recall, F1, sep='\n')

# Decision Tree

In [None]:
# Depth-limited decision tree (Gini impurity) trained on the unscaled features.
tree = DecisionTreeClassifier(max_depth=10, criterion='gini')

tree.fit(X_train_ns, y_train_ns)

# Mean accuracy on the training and held-out unscaled splits.
print(f"training accuracy: {tree.score(X_train_ns, y_train_ns)}")
print(f"testing accuracy: {tree.score(X_test_ns, y_test_ns)}")

In [None]:
# Decision-tree predictions for the unscaled held-out features, and their
# confusion matrix (rows are true labels, columns are predicted labels).
tree_preds = tree.predict(X_test_ns)
print(confusion_matrix(y_test_ns, tree_preds))

In [None]:
# Held-out metrics for the decision tree.
precision = precision_score(y_test_ns, tree_preds)
accuracy = accuracy_score(y_test_ns, tree_preds)
recall = recall_score(y_test_ns, tree_preds)
F1 = f1_score(y_test_ns, tree_preds)

# One value per line, in the order: precision, accuracy, recall, F1.
print(precision, accuracy, recall, F1, sep='\n')

# Random Forest

In [None]:
# Class-weighted random forest of depth-limited trees; every split considers
# all features (max_features=None). Trained on the unscaled features.
forest = RandomForestClassifier(max_features=None, class_weight="balanced",
                            max_depth = 10)

forest.fit(X_train_ns, y_train_ns)

print(f"training accuracy: {forest.score(X_train_ns, y_train_ns)}")
# Score the held-out split here; this line previously re-scored the training
# split, so the "testing accuracy" simply repeated the training figure.
print(f"testing accuracy: {forest.score(X_test_ns, y_test_ns)}")

In [None]:
# Random-forest predictions for the unscaled held-out features, and their
# confusion matrix (rows are true labels, columns are predicted labels).
forest_preds = forest.predict(X_test_ns)
print(confusion_matrix(y_test_ns, forest_preds))

In [None]:
# Held-out metrics for the random forest.
precision = precision_score(y_test_ns, forest_preds)
accuracy = accuracy_score(y_test_ns, forest_preds)
recall = recall_score(y_test_ns, forest_preds)
F1 = f1_score(y_test_ns, forest_preds)

# One value per line, in the order: precision, accuracy, recall, F1.
print(precision, accuracy, recall, F1, sep='\n')

# Tuning the KNN Model

In [None]:
# Exhaustive 5-fold grid search over KNN hyperparameters, ranked by recall.
# The 'good N' prints are progress markers for this long-running cell.
knn_tuned = KNeighborsClassifier(algorithm='auto')

print('good 1')

# NOTE(review): leaf_size is documented as a speed/memory knob for the tree
# algorithms; it probably does not change recall and quadruples the number of
# fits -- consider removing it from the grid.
parameters_KNN = dict(
    n_neighbors=(5, 30, 50),
    leaf_size=(10, 30, 100, 1000),
    weights=('uniform', 'distance'),
)

print('good 2')

grid_search_KNN = GridSearchCV(
    knn_tuned,
    parameters_KNN,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)

print('good 3')

# fit() returns the fitted search object itself, so KNN and grid_search_KNN
# refer to the same object.
KNN = grid_search_KNN.fit(X_train, y_train)

print('good 4')

print(grid_search_KNN.best_params_)
print('--------------------------')
print('Best Score - KNN:', grid_search_KNN.best_score_)