<a href="https://colab.research.google.com/github/mkirby1995/Tanzania_water_project/blob/master/Tanzania_Attempt_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

In [0]:
import pandas as pd

In [0]:
# Remote CSV locations for the training features and their labels.
features = 'https://raw.githubusercontent.com/mkirby1995/Tanzania_water_project/master/train_features.csv'
target = 'https://raw.githubusercontent.com/mkirby1995/Tanzania_water_project/master/train_labels.csv'

# `features` is rebound from the URL string to the loaded frame.
labels = pd.read_csv(target)
features = pd.read_csv(features)

In [0]:
# Test-set features; the model's predictions for these rows are written to the
# submission file at the end of the notebook.
X_test = pd.read_csv('https://raw.githubusercontent.com/mkirby1995/Tanzania_water_project/master/test_features.csv')

# Clean

In [0]:
!pip install category_encoders



In [0]:
import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
import numpy as np

In [0]:
def reverse_cardinality_check(n, df):
  """
  Find the columns of a dataframe whose cardinality exceeds a limit.

  Cardinality is the number of distinct non-null values in a column
  (``nunique()``), which equals the length of ``value_counts()``.

  Parameters
  ----------
  n : int
      Cardinality limit; only columns with strictly more than ``n`` distinct
      values are reported.
  df : pandas.DataFrame
      Frame whose columns are inspected.

  Returns
  -------
  dict
      Maps each qualifying column name to its cardinality (a plain ``int``),
      in the frame's column order.
  """
  # A single nunique() pass replaces computing value_counts() twice per column.
  cardinalities = df.nunique()

  return {
      column: int(count)
      for column, count in cardinalities.items()
      if count > n
  }

## Exploration

In [0]:
# Replace construction_year values of 0 with 1993 (presumably 0 marks an
# unknown year; confirm the choice of 1993).
features['construction_year'] = features['construction_year'].replace({0: 1993})

# Age is the recording year (first four characters of date_recorded) minus the
# construction year.
recorded_year = features['date_recorded'].astype(str).str[:4].astype(int)
features['age'] = recorded_year - features['construction_year']

# Zeros in either operand are swapped for 1 before dividing.
population_nonzero = features['population'].replace({0: 1})
age_nonzero = features['age'].replace({0: 1})
features['pop/year'] = population_nonzero / age_nonzero

In [0]:
# Same derived columns as the training frame, computed on the test frame.
X_test['construction_year'] = X_test['construction_year'].replace({0: 1993})

# Recording year is the first four characters of date_recorded.
test_recorded_year = X_test['date_recorded'].astype(str).str[:4].astype(int)
X_test['age'] = test_recorded_year - X_test['construction_year']

# Zeros in either operand are swapped for 1 before dividing.
test_population_nonzero = X_test['population'].replace({0: 1})
test_age_nonzero = X_test['age'].replace({0: 1})
X_test['pop/year'] = test_population_nonzero / test_age_nonzero

In [0]:
# Ratio of amount_tsh to population, with zeros in either column replaced by 1.
tsh_nonzero = features['amount_tsh'].replace({0: 1})
people_nonzero = features['population'].replace({0: 1})
features['water_/_person'] = tsh_nonzero / people_nonzero

In [0]:
# Ratio of amount_tsh to population on the test frame, with zeros in either
# column replaced by 1.
test_tsh_nonzero = X_test['amount_tsh'].replace({0: 1})
test_people_nonzero = X_test['population'].replace({0: 1})
X_test['water_/_person'] = test_tsh_nonzero / test_people_nonzero

## Features

In [0]:
# Columns with more than 150 distinct values are treated as high-cardinality.
high_cardinality_feature_dict = reverse_cardinality_check(n=150, df=features)
high_cardinality_feature_dict

{'date_recorded': 356,
 'funder': 1897,
 'gps_height': 2428,
 'id': 59400,
 'installer': 2145,
 'latitude': 57517,
 'longitude': 57516,
 'pop/year': 4421,
 'population': 1049,
 'scheme_name': 2696,
 'subvillage': 19287,
 'ward': 2092,
 'water_/_person': 2633,
 'wpt_name': 37400}

In [0]:
# Everything that is not high-cardinality stays in the low-cardinality frame.
high_cardinality_columns = list(high_cardinality_feature_dict)
low_cardinality_features = features.drop(columns=high_cardinality_columns)
low_cardinality_features.columns

Index(['amount_tsh', 'num_private', 'basin', 'region', 'region_code',
       'district_code', 'lga', 'public_meeting', 'recorded_by',
       'scheme_management', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group', 'age'],
      dtype='object')

In [0]:
# Select the high-cardinality columns in the order the dict lists them.
high_cardinality_features = features[[*high_cardinality_feature_dict]]
high_cardinality_features.columns

Index(['id', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude',
       'latitude', 'wpt_name', 'subvillage', 'ward', 'population',
       'scheme_name', 'pop/year', 'water_/_person'],
      dtype='object')

In [0]:
# Fit each encoder on its training frame, then replace the frame with the
# encoded version. The fitted encoders are reused on the test frames later.
one_hot_encode = ce.OneHotEncoder(use_cat_names=True)
low_cardinality_features = (
    one_hot_encode
    .fit(low_cardinality_features, labels['status_group'])
    .transform(low_cardinality_features)
)

ordinal_encode = ce.OrdinalEncoder()
high_cardinality_features = (
    ordinal_encode
    .fit(high_cardinality_features, labels['status_group'])
    .transform(high_cardinality_features)
)

In [0]:
# Recombine the two encoded frames row by row.
# NOTE(review): merging `on` an index object (rather than a column name) adds a
# `key_0` column to the result — it is the first entry of the column listing
# shown for `feature_names`. The test-side merge does the same, so the two
# column layouts stay aligned; change both together or neither.
features = low_cardinality_features.merge(high_cardinality_features,
                                          on = low_cardinality_features.index)

In [0]:
# Keep the column labels now: `features` is reassigned to the imputer's and
# scaler's outputs in the following cells.
feature_names = features.columns
feature_names

Index(['key_0', 'amount_tsh', 'num_private', 'basin_Lake Nyasa',
       'basin_Lake Victoria', 'basin_Pangani', 'basin_Ruvuma / Southern Coast',
       'basin_Internal', 'basin_Lake Tanganyika', 'basin_Wami / Ruvu',
       ...
       'installer', 'longitude', 'latitude', 'wpt_name', 'subvillage', 'ward',
       'population', 'scheme_name', 'pop/year', 'water_/_person'],
      dtype='object', length=322)

In [0]:
# Fill NaN entries with the per-column mean learned from the training matrix.
# The fitted imputer is reused on X_test later.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
features = imputer.fit_transform(features, labels['status_group'])

In [0]:
# Scale the imputed training matrix. The fitted scaler is reused on X_test
# later.
scaler = RobustScaler()
features = scaler.fit_transform(features, labels['status_group'])

In [0]:
from sklearn.cluster import KMeans

# Re-read the raw training features to get the original longitude/latitude
# values; `features` has been encoded, imputed and scaled by this point.
longlat = pd.read_csv('https://raw.githubusercontent.com/mkirby1995/Tanzania_water_project/master/train_features.csv')

coordinates = longlat[['longitude', 'latitude']]

# Seeded so that cluster ids are the same on every run; without a random_state
# the centroid initialisation, and therefore the labels, change between runs.
kmeans = KMeans(n_clusters=1500, random_state=42)

kmeans.fit(coordinates)

pd.Series(kmeans.labels_)

array([ 607,  383,  857, ..., 1353,  874,  470], dtype=int32)

In [0]:
# NOTE(review): this cell raises IndexError (see the recorded output).
# `features` was reassigned to the result of `scaler.transform`, presumably a
# NumPy array, which cannot take a string column key. As a result the training
# matrix never receives a geo_cluster column. Any fix must add the same column
# to X_test so the model sees matching column counts.
features['geo_cluster'] = pd.Series(kmeans.labels_)

IndexError: ignored

## X_test

In [0]:
# Partition the test columns exactly as the training frame was partitioned.
# The fitted encoders expect the training column sets, and re-deriving the
# split from test-set cardinalities could move a column across the threshold.
# Values are the test-set cardinalities of those same columns.
high_cardinality_X_test_dict = {
    column: X_test[column].nunique()
    for column in high_cardinality_feature_dict
}
high_cardinality_X_test_dict

In [0]:
# Test columns that are not in the high-cardinality dict.
high_cardinality_test_columns = list(high_cardinality_X_test_dict)
low_cardinality_X_test = X_test.drop(columns=high_cardinality_test_columns)
low_cardinality_X_test.columns

In [0]:
# Test columns listed in the high-cardinality dict, in the dict's order.
high_cardinality_X_test = X_test[[*high_cardinality_X_test_dict]]
high_cardinality_X_test.columns

In [0]:
# Apply the one-hot encoder fitted on the training frame; it is not refitted.
low_cardinality_X_test = one_hot_encode.transform(low_cardinality_X_test)

In [0]:
# Apply the ordinal encoder fitted on the training frame; it is not refitted.
high_cardinality_X_test = ordinal_encode.transform(high_cardinality_X_test)

In [0]:
# Recombine the encoded test frames the same way the training frames were
# merged. That merge produced a `key_0` column (see the training column
# listing); this identical call presumably does too, keeping the layout the
# imputer and scaler were fitted on.
X_test = low_cardinality_X_test.merge(high_cardinality_X_test,
                                          on = low_cardinality_X_test.index)

In [0]:
# Fill missing values using the imputer fitted on the training matrix.
X_test = imputer.transform(X_test)

In [0]:
# Scale with the scaler fitted on the training matrix.
X_test = scaler.transform(X_test)

In [0]:
# Read the raw test features for their coordinates. The previous version
# called the non-existent `pd.dataFrame` and loaded train_labels.csv, which is
# only read for `status_group` elsewhere and is not a source of coordinates.
test_longlat = pd.read_csv('https://raw.githubusercontent.com/mkirby1995/Tanzania_water_project/master/test_features.csv')

test_coordinates = test_longlat[['longitude', 'latitude']]

# Assign each test point to the nearest centroid learned from the training
# coordinates. Refitting on the test set would produce cluster ids unrelated
# to the training ones.
X_test_geo_cluster = kmeans.predict(test_coordinates)

# The labels are kept in their own variable rather than written into X_test.
# X_test is the scaler's output (presumably a NumPy array, so no string column
# keys), and the training-side geo_cluster assignment failed, so the training
# matrix has no such column. Appending one here would give the model a
# different column count from the one it is fitted on.

# Model

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

In [0]:
# Rebind `target` (previously the labels URL string) to the label column used
# for fitting and evaluation.
target = labels['status_group']

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

def con_matrix_analysis(model):
  """
  Report how `model` classifies the module-level `features` against `target`.

  Prints a classification report, draws the confusion matrix as a heatmap,
  and returns the confusion matrix as a DataFrame whose rows are labelled
  "Actual ..." and whose columns are labelled "Predicted ...".
  """
  predicted = model.predict(features)
  actual = target

  report = classification_report(
      actual, predicted,
      target_names=['Functional', 'Needs Repair', 'Not-Functional'])
  print(report)

  row_labels = ['Actual Functional', 'Actual Needs Repair', 'Actual Not-Functional']
  column_labels = ['Predicted Functional', 'Predicted Needs Repair', 'Predicted Not-Functional']
  con_matrix = pd.DataFrame(confusion_matrix(actual, predicted),
                            index=row_labels,
                            columns=column_labels)

  sns.heatmap(data=con_matrix, cmap='cool')
  plt.show()
  return con_matrix

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Model Analysis

## Cross Validation

In [0]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate tree depths; n_iter below matches the number of candidates.
param_distributions = {'max_depth': list(range(25, 30))}

base_forest = RandomForestClassifier(n_estimators=92, n_jobs=-1, random_state=42)

search_options = dict(
    param_distributions=param_distributions,
    n_iter=5,
    cv=5,
    scoring='accuracy',
    verbose=10,
    return_train_score=True,
    n_jobs=-1,
)

# Despite the name, this is a RandomizedSearchCV rather than a grid search.
gridsearch = RandomizedSearchCV(base_forest, **search_options)

gridsearch.fit(features, target)

In [0]:
# One row per parameter combination tried by the search, shown best-ranked
# first (by test score rank).
results = pd.DataFrame(gridsearch.cv_results_)
print(f'Best result from search of {len(results)} parameter combinations')
results.sort_values(by='rank_test_score').head(5)

In [0]:
# Keep the estimator the search selected as best; it is used for the
# confusion-matrix report and for the submission predictions.
model = gridsearch.best_estimator_

In [0]:
# NOTE(review): con_matrix_analysis predicts on `features`, the same matrix the
# search was fitted on, so this report is in-sample.
con_matrix_analysis(model)

## Feature Importance

# Export

In [0]:
# `model` is the best estimator from the parameter search, fitted on `features`.

# X_test is your pandas dataframe or numpy array,
# with the same number of rows, in the same order, as test_features.csv,
# and the same number of columns, in the same order, as `features`.

y_pred = model.predict(X_test)


# Load the sample submission (two columns, id and status_group) and copy it as
# a template; status_group is filled in and the CSV is written in a later cell.

sample_submission = pd.read_csv('https://raw.githubusercontent.com/mkirby1995/Tanzania_water_project/master/sample_submission.csv')
submission = sample_submission.copy()


In [0]:
# Sanity check: the row counts must match before y_pred is assigned into the
# submission frame.
submission.shape, y_pred.shape

In [0]:
# Overwrite the template's status_group with the predictions and save the
# result without the index column.
submission['status_group'] = y_pred
submission.to_csv('submission_1.csv', index=False)

In [0]:
from google.colab import files
# Download the saved CSV via the `files` helper imported from google.colab
# (presumably Colab-only; submission_1.csv is already on disk either way).
files.download('submission_1.csv') 