# Picking up from my question in notebook 06: I want to see if I can discover anything about the waterpoint type "Other"

### Our metric is: **TODO** — state the evaluation metric here

In [2]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#additional EDA
from pandas_profiling import ProfileReport

# Model selection
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, plot_confusion_matrix, classification_report

# Ensemble/XGBoost
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# Visualizing
from sklearn.tree import export_graphviz
from IPython.display import Image  
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data
from sklearn.tree import plot_tree

### Reading in the geodata_clean CSV. Amanda's research showed that some observations needed to be dropped; see the README for details.

In [3]:
# Load the cleaned geodata export (path is relative to this notebook's folder).
df = pd.read_csv(filepath_or_buffer="../geodata_clean.csv")

# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,geometry,x,y
0,0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,POINT (34.93809275 -9.856321769999999),34.938093,-9.856322
1,1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,POINT (34.6987661 -2.14746569),34.698766,-2.147466
2,2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional,POINT (37.46066446 -3.82132853),37.460664,-3.821329
3,3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,POINT (38.48616088 -11.15529772),38.486161,-11.155298
4,4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,POINT (31.13084671 -1.82535885),31.130847,-1.825359


In [6]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57588 entries, 0 to 57587
Data columns (total 45 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             57588 non-null  int64  
 1   id                     57588 non-null  int64  
 2   amount_tsh             57588 non-null  float64
 3   date_recorded          57588 non-null  object 
 4   funder                 53966 non-null  object 
 5   gps_height             57588 non-null  int64  
 6   installer              53952 non-null  object 
 7   longitude              57588 non-null  float64
 8   latitude               57588 non-null  float64
 9   wpt_name               57588 non-null  object 
 10  num_private            57588 non-null  int64  
 11  basin                  57588 non-null  object 
 12  subvillage             57217 non-null  object 
 13  region                 57588 non-null  object 
 14  region_code            57588 non-null  int64  
 15  di

I want to see the count of "other" waterpoint types by status (`status_group`).

In [12]:
# Keep only the waterpoint / basin / extraction descriptor columns for the
# categorical exploration that follows.
cat_df = df.loc[:, [
    'waterpoint_type',
    'waterpoint_type_group',
    'basin',
    'extraction_type',
    'extraction_type_group',
    'extraction_type_class',
]]


In [13]:
# Convert every column of cat_df to the pandas ``category`` dtype and keep the
# result. The original expression only displayed the converted frame without
# assigning it, so cat_df itself kept its original dtypes. cat_df holds exactly
# the six descriptor columns selected above, so converting the whole frame is
# equivalent to selecting them by name first (and avoids indexing ``.loc`` with
# a tuple, whose handling differs between pandas versions).
cat_df = cat_df.astype('category')

# Display the converted frame.
cat_df

Unnamed: 0,waterpoint_type,waterpoint_type_group,basin,extraction_type,extraction_type_group,extraction_type_class
0,communal standpipe,communal standpipe,Lake Nyasa,gravity,gravity,gravity
1,communal standpipe,communal standpipe,Lake Victoria,gravity,gravity,gravity
2,communal standpipe multiple,communal standpipe,Pangani,gravity,gravity,gravity
3,communal standpipe multiple,communal standpipe,Ruvuma / Southern Coast,submersible,submersible,submersible
4,communal standpipe,communal standpipe,Lake Victoria,gravity,gravity,gravity
...,...,...,...,...,...,...
57583,communal standpipe,communal standpipe,Pangani,gravity,gravity,gravity
57584,communal standpipe,communal standpipe,Rufiji,gravity,gravity,gravity
57585,hand pump,hand pump,Rufiji,swn 80,swn 80,handpump
57586,hand pump,hand pump,Rufiji,nira/tanira,nira/tanira,handpump


In [24]:
def plot_category(df, category, status_col='status_group'):
    """Draw a grouped bar chart of pump counts for each value of ``category``.

    One cluster of bars is drawn per distinct value of ``category``; within a
    cluster there is one bar per distinct value of ``status_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the ``category`` and ``status_col`` columns.
    category : str
        Name of the column to break the counts down by.
    status_col : str, default 'status_group'
        Name of the column whose distinct values become the legend entries.

    Returns
    -------
    pandas.DataFrame
        Count table with one row per ``status_col`` value and one column per
        ``category`` value.

    Raises
    ------
    KeyError
        If ``category`` or ``status_col`` is not a column of ``df``.
    """
    # NOTE(review): the original line was ``cat_df(df, category)``, but
    # ``cat_df`` is a DataFrame in this notebook and is not callable. The rest
    # of the function expects rows = legend labels and columns = x categories,
    # so a status-by-category count table is the presumed intent -- confirm.
    new_df = pd.crosstab(df[status_col], df[category])

    fig, ax = plt.subplots(figsize=(15, 10))

    n_groups = len(new_df.index)
    # 0.75 of each unit slot is shared between the bars of a cluster, which
    # reproduces the original width of .25 when there are three status rows.
    width = 0.75 / max(n_groups, 1)
    positions = np.arange(len(new_df.columns))

    # One bar series per status row, each shifted right by one bar width.
    for offset, (label, counts) in enumerate(new_df.iterrows()):
        ax.bar(positions + offset * width, counts.values, label=label, width=width)

    pretty_name = category.replace('_', ' ').title()
    ax.set_title('Number of Pumps Per {} Category'.format(pretty_name), fontsize=18)
    ax.set_ylabel('Number of Pumps', fontsize=15)
    ax.set_xlabel('{} Categories'.format(pretty_name), fontsize=15)

    # Put each tick under the middle of its cluster.
    ax.set_xticks(positions + width * (n_groups - 1) / 2)
    ax.set_xticklabels(new_df.columns)
    ax.legend()

    return new_df

In [None]:
# Plot how many pumps fall under each waterpoint type.
#
# NOTE(review): the original cell created two figures (the first stayed empty),
# assigned ``x_labels`` and ``y`` without using them, and reduced the whole
# frame to a single scalar with ``cat_df.values.sum()``, which cannot be
# plotted against the per-row ``x`` values. Per-type counts are the presumed
# intent -- confirm.
type_counts = cat_df.waterpoint_type.value_counts()

x_labels = type_counts.index
# Cast to plain strings so the labels plot the same way whether the column is
# object or category dtype.
x = type_counts.index.astype(str).to_numpy()
y = type_counts.values

# Create just a figure and only one subplot
fig, ax = plt.subplots(figsize=(15, 10))
ax.bar(x, y)
ax.set_title('Simple plot')  # TODO: replace the placeholder title
ax.set_xlabel('Waterpoint Type')
ax.set_ylabel('Number of Pumps')
plt.show()