# Classifying waterpoints with TPOT

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the competition CSV files, relative to the notebook.
data_path = 'data'

# Features and labels live in separate files that are both keyed by `id`;
# read each one and attach the labels to the feature rows via an index join.
train_values = pd.read_csv(f'{data_path}/train.csv', index_col='id')
train_labels = pd.read_csv(f'{data_path}/train_labels.csv', index_col='id')
train_df = train_values.join(train_labels)

In [3]:
# (rows, columns) of the joined training frame.
train_df.shape

(59400, 40)

In [4]:
from sklearn.base import TransformerMixin
    
class LatLonImputer(TransformerMixin):
    """Overwrite coordinates of rows with ``longitude < 20`` using per-LGA values.

    For every affected row, ``latitude`` and ``longitude`` are replaced by the
    pair stored for the row's ``lga`` in ``self.lga_latlon``.
    """

    def __init__(self):
        # lga name -> (latitude, longitude) written into the affected rows.
        self.lga_latlon = {
            'Bariadi': (-2.807838, 33.988149),
            'Geita': (-2.878836, 32.227321),
            'Magu': (-2.591117, 33.439851),
        }

    def fit(self, df, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, df, y=None):
        """Return a copy of ``df`` with the affected coordinates replaced.

        Raises:
            AssertionError: if an affected row belongs to an lga that has no
                entry in ``self.lga_latlon``.
        """
        fixed = df.copy()
        # Computed once, before any coordinate is overwritten.
        needs_fix = fixed.longitude < 20
        for lga in fixed.lga[needs_fix].unique():
            assert lga in self.lga_latlon, f'unknown lga: {lga}'
            ref_lat, ref_lon = self.lga_latlon[lga]
            rows = needs_fix & (fixed.lga == lga)
            fixed.loc[rows, 'latitude'] = ref_lat
            fixed.loc[rows, 'longitude'] = ref_lon
        return fixed

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

In [6]:
class NanStringImputer(TransformerMixin):
    """Replace missing values in object-dtype columns with the string 'nan'."""

    def fit(self, df, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, df, y=None):
        """Return a copy of ``df``; columns of other dtypes are left untouched."""
        filled = df.copy()
        for col, dtype in df.dtypes.items():
            if dtype == 'object':
                filled.loc[filled[col].isna(), col] = 'nan'
        return filled

In [7]:
# Numeric columns, passed through the feature pipeline without encoding.
num_features = [
    'latitude',
    'longitude',
    'gps_height',
    'population',
    'amount_tsh',
]

# Categorical columns, ordinal-encoded by the feature pipeline.
cat_features = [
    # who funded / installed
    'funder', 'installer',
    # location
    'basin', 'subvillage', 'region', 'lga', 'ward',
    # administration
    'public_meeting', 'permit',
    # extraction
    'extraction_type', 'extraction_type_group', 'extraction_type_class',
    # payment
    'payment', 'payment_type',
    # water quality and quantity
    'water_quality',
    'quality_group', 'quantity', 'quantity_group',
    # source
    'source', 'source_type', 'source_class',
    # waterpoint
    'waterpoint_type', 'waterpoint_type_group',
]

# Full model input: numeric columns first, then categorical ones.
features = [*num_features, *cat_features]

In [8]:
from category_encoders import OrdinalEncoder

# Encoder for the target column; fitted separately from the feature pipeline.
target_encoder = OrdinalEncoder()

# Ordinal-encode the categorical columns; every other column is passed through.
categorical_encoding = ColumnTransformer(
    transformers=[("cat_features", OrdinalEncoder(), cat_features)],
    remainder='passthrough',
)

# Feature preparation: fix coordinates, fill missing strings, then encode.
feature_pipeline = make_pipeline(
    LatLonImputer(),
    NanStringImputer(),
    categorical_encoding,
)

In [9]:
# Fit the feature pipeline on the training rows and encode them in one pass.
train_features = feature_pipeline.fit_transform(train_df[features])

# Encode the status labels and flatten the result into a 1-D array.
encoded_status = target_encoder.fit_transform(train_df.status_group.values)
train_target = encoded_status.values.ravel()

In [10]:
from tpot import TPOTClassifier

# Hold out 20% of the rows for a final score. The split is seeded so that the
# hold-out set, and therefore the reported score, is the same on every run,
# in line with the seeded TPOT search below.
X_train, X_test, y_train, y_test = train_test_split(train_features, train_target,
                                                    train_size=0.8, test_size=0.2,
                                                    random_state=42)

# TPOT search configuration: 5 generations of 64 pipelines each, scored by
# 5-fold cross-validated accuracy, using all available cores.
pipeline_optimizer = TPOTClassifier(
    generations=5,
    population_size=64, 
    cv=5,
    random_state=42,
    n_jobs=-1,
    memory='auto',
    scoring='accuracy',
    verbosity=5,
    warm_start=True,
)

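Running the TPOT search took about 3 hours, so the fitting code is shown below as a snippet rather than executed in this notebook: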
```python
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('tpot_exported_pipeline.py')
```

Here's what we copied from `tpot_exported_pipeline.py`.

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from tpot.export_utils import set_param_recursive

# Pipeline found by TPOT: a random forest on robust-scaled features.
# Average CV score on the training set was: 0.8047558922558922
forest = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    max_features=0.5,
    min_samples_leaf=8,
    min_samples_split=2,
    bootstrap=False,
)
exported_pipeline = make_pipeline(RobustScaler(), forest)

# Fix random_state to 42 across the pipeline steps.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

In [12]:
from sklearn.model_selection import GridSearchCV

# Search around the TPOT settings: only the leaf size and the number of trees
# vary (2 x 3 = 6 candidates); every other hyperparameter stays as exported.
params = {
    'randomforestclassifier__bootstrap': [False],
    'randomforestclassifier__criterion': ['entropy'],
    'randomforestclassifier__max_features': [0.5],
    'randomforestclassifier__min_samples_leaf': [6, 8],
    'randomforestclassifier__min_samples_split': [2],
    'randomforestclassifier__n_estimators': [100, 160, 180],
}

grid_search = GridSearchCV(
    exported_pipeline,
    params,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=3,
)

# `fit` returns the fitted search object itself.
results = grid_search.fit(train_features, train_target)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


In [13]:
# Mean cross-validated accuracy of the best candidate in the grid.
results.best_score_

0.8072558922558923

In [14]:
# Hyperparameter combination that achieved the best score.
results.best_params_

{'randomforestclassifier__bootstrap': False,
 'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__max_features': 0.5,
 'randomforestclassifier__min_samples_leaf': 6,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 160}

In [21]:
# Final model: the TPOT pipeline with the hyperparameters selected by the grid
# search (min_samples_leaf=6, n_estimators=160). random_state is fixed, as it
# was for the cross-validated pipeline, so that refitting reproduces the same
# forest and therefore the same submission.
model = make_pipeline(
    RobustScaler(),
    RandomForestClassifier(
        bootstrap=False, 
        criterion="entropy", 
        max_features=0.5, 
        min_samples_leaf=6, 
        min_samples_split=2, 
        n_estimators=160,
        random_state=42,
    )
)

# Train on the full training set (no hold-out) before predicting the test set.
model.fit(X=train_features, y=train_target)

Pipeline(steps=[('robustscaler', RobustScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=False, criterion='entropy',
                                        max_features=0.5, min_samples_leaf=6,
                                        n_estimators=160))])

## Submission

In [17]:
# The test file keeps `id` as an ordinary column; it is needed for the submission.
test_df = pd.read_csv(f'{data_path}/test.csv')
index = test_df.id.values

# Reuse the already-fitted feature pipeline (transform only, no refit).
test_features = feature_pipeline.transform(test_df[features])

# Predict encoded classes, then map them back to the original status labels.
predicted_codes = model.predict(test_features)
test_target = target_encoder.inverse_transform(predicted_codes).values.ravel()

In [18]:
# One row per test waterpoint, indexed by its id, written in submission format.
predictions = pd.DataFrame({'id': index, 'status_group': test_target})
submission = predictions.set_index('id')
submission.to_csv('submission.csv')

In [19]:
# Preview the first rows of the submission.
submission.head(5)

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,non functional
51630,functional
17168,functional
45559,non functional
49871,functional
