# Load Dependencies 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import Data

In [37]:
# Read the two feature tables and the training labels from the Data folder.
train, test, target = map(
    pd.read_csv,
    ('Data/train_features.csv',
     'Data/test_features.csv',
     'Data/train_labels.csv'),
)
# Show (rows, columns) for both feature tables.
train.shape, test.shape

((59400, 40), (14358, 40))

# Select Features

In [38]:
# Column labels kept for modeling; the order here is the column order of
# the subset frames produced by select_features().
selected_features = [
    'amount_tsh', 'date_recorded', 'gps_height', 'basin',
    'region', 'population', 'public_meeting', 'scheme_management',
    'permit', 'construction_year', 'extraction_type_class', 'management_group',
    'payment', 'quality_group', 'quantity', 'source_type',
    'source_class', 'waterpoint_type', 'funder', 'installer',
]

def select_features(df, features):
    '''
    Return the subset of ``df`` made up of the requested columns.

    Parameters
    ----------
    df : DataFrame to take columns from.
    features : column labels to keep; passed unchanged to ``df[...]``.

    Returns
    -------
    Whatever ``df[features]`` evaluates to (for a list of labels, a
    DataFrame holding those columns in the given order).
    '''
    subset = df[features]
    return subset

In [39]:
# Cut both feature tables down to the chosen columns, then show their shapes.
train, test = (
    select_features(frame, features=selected_features)
    for frame in (train, test)
)
train.shape, test.shape

((59400, 20), (14358, 20))

# Process Features 

In [6]:
def wrangle_features(X, reference_year=2014):
    '''
    Engineer date, scheme-management and pump-age features.

    The input frame is copied first, so the caller's frame is not modified.

    Parameters
    ----------
    X : DataFrame with 'date_recorded', 'scheme_management' and
        'construction_year' columns.
    reference_year : year that 'construction_year' is subtracted from to
        obtain 'pump_age'. Defaults to 2014, the value previously hard-coded.

    Returns
    -------
    Copy of ``X`` with 'date_recorded_month', 'date_recorded_year' and
    'pump_age' added, and 'date_recorded' and 'construction_year' dropped.
    '''
    X = X.copy()

    # Create month and year features from the recorded date feature
    recorded = pd.to_datetime(X['date_recorded'])
    X['date_recorded_month'] = recorded.dt.month
    X['date_recorded_year'] = recorded.dt.year

    # Bin low freq. categories into 'Other'
    X['scheme_management'] = X['scheme_management'].replace({
        'SWC': 'Other',
        'Trust': 'Other',
        'None': 'Other'
    })

    # Create age feature out of construction_year.
    # Rows with construction_year == 0 are flagged with -1 rather than being
    # given an age equal to the reference year. The earlier code computed
    # this replacement but threw the result away, so the flag was never
    # actually stored in the column.
    year_built = X['construction_year']
    X['pump_age'] = (reference_year - year_built).where(year_built != 0, -1)

    # Drop unnecessary columns
    drop_cols = ['date_recorded', 'construction_year']
    X = X.drop(columns=drop_cols)

    return X

In [7]:
# Run the same feature engineering over both tables.
train, test = (wrangle_features(frame) for frame in (train, test))

train.shape, test.shape

((59400, 19), (14358, 19))

In [8]:
from sklearn.preprocessing import StandardScaler
def standardize(X, fit_on=None):
    '''
    Standardize 'amount_tsh', 'gps_height' and 'population'.

    Parameters
    ----------
    X : DataFrame containing the three columns above. It is not modified.
    fit_on : optional DataFrame the scaler is fitted on. When omitted, the
        notebook-level ``train`` frame is used *as it exists at call time*;
        if ``train`` has already been replaced by its standardized version,
        the scaler is fitted on those already-scaled values.

    Returns
    -------
    New DataFrame: the untouched columns of ``X`` followed by the three
    scaled columns, with the same row index as ``X``.
    '''
    # Features to standardize
    standardize_cols = ['amount_tsh', 'gps_height', 'population']

    # Default to the global training frame so existing calls keep working.
    reference = train if fit_on is None else fit_on

    # Cast to float on a copy (the original cast was there to silence the
    # data conversion warning) instead of writing back into the caller's frame.
    raw_values = X[standardize_cols].astype(float)

    # Fit on the reference frame only, then transform the requested frame
    scaler = StandardScaler()
    scaler.fit(reference[standardize_cols].astype(float))

    # Keep X's index and the column names on the scaled values so the
    # column-wise concat below lines rows up even for a non-default index.
    scaled = pd.DataFrame(scaler.transform(raw_values),
                          columns=standardize_cols,
                          index=X.index)

    # Replace the raw columns with their scaled versions
    return pd.concat([X.drop(columns=standardize_cols), scaled], axis=1)

In [9]:
# Scale the test table first: standardize() fits its scaler on the global
# `train`, so `train` must still hold the unscaled values when the test
# table is transformed. Overwriting `train` first would fit the test
# scaler on already-standardized data.
test = standardize(test)
train = standardize(train)

train.shape, test.shape

((59400, 19), (14358, 19))

In [10]:
import category_encoders as ce

def one_hot(X, fit_on=None):
    '''
    One-hot encode the categorical and date-part features.

    Only the encoded columns are returned: every column of ``X`` that is
    not listed in ``one_hot_cols`` below is absent from the result.

    Parameters
    ----------
    X : DataFrame containing all columns in ``one_hot_cols``. It is not
        modified.
    fit_on : optional DataFrame the encoder is fitted on. When omitted,
        the notebook-level ``train`` frame is used.

    Returns
    -------
    DataFrame produced by the fitted encoder's ``transform``.
    '''
    # Features to one hot encode
    one_hot_cols = ['date_recorded_month', 'date_recorded_year',
                    'basin', 'region', 'extraction_type_class',
                    'management_group', 'payment', 'quality_group',
                    'quantity', 'source_type', 'source_class',
                    'waterpoint_type']

    # Default to the global training frame so existing calls keep working.
    reference = train if fit_on is None else fit_on

    # Convert to category dtype (for encoder) on copies of both frames.
    # The earlier version converted X in place and relied on that side
    # effect having already converted the global `train` before the test
    # table was encoded.
    to_encode = X[one_hot_cols].astype('category')
    fit_frame = reference[one_hot_cols].astype('category')

    # Fit on the training data only, never on the frame being transformed
    encoder = ce.OneHotEncoder(use_cat_names=True)
    encoder.fit(fit_frame)

    return encoder.transform(to_encode)

In [11]:
# Encode the training table first, then the test table, and show the shapes.
processed_train, processed_test = (one_hot(frame) for frame in (train, test))
processed_train.shape, processed_test.shape

((59400, 94), (14358, 94))

# Modeling 

In [12]:
# Load the training labels from disk.
target = pd.read_csv(filepath_or_buffer='Data/train_labels.csv')

In [13]:
# Preview the first five label rows.
target.head(n=5)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Random forest with 300 trees and no depth limit. The seed is pinned so
# that re-running the notebook reproduces the same fitted model and scores.
model = RandomForestClassifier(n_estimators=300,
                               max_depth=None,
                               random_state=42)

In [121]:
# 5-fold cross-validation on accuracy, keeping train scores and the
# estimator fitted on each fold.
cv_options = dict(scoring='accuracy',
                  cv=5,
                  return_train_score=True,
                  return_estimator=True)
scores = cross_validate(model,
                        processed_train,
                        target['status_group'],
                        **cv_options)

In [122]:
# Show the per-fold cross-validation results as a table.
pd.DataFrame(data=scores)

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,23.965907,1.13996,"(DecisionTreeClassifier(class_weight=None, cri...",0.778722,0.821692
1,23.611494,1.130344,"(DecisionTreeClassifier(class_weight=None, cri...",0.772157,0.823165
2,24.540524,1.136692,"(DecisionTreeClassifier(class_weight=None, cri...",0.774747,0.822727
3,31.296544,1.173315,"(DecisionTreeClassifier(class_weight=None, cri...",0.770286,0.823506
4,25.210354,1.140754,"(DecisionTreeClassifier(class_weight=None, cri...",0.770837,0.823366


In [16]:
from sklearn.metrics import accuracy_score

# Fit on the full training set (fit returns the estimator itself), then
# predict labels for both the training and the test table.
y_train_pred = model.fit(processed_train, target['status_group']).predict(processed_train)
y_test_pred = model.predict(processed_test)

# Accuracy measured on the same rows the model was fitted on.
score = accuracy_score(y_true=target['status_group'], y_pred=y_train_pred)
print(f'Train Accuracy Score: {score}')

Train Accuracy Score: 0.8197474747474748


# With Boosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with early stopping (n_iter_no_change). The seed is
# pinned so that re-running the notebook reproduces the same fitted model.
grboost = GradientBoostingClassifier(n_estimators=100,
                                     max_depth=3,
                                     verbose=10,
                                     n_iter_no_change=10,
                                     random_state=42)

In [32]:
from sklearn.metrics import accuracy_score

grboost.fit(processed_train, target['status_group'])

# Predict with the boosted model that was just fitted. The earlier code
# called `model.predict` here, which re-used the random forest and so
# reported (and later submitted) the forest's predictions instead.
y_train_pred = grboost.predict(processed_train)
y_test_pred = grboost.predict(processed_test)

# Accuracy measured on the same rows the model was fitted on.
score = accuracy_score(target['status_group'], y_train_pred)
print(f'Train Accuracy Score: {score}')

      Iter       Train Loss   Remaining Time 
         1       50991.8162            1.61m
         2       48892.6589            1.58m
         3       47155.6381            1.56m
         4       45721.8270            1.54m
         5       44517.4814            1.52m
         6       43495.8115            1.50m
         7       42635.8287            1.49m
         8       41896.8445            1.47m
         9       41234.9685            1.47m
        10       40654.4576            1.47m
        11       40159.6736            1.47m
        12       39724.7802            1.46m
        13       39330.3681            1.45m
        14       38998.5898            1.44m
        15       38668.6919            1.42m
        16       38382.4507            1.41m
        17       38120.3841            1.40m
        18       37887.8508            1.38m
        19       37668.4279            1.36m
        20       37461.1813            1.35m
        21       37258.4692            1.33m
        2

# Create Submission 

In [33]:
def create_submission(y_test_pred):
    '''
    Write test-set predictions to a timestamped CSV in 'Submissions/'.

    Reads 'Data/sample_submission.csv', overwrites its 'status_group'
    column with ``y_test_pred`` and saves the result as
    'Submissions/MB_<YYYY-MM-DD>_<HH:MM:SS>.csv' without the index.

    Parameters
    ----------
    y_test_pred : predicted labels, assigned directly to the
        'status_group' column of the sample submission.
    '''
    from pathlib import Path  # local import: only this function needs it

    submission = pd.read_csv('Data/sample_submission.csv')
    submission['status_group'] = y_test_pred

    # Format the timestamp explicitly. Slicing the last 7 characters off
    # str(now) only works when the string ends in a 6-digit fraction and
    # cuts into the time itself otherwise.
    now = pd.to_datetime('now')
    filename = 'MB_' + now.strftime('%Y-%m-%d_%H:%M:%S')

    # Create the output folder on first use instead of failing in to_csv.
    out_dir = Path('Submissions')
    out_dir.mkdir(parents=True, exist_ok=True)

    submission.to_csv(out_dir / f'{filename}.csv', index=False)

In [34]:
# Save the most recently computed test predictions as a submission file.
create_submission(y_test_pred=y_test_pred)

# Review Feature Importances

In [None]:
def plot_feature_importances(model, feature_names=None, top_n=15):
    '''
    Plot the largest feature importances of a fitted model.

    Draws a horizontal bar chart of the ``top_n`` most important features
    (most important at the top) and prints the 10 least important rows.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : optional sequence of names, one per importance value.
        When omitted, the columns of the notebook-level ``processed_train``
        frame are used.
    top_n : maximum number of features to draw. Defaults to 15; fewer bars
        are drawn when the model has fewer features.
    '''
    if feature_names is None:
        feature_names = processed_train.columns

    fi = pd.DataFrame({
        'feature': list(feature_names),
        'importance': model.feature_importances_
    })
    fi['normalized'] = fi['importance'] / fi['importance'].sum()
    # drop=True keeps the pre-sort positions out of the printed table
    fi = fi.sort_values('normalized', ascending=False).reset_index(drop=True)

    top = fi.head(top_n)
    # Reversed positions put the most important feature at the top of the axis
    positions = list(reversed(range(len(top))))

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(positions, top['normalized'], align='center', edgecolor='k')
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    ax.set_title(f'Top {len(top)} Feature Importances')

    print(fi.tail(10))
    plt.show()

In [None]:
# Chart the importances of the fitted random forest.
plot_feature_importances(model=model)