In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [24]:
countries = ['A', 'B', 'C']

In [29]:
parameters = {'n_estimators':[5, 10, 100, 500, 1000],
              'min_samples_leaf':[1, 2, 5],
              'max_features':['auto', 5, 10, 25],
              'min_samples_split':[2, 4, 8],
              'max_depth':[1, 10, 100, None]}

In [30]:
results = pd.DataFrame()

for letter in countries:
    df_train = pd.read_csv("data/"+letter+"_hhold_train.csv")
    df_test = pd.read_csv("data/"+letter+"_hhold_test.csv")
    df_train, df_test = df_train.set_index('id'), df_test.set_index('id')
    country_train, country_test = df_train.pop('country'), df_test.pop('country')
    y_train = df_train.pop('poor')
    
    X_train = pd.get_dummies(df_train)
    X_test = pd.get_dummies(df_test)
    
    cols = X_test.columns.intersection(X_train.columns)
    X_test = X_test[cols]
    X_train = X_train[cols]
    
    X_train = X_train.fillna(value=0)
    X_test = X_test.fillna(value=0)
       
    model = RandomForestClassifier()
    clf = GridSearchCV(model, parameters, scoring='neg_log_loss',n_jobs=-1, cv=5, verbose=1)
    clf.fit(X_train, y_train)
        
    y_hat = clf.predict_proba(X_test)
    
    X_test['poor'] = y_hat[:,1] 
    
    predictions = pd.DataFrame()
    predictions['country'] = country_test
    predictions['poor'] = X_test['poor']
    
    results = results.append(predictions)

    print("Country "+letter+" done")
    print(clf.best_params_)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 41.2min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 50.3min finished


Country A done
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 459 tasks      | elapsed:   57.7s
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1259 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1809 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 2459 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 3209 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 17.6min finished


Country B done
{'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 458 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1258 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1808 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 2458 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 3208 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 29.3min finished


Country C done
{'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 10}


In [31]:
results.to_csv("submission_cv_optimised_RF.csv")