In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [2]:
# Keep only the (time, site) key columns of each label file; these keys are
# later inner-merged against the feature tables to pick train/validation rows.
# NOTE(review): "~data/" is kept verbatim — confirm it is not meant to be
# "~/data/" or "data/".
time_site_pairs_train = pd.read_csv("~data/train_labels_mean_zero.csv").loc[:, ['time', 'site']]
time_site_pairs_val = pd.read_csv("~data/validation_labels_mean_zero.csv").loc[:, ['time', 'site']]

In [3]:
# Merged feature tables; each is split into train/validation further down.
# NOTE(review): the "mean"/"neg" file-name parts presumably refer to how
# missing values were filled — confirm against the preprocessing code.
df_merged_zero = pd.read_csv(filepath_or_buffer="~data/df_merged_mean_15x15_final.csv")
df_merged_neg = pd.read_csv(filepath_or_buffer="~data/df_merged_neg_15x15_final.csv")

## df_merged_zero

In [4]:
def get_train_test(dataset):
    """Split ``dataset`` into train/test feature frames and label series.

    Rows are selected by inner-merging ``dataset`` with the notebook-level
    ``time_site_pairs_train`` and ``time_site_pairs_val`` frames on the
    ``time`` and ``site`` columns. The two key columns are then removed and
    the ``riskLevelLabel`` column is split off as the target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``time``, ``site`` and ``riskLevelLabel`` columns.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    key_columns = ['time', 'site']

    def _features_and_target(pairs):
        # Inner merge keeps only rows whose (time, site) key appears in `pairs`.
        selected = dataset.merge(pairs, on=key_columns, how='inner')
        features = selected.drop(key_columns, axis=1)
        target = features.pop('riskLevelLabel')
        return features, target

    X_train, y_train = _features_and_target(time_site_pairs_train)
    X_test, y_test = _features_and_target(time_site_pairs_val)

    return X_train, y_train, X_test, y_test

In [5]:
def rf_result(X_train, y_train, X_test, y_test, model_name):
    """Search a random-forest hyper-parameter grid, then score and save the best model.

    Every combination in the internal grid is fitted on the training data and
    ranked by F1 on ``(X_test, y_test)``. The best fitted estimator is kept
    (not refit), so the saved and reported model is exactly the one that was
    scored during the search.

    NOTE(review): hyper-parameters are selected on the same split that the
    returned metrics are computed on, so those metrics are optimistic as soon
    as the grid holds more than one candidate.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels.
    X_test, y_test
        Evaluation features and binary labels.
    model_name : str
        Label used in the progress message; the model is written to
        ``model_name + ".joblib"``.

    Returns
    -------
    dict
        Keys ``'Precision'``, ``'Recall'``, ``'f1'``, ``'acc'``, ``'AUC'`` and
        ``'Best Params'``.
    """
    from itertools import product

    n_estimators = 100
    seed = 42

    param_grid = {
        'max_features': [None],
        'max_depth': [None],
        'min_samples_split': [2],
        'min_samples_leaf': [1]
    }
    param_names = list(param_grid)

    # Start below any attainable F1 so that a candidate scoring exactly 0 is
    # still recorded; otherwise best_params would stay empty and the final
    # model would silently fall back to library defaults.
    best_score = float('-inf')
    best_params = {}
    best_rf = None

    # product() iterates in the same order as the equivalent nested loops
    # (first key outermost, last key innermost).
    for values in product(*(param_grid[name] for name in param_names)):
        params = dict(zip(param_names, values))

        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=seed,
                                    **params)
        rf.fit(X_train, y_train)

        f1 = f1_score(y_test, rf.predict(X_test))

        if f1 > best_score:
            best_score = f1
            best_params = params
            best_rf = rf

        print('[', model_name,': max_features', params['max_features'],
              '| max_depth', params['max_depth'],
              '| min_samples_split', params['min_samples_split'],
              '| min_samples_leaf', params['min_samples_leaf'], '] DONE !')

    # Evaluate the winning estimator on the evaluation split.
    y_pred = best_rf.predict(X_test)
    # Column 1 of predict_proba is the score of the second class in classes_.
    y_pred_proba = best_rf.predict_proba(X_test)[:, 1]

    accuracy = np.mean(y_pred == y_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    # save model file
    joblib.dump(best_rf, model_name+".joblib")

    return {'Precision': precision,
            'Recall': recall,
            'f1': f1,
            'acc': accuracy,
            'AUC': auc,
            'Best Params': best_params}

In [10]:
# Accumulates one metrics dictionary per trained model.
# This cell must be executed before any cell that appends to `results` or
# reads the split variables defined here.
results = list()

(X_train_neg, y_train_neg,
 X_test_neg, y_test_neg) = get_train_test(df_merged_neg)

(X_train_zero, y_train_zero,
 X_test_zero, y_test_zero) = get_train_test(df_merged_zero)

In [7]:
%%time
result = rf_result(X_train_neg, y_train_neg, X_test_neg, y_test_neg, 'best_rf_model_neg')
result['Dataset'] = 'df_merged_neg'
results.append(result)

result = rf_result(X_train_zero, y_train_zero, X_test_zero, y_test_zero, 'best_rf_model_zero')
result['Dataset'] = 'df_merged_zero'
results.append(result)

[ best_rf_model_neg : max_features None | max_depth None | min_samples_split 2 | min_samples_leaf 1 ] DONE !


ValueError: Found input variables with inconsistent numbers of samples: [23822, 37422]

In [18]:
# Sanity check: number of training labels produced by get_train_test(df_merged_neg).
y_train_neg.shape

(71532,)

In [8]:
# Turn the list of per-dataset metric dictionaries into a table
# (one row per rf_result call appended to `results`) and display it.
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Precision,Recall,f1,acc,AUC,Best Params,Dataset
0,0.365854,0.027624,0.05137,0.976744,0.664236,"{'max_features': None, 'max_depth': None, 'min...",df_merged_neg


In [9]:
# Example: reload a model saved by rf_result (written as model_name + ".joblib"), e.g.
# loaded_rf = joblib.load("best_rf_model_neg.joblib")