In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from statistics import mean
import os

In [2]:
ADULT_PATH = './adult/adult.pkl'
DOTA_PATH = './dota2Dataset/dota2Train.pkl'
CONNECT_4_PATH = './connect-4/connect-4.pkl'

In [3]:
# load datasets

adult = pickle.load(open(ADULT_PATH, 'rb'))
dota = pickle.load(open(DOTA_PATH, 'rb'))
connect = pickle.load(open(CONNECT_4_PATH, 'rb'))

#test
adult = adult.iloc[:10000, :]
dota = dota.iloc[:10000, :]
connect = connect.iloc[:10000, :]

adult_label = adult.iloc[:,0]
dota_label = dota.iloc[:,0]
connect_label = connect.iloc[:,0]

adult.drop(columns='label', inplace=True)
dota.drop(columns='label', inplace=True)
connect.drop(columns='label', inplace=True)

adult = pd.get_dummies(adult)
dota = pd.get_dummies(dota)
connect = pd.get_dummies(connect)

print(type(adult), type(dota), type(connect))
print(adult.shape, dota.shape, connect.shape)
print(adult_label.unique(), dota_label.unique(), connect_label.unique())

datasets = [adult.values, dota.values, connect.values]
labels = [adult_label.values, dota_label.values, connect_label.values]

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
(50, 108) (50, 115) (50, 42)
[-1  1] [-1  1] [ 1 -1]


In [4]:
#partition schemes and classifiers
partitions = [0.2, 0.5, 0.8]

rfc = RandomForestClassifier(n_estimators=1024, max_depth=10, random_state=0, n_jobs=-1)
max_features = [1,2,4,6,8,12,16,20]
parameters_1 = {'clf__max_features': max_features}

lgc = LogisticRegression(random_state=0, n_jobs=-1)
penalty = ['l1', 'l2']
C = [10**x for x in range(-8,5)]
parameters_2 = {'clf__penalty': penalty, 'clf__C': C}

svc = SVC(gamma='auto')
C = [10**x for x in range(-7,3)]
kernel = ['linear', 'poly']
degree = [2,3]
parameters_3 = {'clf__C': C, 'clf__kernel': kernel, 'clf__degree': degree}

clfs = [
    RandomForestClassifier(n_estimators=1024, max_depth=20, random_state=0, n_jobs=-1), 
    LogisticRegression(random_state=0, n_jobs=-1),
    SVC(gamma='auto')
]

parameters = [parameters_1, parameters_2, parameters_3]
clf_names = ['rf', 'lg', 'svm']

In [5]:
for classifier in range(3):
    for dataset in range(3):        
        for partition in partitions:
            test_accs = []
            train_accs = []
            val_accs = []
            for trial in range(3):
                X = datasets[dataset]
                y = labels[dataset]
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=partition, stratify=y)
                
                #Make pipeline
                pipeline = Pipeline(
                    [('scaler', MinMaxScaler()),
                     ('clf', clfs[classifier]),                     
                ])
                clf = GridSearchCV(estimator=pipeline, param_grid=parameters[classifier], 
                                   n_jobs=-1, cv=3, return_train_score=True, iid=False)
                clf.fit(X_train, y_train)
                
                print('Classifier {}, dataset {}, partition {}, trial {}'.format(classifier, dataset, partition, trial))
                
                train_accuracy = clf.cv_results_['mean_train_score'][clf.best_index_]
                train_accs.append(train_accuracy)
                
                val_accuracy = clf.cv_results_['mean_test_score'][clf.best_index_]
                val_accs.append(val_accuracy)
                
                test_accuracy = clf.score(X_test, y_test)
                test_accs.append(test_accuracy)
                
                print('Accuracy: {}'.format(test_accuracy))

                print('Best estimator:', clf.best_estimator_)
            
                print('Best params: ', clf.best_params_)
            
                clf_name = clf_names[classifier]
                dirname = "./classifier/{}/{}/{}/{}".format(
                    classifier, dataset, partition, trial
                )
               
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                    
                clf_dump = open("{}/{}.pkl".format(
                    dirname, clf_name
                ),"wb")
                pickle.dump(clf, clf_dump)
                clf_dump.close()
                
                score_dump = open("{}/y_test.pkl".format(
                    dirname
                ),"wb")
                pickle.dump(y_test, score_dump)
                score_dump.close()
                
            avg_test = mean(test_accs)
            avg_train = mean(train_accs)
            avg_val = mean(val_accs)
            
            test_accs.clear()
            train_accs.clear()
            val_accs.clear()
            
            print('Average test accuracy for {}, {}, {}: {}'.format(
                clf_names[classifier], 
                dataset,
                partition,
                avg_test
            ))
            
            print('Average train accuracy for {}, {}, {}: {}'.format(
                clf_names[classifier], 
                dataset,
                partition,
                avg_train
            ))
            
            print('Average val accuracy for {}, {}, {}: {}'.format(
                clf_names[classifier], 
                dataset,
                partition,
                avg_val
            ))

Classifier 0, dataset 0, partition 0.2, trial 0
Accuracy: 0.9
Best estimator: Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=20, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1024, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])
Best params:  {'clf__max_features': 20}
Average test accuracy for rf, 0, 0.2: 0.9
Average train accuracy for rf, 0, 0.2: 1.0
Average val accuracy for rf, 0, 0.2: 0.7490842490842491
Classifier 0, dataset 0, partition 0.5, trial 0
Accuracy: 0.72
Best estimator: Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', RandomForestClassifier(bootstrap=True, class_weight=No

Classifier 1, dataset 0, partition 0.8, trial 0
Accuracy: 0.75
Best estimator: Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', LogisticRegression(C=1e-08, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l1', random_state=0, solver='warn', tol=0.0001,
          verbose=0, warm_start=False))])
Best params:  {'clf__C': 1e-08, 'clf__penalty': 'l1'}
Average test accuracy for lg, 0, 0.8: 0.75
Average train accuracy for lg, 0, 0.8: 0.8055555555555557
Average val accuracy for lg, 0, 0.8: 0.8333333333333334
Classifier 1, dataset 1, partition 0.2, trial 0
Accuracy: 0.6
Best estimator: Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penal

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1