In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.20.1.


In [6]:
dataset = "militarized-interstate-disputes"

def load_dataset(dataset):
    df_train = pd.read_csv("./"+dataset+"/train.txt.gz", header=None)
    df_test = pd.read_csv("./"+dataset+"/test.txt.gz", header=None)

    train_size = len(df_train)
    df_tog = df_train.append(df_test)

    for col in df_tog.columns[np.where(df_tog.dtypes == 'object')]:
        df_tog[col] = pd.Categorical(df_tog[col])
        

    df_train, df_test = df_tog[:train_size], df_tog[train_size:]
    
    df_train = df_train.fillna(df_train.mode().iloc[0])
    df_test = df_test.fillna(df_test.mode().iloc[0])
    
    return df_train, df_test

df_train, df_test = load_dataset(dataset)

In [14]:
def create_classifiers():
    return [
        (KNeighborsClassifier(3), "KNeighborsClassifier", "k=3"),
        #(SVC(kernel="linear", C=0.025), "SVC", "kernel=\"linear\", C=0.025"),
        #SVC(gamma=2, C=1),
        #GaussianProcessClassifier(1.0 * RBF(1.0)),
        (DecisionTreeClassifier(max_depth=5), "DecisionTreeClassifier", "max_depth=5"),
        (RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), "RandomForestClassifier", "max_depth=5, n_estimators=10, max_features=1"),
        #(MLPClassifier(), "MLPClassifier", "default"),
        (AdaBoostClassifier(), "AdaBoostClassifier", "default"),
        (GaussianNB(), "GaussianNB", "default"),
        #QuadraticDiscriminantAnalysis()
    ]


classifiers = create_classifiers()

In [4]:
def get_X(df):
    return pd.get_dummies(df[df.columns[:-1]], dummy_na=True)
def get_Y(df):
    dfc = df[df.columns[-1]]
    return dfc.cat.codes if dfc.dtype.name == "category" else dfc

def run_classifiers(df_train, df_test, dtset_args):
    for cls in create_classifiers():
        model, cls_name, cls_args = cls
        
        # Fit and score model with given data
        model = model.fit(get_X(df_train), get_Y(df_train))
        score = model.score(get_X(df_test), get_Y(df_test))
        
        # Produce report string (I'd much rather use f strings but need to support older python)
        dtset_name, dtset_comm = dtset_args
        print(dtset_name + "\t" + cls_name  + "\t" + "{:.2f}".format(score*100) + "\t" + "Petr H." + "\t" + ", ".join([dtset_comm, cls_args]) )

run_classifiers(df_train, df_test, ("pamap-easy", "ORIGFEATS"))

pamap-easy	KNeighborsClassifier	58.22	Petr H.	ORIGFEATS, k=3
pamap-easy	DecisionTreeClassifier	77.63	Petr H.	ORIGFEATS, max_depth=5
pamap-easy	AdaBoostClassifier	70.32	Petr H.	ORIGFEATS, default
pamap-easy	GaussianNB	29.45	Petr H.	ORIGFEATS, default


In [11]:
def run_def_datasets(dtsets):
    datasets = [ (dtset, "ORIGFEATS, ONEHOT") for dtset in dtsets ]
    
    for dtst in datasets:
        dtset_name, _ = dtst
        df_train, df_test = load_dataset(dtset_name)
        run_classifiers(df_train, df_test, dtst)

In [13]:
dtsets_slow = [
    #"czech-car-accidents", 
    #"czech-presidental-election-easy", 
    #"czech-presidental-election-hard", 
    "formspring-myspace-data-for-cyberbullying", # Slow
    #"game-cheaters", 
    "it-cybertrolls-text-features", # Slow
    #"militarized-interstate-disputes",
    #"pamap-easy",
    #"psp2013-2017",
    #"student-survey-brno"
]

run_def_datasets(dtsets_slow)

formspring-myspace-data-for-cyberbullying	KNeighborsClassifier	90.62	Petr H.	ORIGFEATS, k=3
formspring-myspace-data-for-cyberbullying	DecisionTreeClassifier	93.92	Petr H.	ORIGFEATS, max_depth=5
formspring-myspace-data-for-cyberbullying	AdaBoostClassifier	94.95	Petr H.	ORIGFEATS, default
formspring-myspace-data-for-cyberbullying	GaussianNB	81.46	Petr H.	ORIGFEATS, default
it-cybertrolls-text-features	KNeighborsClassifier	78.84	Petr H.	ORIGFEATS, k=3
it-cybertrolls-text-features	DecisionTreeClassifier	81.75	Petr H.	ORIGFEATS, max_depth=5
it-cybertrolls-text-features	AdaBoostClassifier	81.78	Petr H.	ORIGFEATS, default
it-cybertrolls-text-features	GaussianNB	49.83	Petr H.	ORIGFEATS, default


In [15]:
dtsets_quick = [
    #"czech-car-accidents", 
    "czech-presidental-election-easy", 
     "czech-presidental-election-hard", 
    #"formspring-myspace-data-for-cyberbullying", # Slow
    "game-cheaters", 
    #"it-cybertrolls-text-features", # Slow
    "militarized-interstate-disputes",
    "pamap-easy",
    "psp2013-2017",
    "student-survey-brno"
]

run_def_datasets(dtsets_quick)

czech-presidental-election-easy	KNeighborsClassifier	58.77	Petr H.	ORIGFEATS, k=3
czech-presidental-election-easy	DecisionTreeClassifier	78.95	Petr H.	ORIGFEATS, max_depth=5
czech-presidental-election-easy	RandomForestClassifier	68.42	Petr H.	ORIGFEATS, max_depth=5, n_estimators=10, max_features=1
czech-presidental-election-easy	AdaBoostClassifier	71.93	Petr H.	ORIGFEATS, default
czech-presidental-election-easy	GaussianNB	17.54	Petr H.	ORIGFEATS, default
czech-presidental-election-hard	KNeighborsClassifier	57.02	Petr H.	ORIGFEATS, k=3
czech-presidental-election-hard	DecisionTreeClassifier	69.30	Petr H.	ORIGFEATS, max_depth=5
czech-presidental-election-hard	RandomForestClassifier	68.42	Petr H.	ORIGFEATS, max_depth=5, n_estimators=10, max_features=1
czech-presidental-election-hard	AdaBoostClassifier	68.42	Petr H.	ORIGFEATS, default
czech-presidental-election-hard	GaussianNB	17.54	Petr H.	ORIGFEATS, default
game-cheaters	KNeighborsClassifier	56.85	Petr H.	ORIGFEATS, k=3
game-cheaters	Deci