In [9]:
import numpy as np
import pandas as pd

import os
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Loads dataset & processes it:
# - fills NA data
# - processes categorical data so that categories from both train&test are known
def load_dataset(dataset, drop_columns=None):
    df_train = pd.read_csv("./2019-npfl104-shared/data/"+dataset+"/train.txt.gz", header=None)
    df_test = pd.read_csv("./2019-npfl104-shared/data/"+dataset+"/test.txt.gz", header=None)

    train_size = len(df_train)
    df_tog = df_train.append(df_test)

    # Convert to categorical
    for col in df_tog.columns[np.where(df_tog.dtypes == 'object')]:
        df_tog[col] = pd.Categorical(df_tog[col])

    # Drop too unique columns e.g. ids
    for col in df_tog.columns:
        idlike_col = []
        if df_tog[col].nunique() > 0.6 * len(df_tog):
            idlike_col.append(col)
    df_tog = df_tog.drop(idlike_col, axis=1)
        
    # Explicitely drop specified columns
    if drop_columns:
        df_tog = df_tog.drop(drop_columns, axis=1)

    df_train, df_test = df_tog[:train_size], df_tog[train_size:]
    
    df_train = df_train.fillna(df_train.mode().iloc[0])
    df_test = df_test.fillna(df_test.mode().iloc[0])
    
    return df_train, df_test

# Used to split dataframe to features & target (last column)
def get_X(df):
    return pd.get_dummies(df[df.columns[:-1]], dummy_na=True)
def get_Y(df):
    dfc = df[df.columns[-1]]
    return dfc.cat.codes if dfc.dtype.name == "category" else dfc


dftr, dfte = load_dataset("pamap-easy")


In [3]:
classifiers = [
    (SVC(kernel="linear", C=1, gamma='scale'), "SVC", "l"),
    (SVC(kernel="poly", C=1, gamma='scale'), "SVC", "p"),
    (SVC(kernel="rbf", C=1, gamma='scale'), "SVC", "p"),
]

In [4]:
for cls in classifiers:
    model, cls_name, cls_args = cls
        
    score = cross_val_score(model, get_X(dftr), get_Y(dftr), cv=5)  
    print(score)

[0.93932695 0.94103194 0.94810625 0.94609894 0.93868505]
[0.93146647 0.93857494 0.944909   0.94806793 0.9411475 ]
[0.88970769 0.87813268 0.8792425  0.88038395 0.88278749]


In [6]:
parameters = {
    'kernel':['rbf'], 
    'C':[0.1, 1, 10, 100, 1000], 
    'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]
}

In [15]:
model = SVC(gamma='scale')
gs = GridSearchCV(model, parameters, cv=3, n_jobs=-2, return_train_score=True)

res = gs.fit(get_X(dftr), get_Y(dftr))

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-2,
       param_grid={'kernel': ['rbf'], 'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001, 0.0001, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)