In [40]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

In [41]:
# Location of the dataset: a folder prefix joined with the CSV file name
infolder = './data/'
infile = ''.join([infolder, 'default_features_1059_tracks_with_continents.csv'])

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=infile)

In [42]:
# Discard the 'lat' and 'long' columns; the result is rebound to df.
# Note: drop() raises KeyError if these columns are already gone, so this
# cell is meant to run once per load of df.
df = df.drop(labels=['lat', 'long'], axis='columns')

In [43]:
# Fixed seed so the train/test partition is identical on every run
SEED = 42

# Split data into input X and output y
#   X: every column from position 1 up to (but not including) the last two
#   y: the last column
# NOTE(review): the 1:-2 slice leaves the second-to-last column out of both
# X and y -- confirm that excluding it from the features is intended.
X = df.iloc[:,1:-2]
y = df.iloc[:,-1]

# Split data into training and testing sets with 80/20 ratio.
# random_state pins the shuffle; without it every kernel restart produces a
# different split and the reported scores cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [44]:
# Baseline model: SVC with an RBF kernel and gamma='scale', fitted on the
# training split (fit() returns the estimator itself, so it can be chained)
clf1 = SVC(kernel='rbf', gamma='scale').fit(X_train, y_train)

# Print the model's score on the training data
# (for a classifier, score() is mean accuracy, not an error rate)
print(clf1.score(X_train, y_train))

0.8099173553719008


In [47]:
# Hyperparameter grid: every combination of C and gamma is tried with an RBF kernel
hyperparameters = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}

# Use 5-fold cross validation and grid search to test the hyperparameters in the
# dict; refit=True retrains the best combination on the whole training set.
# The `iid` argument is deliberately not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Current releases always behave like the former iid=False.
clf2 = GridSearchCV(SVC(), hyperparameters, cv=5, refit=True)
clf2.fit(X_train, y_train)

# Print the refit best model's score on the training data
# (mean accuracy for a classifier, not an error rate)
print(clf2.score(X_train, y_train))

# Print parameters of best model
print(clf2.best_estimator_)

0.9799291617473436
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [46]:
# Score both models on the held-out test split and print the two values on
# one line: baseline SVC (clf1) first, grid-searched SVC (clf2) second
print(*(model.score(X_test, y_test) for model in (clf1, clf2)))

0.6132075471698113 0.6367924528301887
