In [30]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [31]:
# Location of the input CSV: folder prefix followed by the file name
infolder = './data/'
infile = ''.join((infolder, 'default_features_1059_tracks_with_continents.csv'))

# Read the CSV into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=infile)

In [32]:
# Discard the 'lat' and 'long' columns; every other column is kept as-is.
# Note: `df` is rebound here, so this cell raises KeyError if it is run twice in the same kernel.
df = df.drop(['lat', 'long'], axis='columns')

In [33]:
# Split data into input X and output y.
# X takes every column except the first one and the last two; y is the last column.
# NOTE(review): the second-to-last column ends up in neither X nor y — confirm
# against the CSV layout that `1:-2` (rather than `1:-1`) is intended.
X = df.iloc[:, 1:-2]
y = df.iloc[:, -1]

# Split data into training and testing sets with 80/20 ratio.
# A fixed random_state makes the shuffle deterministic, so the same rows land in
# the train/test sets on every Restart & Run All and the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Baseline model: logistic regression with the newton-cg solver, fitted on the training split
clf1 = LogisticRegression(solver='newton-cg', multi_class='auto').fit(X_train, y_train)

# Print the mean accuracy of the baseline model on the training data
print(clf1.score(X=X_train, y=y_train))

0.6824085005903188


In [35]:
# Hyperparameter options: regularisation strength C, solver, and penalty type
hyperparameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'solver': ['newton-cg', 'liblinear'], 'penalty':['l2']}

# Use 5-fold cross validation and grid search to test every combination in the dict;
# refit=True retrains the best combination on the whole training set afterwards.
# The `iid` argument is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where supplying it raises a TypeError. From 0.22 on, the default
# behaviour matches the former iid=False.
clf2 = GridSearchCV(LogisticRegression(multi_class='auto'), hyperparameters, cv=5, refit=True)
clf2.fit(X_train, y_train)

# Print the mean accuracy of the refitted best model on the training data
print(clf2.score(X_train, y_train))

# Print parameters of best model
print(clf2.best_estimator_)

0.6458087367178277
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)


In [36]:
# Held-out comparison: mean accuracy on the test split, baseline first, then the grid-searched model
print(*(model.score(X_test, y_test) for model in (clf1, clf2)))

0.5141509433962265 0.5471698113207547
