In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Definition of some helper functions that are useful later

def train_classifier(classifier, param_grid, cv, data):
    """Fit a cross-validated grid search and return its best estimator.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator passed to ``GridSearchCV`` as ``estimator``.
    param_grid : dict
        Hyper-parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    cv : int or cross-validation splitter
        Passed to ``GridSearchCV`` as ``cv``.
    data : tuple
        Two-element ``(X_train, y_train)`` pair the search is fitted on.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search object.
    """
    features, targets = data

    search_options = {
        'estimator': classifier,
        'param_grid': param_grid,
        'cv': cv,
    }
    search = GridSearchCV(**search_options)
    search.fit(features, targets)

    return search.best_estimator_

def evaluate_classifier(classifier, data):
    """Print validation and training metrics for an already fitted classifier.

    Parameters
    ----------
    classifier : fitted estimator
        Object exposing ``predict``.
    data : tuple
        Four-element ``(X_train, X_val, y_train, y_val)`` tuple.

    Prints, in order: the classification report on the validation split,
    the validation accuracy, and the training accuracy. Returns ``None``.
    """
    train_features, val_features, train_labels, val_labels = data

    # Validation predictions are requested first, then training predictions.
    val_predictions = classifier.predict(val_features)
    train_predictions = classifier.predict(train_features)

    print(classification_report(val_labels, val_predictions))

    accuracy_sections = (
        ('Accuracy score (validation data): ', val_labels, val_predictions),
        ('Accuracy score (train data): ', train_labels, train_predictions),
    )
    for heading, truth, predicted in accuracy_sections:
        print(heading, accuracy_score(truth, predicted))
    
# Outputs test data predictions to csv for upload to Kaggle.    
def testdata_submission(classifier, test_data):
    submission = pd.DataFrame(index = test_data.index)
    submission['type'] = classifier.predict(test_data)
    submission.to_csv('submission.csv')

In [None]:
# Data loading and pre-processing
train_df = pd.read_csv('train.csv', index_col = 'id')
test_df = pd.read_csv('test.csv', index_col = 'id')

# Convert categorical color feature to one hot encoding
train_df = pd.get_dummies(train_df, columns = ['color'])
test_df = pd.get_dummies(test_df, columns = ['color'])

# The two files are one-hot encoded independently, so a color that appears in
# only one of them would leave the frames with different columns and break
# prediction on the test set. Force the test frame onto the training feature
# columns (same set, same order); dummy columns absent from the test file are
# filled with 0.
feature_columns = train_df.columns.drop('type')
test_df = test_df.reindex(columns = feature_columns, fill_value = 0)

# Split training data into training and validation sets.
# A fixed random_state keeps the split (and every score computed from it)
# identical across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(['type'], axis = 1),
    train_df['type'],
    test_size = 0.25,
    random_state = 42,
)

In [None]:
# Registry of fitted models, keyed by a short model name.
clfs = {}

# Logistic regression
# The grid searches both L1 and L2 penalties, so the solver has to support both.
# The default solver in scikit-learn >= 0.22 ('lbfgs') only handles L2, which
# makes every 'l1' candidate fail to fit; 'saga' handles both penalties.
# saga converges slowly when regularisation is weak (large C), hence the raised
# max_iter, and it is stochastic, hence the fixed random_state.
param_grid = {'C': [1, 10, 100, 1000, 10000, 100000, 1000000], 'penalty': ['l1', 'l2']}
classifier = LogisticRegression(solver = 'saga', max_iter = 5000, random_state = 42)
clfs['logreg'] = train_classifier(classifier, param_grid, 5, (X_train, y_train))

evaluate_classifier(clfs['logreg'], (X_train, X_val, y_train, y_val))
testdata_submission(clfs['logreg'], test_df)

In [None]:
# Decision tree
# The grid is empty, so no hyper-parameters are tuned; the fit still goes
# through the grid-search helper with 5-fold cross validation.
param_grid = {}
# Tree construction is randomised; a fixed random_state makes the fitted tree
# and its scores reproducible on re-run.
classifier = DecisionTreeClassifier(random_state = 42)
clfs['dt'] = train_classifier(classifier, param_grid, 5, (X_train, y_train))

evaluate_classifier(clfs['dt'], (X_train, X_val, y_train, y_val))
testdata_submission(clfs['dt'], test_df)

In [None]:
# Random forest
# Only the number of trees is searched.
param_grid = {'n_estimators': [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]}
# Bootstrap sampling and feature sub-sampling are random; without a fixed
# random_state the selected n_estimators and the reported scores change from
# run to run.
classifier = RandomForestClassifier(random_state = 42)
clfs['rf'] = train_classifier(classifier, param_grid, 5, (X_train, y_train))

evaluate_classifier(clfs['rf'], (X_train, X_val, y_train, y_val))
testdata_submission(clfs['rf'], test_df)

In [None]:
# Linear Discriminant Analysis
# Nothing is tuned here (the grid is empty); the fit is still routed through
# the grid-search helper with 5-fold cross validation.
classifier = LinearDiscriminantAnalysis()
param_grid = {}
clfs['lda'] = train_classifier(
    classifier,
    param_grid,
    5,
    (X_train, y_train),
)

# Report validation/train metrics, then write the test-set predictions.
evaluate_classifier(
    clfs['lda'],
    (X_train, X_val, y_train, y_val),
)
testdata_submission(clfs['lda'], test_df)

In [None]:
# Bagging
# The grid is empty, so the default BaggingClassifier configuration is used;
# the fit still goes through the grid-search helper with 5-fold cross validation.
param_grid = {}
# Bagging draws random bootstrap samples; a fixed random_state makes the
# ensemble and its scores reproducible on re-run.
classifier = BaggingClassifier(random_state = 42)
clfs['bagging'] = train_classifier(classifier, param_grid, 5, (X_train, y_train))

evaluate_classifier(clfs['bagging'], (X_train, X_val, y_train, y_val))
testdata_submission(clfs['bagging'], test_df)