In [None]:
# Downloads the files to the
# Colab instance so I don't have to upload them here.

import requests

def save_file(url, file_name, timeout=60):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch with an HTTP GET request.
        file_name: Local path the downloaded bytes are written to
            (opened in binary mode, overwriting any existing file).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving the error page to disk
    # as if it were the requested file.
    r.raise_for_status()
    with open(file_name, 'wb') as f:
        f.write(r.content)

# Fetch the training and test CSVs into the working directory
for source_url, local_name in [
    ('https://courses.cs.washington.edu/courses/cse416/21sp/homework/hw6/edx_train.csv',
     'edx_train.csv'),
    ('https://courses.cs.washington.edu/courses/cse416/21sp/homework/hw6/edx_test.csv',
     'edx_test.csv'),
]:
    save_file(source_url, local_name)



The following cell creates new dataframes for the given train and test sets containing the original non-categorical columns along with the one-hot-encoded versions of the categorical columns. The new test set contains the same columns as the new train set so that the models can make predictions on it; the models require the test set to have the same number of columns as the train set in order to make predictions. Furthermore, I decided to use all features except for the userid_DI, course_id, start_time_DI, and last_event_DI columns because these four columns are just used for identification purposes. Ideally, they should not have anything to do with being certified (the target) since they act as identifiers that don't relate to the possible ability/potential of a person. The original train and test sets contained missing values, so for the categorical columns I filled these values with the string 'NaN', and for the non-categorical columns I filled these values with 0. Initially, I had also dropped the columns that contained missing values, but this got rid of almost half of the total number of columns, including ones that at a glance appeared to be at least somewhat important. As a result, I decided to fill the missing values of these columns with actual values, as mentioned earlier, and found that doing so gave stronger prediction accuracies.

In [None]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Load the raw training and test tables from the downloaded CSVs

edx_train = pd.read_csv('edx_train.csv')
edx_test = pd.read_csv('edx_test.csv')

# Remove the columns (features) that seemed unimportant from both tables.
# They are used for identification purposes only:
#   userid_DI                    - only identifies a row
#   course_id                    - only denotes the course for a row
#   start_time_DI, last_event_DI - dates
# The list is defined once so both tables lose exactly the same columns.

identifier_columns = ['userid_DI', 'course_id', 'start_time_DI', 'last_event_DI']
edx_train = edx_train.drop(columns=identifier_columns)
edx_test = edx_test.drop(columns=identifier_columns)

# Stores all categorical features

categorical_features = ['final_cc_cname_DI','LoE_DI','gender','grade']

# Fills missing values in the categorical columns with the string 'NaN' in the
# training and test sets, then fills the remaining missing values (the
# non-categorical columns) with 0.
#
# The categorical fill goes through a column -> value mapping on the frame itself
# rather than `frame[col].fillna('NaN', inplace = True)`. That in-place call operates
# on an intermediate Series (chained assignment): pandas 2.x warns about it, and under
# copy-on-write it leaves the frame unchanged, so the following fillna(0) would put 0
# into the categorical columns instead of 'NaN'.

categorical_fill = {column: 'NaN' for column in categorical_features}

edx_train = edx_train.fillna(categorical_fill).fillna(0)
edx_test = edx_test.fillna(categorical_fill).fillna(0)

# The target column ('certified') wouldn't be present in the test set, so a placeholder
# column of zeros is inserted in its place (position 3).
# This gives the test set the same number of columns as the training set;
# the placeholder is not looked at when determining predictions.

edx_test.insert(loc=3, column='certified', value=0)

# Separate each dataset into its categorical and non-categorical columns so that
# one-hot-encoding only affects the categorical features

edx_train_categorical = edx_train[categorical_features]
edx_train_not_categorical = edx_train.drop(columns=categorical_features)

edx_test_categorical = edx_test[categorical_features]
edx_test_not_categorical = edx_test.drop(columns=categorical_features)

# One-hot-encodes the categorical features of the training and test sets.
# The encoder is fitted on the training set only, so both sets end up with the same
# one-hot-encoded columns; handle_unknown = 'ignore' keeps the transform from failing
# on a category that appears only in the test set.

encoder = OneHotEncoder(handle_unknown = 'ignore')
encoder.fit(edx_train_categorical)
edx_train_categorical = encoder.transform(edx_train_categorical).toarray()
edx_test_categorical = encoder.transform(edx_test_categorical).toarray()

# Converts the one-hot-encoded arrays back to DataFrames whose column names are derived
# from the original categorical column names.
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out and fall back to the old method only on older releases.

if hasattr(encoder, 'get_feature_names_out'):
    column_names = encoder.get_feature_names_out(categorical_features)
else:
    column_names = encoder.get_feature_names(categorical_features)

# The original row index is reused so the later column-wise concat lines rows up correctly
edx_train_categorical = pd.DataFrame(edx_train_categorical, columns=column_names, index=edx_train.index)
edx_test_categorical = pd.DataFrame(edx_test_categorical, columns=column_names, index=edx_test.index)

# Rebuild each dataset (training and test) by placing the one-hot-encoded categorical
# columns next to the untouched non-categorical columns.
# The resulting training and test sets have their categorical features one-hot-encoded.

edx_train = pd.concat([edx_train_categorical, edx_train_not_categorical], axis='columns')
edx_test = pd.concat([edx_test_categorical, edx_test_not_categorical], axis='columns')

# Name of the target column; every other column is an input feature

target = 'certified'
features = edx_train.columns.tolist()
features.remove(target)

The following cell creates a train and validation set from the original train set, which will be used by the models

In [None]:
# Splits training set into a new train set and validation set
# Validation set is 20% of the original train set
# A fixed random_state makes the shuffle deterministic, so the split and every
# accuracy computed from it are the same on each run of the notebook

from sklearn.model_selection import train_test_split

train_data, validation_data = train_test_split(edx_train, test_size=0.2, random_state=42)

The following cell creates a majority class classifier that essentially predicts the label for each row as the most frequent label in the training dataset.

In [None]:
# Majority class classifier: "training" is finding the most frequent label in the
# training set, and "labelling" assigns that label to every row.
# Its accuracy on a dataset is the fraction of rows that carry that label;
# the training and validation accuracies are stored below.

majority_label = train_data['certified'].mode()[0]

train_is_majority = train_data['certified'] == majority_label
train_num_total = len(train_data)
train_num_majority_label = int(train_is_majority.sum())
majority_classifier_train_accuracy = train_num_majority_label / train_num_total

validation_is_majority = validation_data['certified'] == majority_label
validation_num_total = len(validation_data)
validation_num_majority_label = int(validation_is_majority.sum())
majority_classifier_validation_accuracy = validation_num_majority_label / validation_num_total



The following cell creates a decision tree model with the optimal minimum number of samples in a leaf out of 1, 10, 50, 100, 200, and 300 and max depth out of 1, 5, 10, 15, 20, 25, 30, 35, and 40. 

In [None]:
# Creates a decision tree classifier called 'decision_tree' that utilizes GridSearch using 6-fold validation
# to find the optimal minimum number of samples in a leaf and optimal maximum depth
# The tree is given a fixed random_state so the search picks the same
# hyperparameters and produces the same accuracies on every run

hyperparameters = {'min_samples_leaf': [1,10,50,100,200,300],
                   'max_depth': [1,5,10,15,20,25,30,35,40]}
decision_tree = GridSearchCV(DecisionTreeClassifier(random_state=42),
                             cv=6,
                             param_grid=hyperparameters,
                             return_train_score=True)
decision_tree.fit(train_data[features],train_data[target])

# Stores the train and validation accuracies of the decision tree called 'decision_tree'
# accuracy_score is documented as (y_true, y_pred), so the true labels are passed first

decision_tree_train_accuracy = accuracy_score(train_data[target],
                                              decision_tree.predict(train_data[features]))
decision_tree_val_accuracy = accuracy_score(validation_data[target],
                                            decision_tree.predict(validation_data[features]))

# Prints the max depth and minimum number of samples in a leaf

print(decision_tree.best_params_)

{'max_depth': 30, 'min_samples_leaf': 1}


The following cell prints the training and validation accuracies of the majority class classifier and decision tree. The decision tree model had the highest validation accuracy, making it the model I used for my Kaggle submission. 

In [None]:
# Reports, for each model, its name followed by its training accuracy and then its
# validation accuracy: first the majority class classifier, then the Decision Tree
# model called 'decision_tree' that used GridSearch

accuracy_report = [
    ('Majority Class Classifier',
     majority_classifier_train_accuracy,
     majority_classifier_validation_accuracy),
    ('Decision Tree using Grid Search',
     decision_tree_train_accuracy,
     decision_tree_val_accuracy),
]

for model_name, train_accuracy, validation_accuracy in accuracy_report:
    print(model_name)
    print(train_accuracy)
    print(validation_accuracy)


Majority Class Classifier
0.5585212674850129
0.545662100456621
Decision Tree using Grid Search
0.9968598344276335
0.9920091324200914


The following cell creates a new dataframe for submission containing the userid_DI column from the original test set and a predictions column containing predictions for the test set from my decision tree model. The new dataframe is then converted to a CSV file which I then downloaded and submitted to Kaggle. 

In [None]:
# Predict the test set with the decision tree model and convert the predictions to INT

predictions = decision_tree.predict(edx_test[features]).astype(int)

# Build the submission dataframe: the userid_DI column read back from the test CSV,
# plus a 'certified' column holding the predictions.
# Then save the dataframe to a csv file without the index.

df_test = pd.read_csv('edx_test.csv')
to_save = df_test[['userid_DI']].assign(certified=predictions)
to_save.to_csv('submission.csv', index=False)