This notebook uses the `student-por.csv` file from the UCI [Student Performance dataset](https://archive.ics.uci.edu/ml/datasets/Student+Performance).

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Base location of the pinned gist revision; file names are appended to it
# with a '/' separator when the data is loaded.
url = (
    'https://gist.githubusercontent.com/DanielKerrigan/9cb6d175a6d6b966d99f8995a2578688/raw/aa0c86615fa060299eada78b96b78c1eec084f1b'
)

In [None]:
# Load `student-por.csv` (semicolon-separated) and shuffle its rows.
# The shuffle is seeded so the train/test split below -- and therefore every
# score and the exported predictions -- is identical on each run.
df_all = (
    pd.read_csv(f'{url}/student-por.csv', sep=';')
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# Drop the G1 and G2 columns and use G3 as the prediction target.
df_all = df_all.drop(columns=['G1', 'G2']).rename(columns={'G3': 'label'})

# Bin the numeric target into five ordered letter grades:
#   [0, 9] -> F, (9, 11] -> D, (11, 13] -> C, (13, 15] -> B, (15, 20] -> A
# `include_lowest=True` closes the first interval on the left so that a
# value of 0 is labelled 'F' instead of becoming NaN.
df_all['label'] = pd.cut(
    df_all['label'],
    [0, 9, 11, 13, 15, 20],
    labels=['F', 'D', 'C', 'B', 'A'],
    include_lowest=True,
)

# First half of the shuffled rows is the training set, the rest is the test
# set. Copies are taken so later column assignments do not touch `df_all`.
num_train = len(df_all) // 2
df_train = df_all.iloc[:num_train].copy()
df_test = df_all.iloc[num_train:].copy()

In [None]:
# Show the (rows, columns) shape of the train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Row counts of the two splits, reused by the sanity checks and accuracy
# computations further down.
num_train, num_test = len(df_train), len(df_test)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Preview the first five test rows.
df_test.head(n=5)

In [None]:
# One-hot encode every non-label column of the full frame in one pass, so the
# train and test slices end up with the same dummy columns. `drop_first=True`
# omits the first level of each encoded column. The label column is then
# re-attached as the last column.
df_one_hot = pd.get_dummies(
    df_all.drop(columns='label'),
    drop_first=True,
).assign(label=df_all['label'])

# Positional split: the first `num_train` rows are train, the rest are test.
df_train_one_hot = df_one_hot.iloc[:num_train]
df_test_one_hot = df_one_hot.iloc[num_train:]

In [None]:
# Sanity check: the encoded splits have the expected row counts.
(len(df_train_one_hot), len(df_test_one_hot)) == (num_train, num_test)

In [None]:
# Preview the first five rows of the encoded training frame.
df_train_one_hot.head(n=5)

In [None]:
# Training target (the 'label' column) and training features (every other
# column), both taken as the frames' underlying array values.
y_train = df_train_one_hot['label'].values
X_train = df_train_one_hot.drop(columns='label').values

In [None]:
# Test target (the 'label' column) and test features (every other column),
# both taken as the frames' underlying array values.
y_test = df_test_one_hot['label'].values
X_test = df_test_one_hot.drop(columns='label').values

In [None]:
# Sanity check: the feature arrays have the expected row counts.
(len(X_train), len(X_test)) == (num_train, num_test)

In [None]:
# Hyper-parameter grid for the extra-trees classifier:
# 6 * 2 * 4 * 5 = 240 candidate settings, each scored with 5-fold
# cross-validation (1200 fits, plus one final refit on all training rows).
parameters = {
    'n_estimators': [16, 32, 64, 128, 256, 512],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [8, 16, 32, 64],
    'max_features': [2, 4, 8, 16, 32],
}

# Exhaustive search selecting the setting with the best mean CV accuracy.
# - `random_state` seeds the classifier so `best_params_` and all reported
#   scores are reproducible between runs.
# - `n_jobs=-1` spreads the independent fits over all available CPU cores.
# - `refit=True` retrains the best setting on the whole training set, which
#   is what lets `clf.predict` be called afterwards.
clf = GridSearchCV(
    ExtraTreesClassifier(random_state=0),
    parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(clf.best_score_, clf.best_params_, sep='\n')

In [None]:
# Predict a label for every training row with the fitted search object.
train_preds = clf.predict(X_train)

In [None]:
# Training accuracy: number of exact label matches over the training row count.
np.sum(train_preds == y_train) / num_train

In [None]:
# Predict a label for every held-out test row with the fitted search object.
test_preds = clf.predict(X_test)

In [None]:
# Test accuracy: number of exact label matches over the test row count.
np.sum(test_preds == y_test) / num_test

In [None]:
# Store the predicted labels next to the original (un-encoded) test rows
# in a 'prediction' column.
df_test['prediction'] = test_preds

In [None]:
# Export the test frame to CSV without the index column. Any float values
# are written with zero decimal places ('%.f').
df_test.to_csv(
    'student-performance.csv',
    index=False,
    float_format='%.f',
)