Based on a preliminary analysis of the data, we conclude that random forest classifiers are the best way to classify drop-outs. Given the class imbalance and the importance of not misclassifying drop-outs, we use Cohen's kappa as the benchmark metric to train the model.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.metrics import cohen_kappa_score, make_scorer

from imblearn.over_sampling import SMOTE

In [3]:
def compute_kappa(course_df):
    """Tune a random forest on one course's data and report Cohen's kappa.

    The frame is one-hot encoded with ``pd.get_dummies``, split 80/20 into
    train and test parts, the training part is oversampled with SMOTE, and
    ``max_features`` is tuned by repeated 10-fold cross-validation
    (3 repeats) scored with Cohen's kappa.

    Parameters
    ----------
    course_df : pandas.DataFrame
        Course data. It must still expose an ``engaged`` column after
        ``pd.get_dummies`` (used as the label); every other column is
        treated as a feature.

    Returns
    -------
    tuple
        ``(kappa_train, min_kappa_train, max_kappa_train, kappa_test)``:
        the mean cross-validated kappa of the best ``max_features``
        candidate, the lowest and highest per-split kappa of that same
        candidate, and the kappa of the refitted best model on the
        held-out test part.
    """
    data_clean = pd.get_dummies(course_df)

    labels = np.array(data_clean.engaged)
    features = np.array(data_clean.drop('engaged', axis = 1))

    features_train, features_test, labels_train, labels_test = train_test_split(features, labels,
                                                                                train_size = 0.8,
                                                                                random_state = 20130810)

    # Only the training part is oversampled; the test part keeps its
    # original class balance.
    # NOTE(review): resampling happens before cross-validation, so synthetic
    # samples may be interpolated from rows that land in a validation fold.
    # The CV kappas are therefore likely optimistic -- confirm whether the
    # resampling should move inside the CV loop.
    sm = SMOTE(random_state = 20130810)
    # `fit_resample` is the current imbalanced-learn API; the older
    # `fit_sample` alias is no longer available in recent releases.
    features_train_smote, labels_train_smote = sm.fit_resample(features_train, labels_train)

    grid_search = GridSearchCV(RandomForestClassifier(n_jobs = 3,
                                                      n_estimators = 500,
                                                      warm_start = True,
                                                      random_state = 20130810),
                               param_grid = {'max_features': [6, 8, 10, 12]},
                               cv = RepeatedKFold(n_splits = 10,
                                                  n_repeats = 3,
                                                  random_state=20130810),
                               scoring = make_scorer(cohen_kappa_score))

    grid_search.fit(features_train_smote, labels_train_smote)

    cv_result = grid_search.cv_results_
    best_index = grid_search.best_index_

    # Mean CV kappa of the selected candidate (the highest mean score).
    kappa_train = cv_result['mean_test_score'][best_index]

    # Per-split kappas of the selected candidate only. Taking the maximum
    # over all candidates for each split would mix different models and can
    # yield a "minimum" that is larger than the reported mean.
    best_split_scores = [cv_result[k][best_index] for k in cv_result
                         if 'split' in k and 'test_score' in k]

    # Cohen's kappa is symmetric in its two arguments; the conventional
    # (y_true, y_pred) order is used for readability.
    kappa_test = cohen_kappa_score(labels_test, grid_search.best_estimator_.predict(features_test))

    return kappa_train, min(best_split_scores), max(best_split_scores), kappa_test

In [4]:
# One empty list per reported column; each course later appends one value to every list.
course_metrics = {
    column: []
    for column in ("course_name",
                   "kappa_train",
                   "min_kappa_train",
                   "max_kappa_train",
                   "kappa_test")
}

### 1. CB22x - The Ancient Greek Hero

In [5]:
# Load the HarvardX CB22x (2013 Spring) course data from its feather file.
cb22x = pd.read_feather("data/HarvardX_CB22x_2013_Spring.feather")

In [6]:
%%time
# Tune and score the random forest for CB22x; the saved output records a wall time of 16min 56s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(cb22x)



Wall time: 16min 56s


In [7]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'HarvardCB22x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 2. CS50x - Introduction to Computer Science I

In [8]:
# Load the HarvardX CS50x (2012) course data from its feather file.
cs50x = pd.read_feather("data/HarvardX_CS50x_2012.feather")

In [9]:
%%time
# Tune and score the random forest for CS50x; the saved output records a wall time of 2h 27min 33s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(cs50x)



Wall time: 2h 27min 33s


In [10]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'HarvardCS50x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 3. ER22x - Justice

In [11]:
# Load the HarvardX ER22x (2013 Spring) course data from its feather file.
er22x = pd.read_feather("data/HarvardX_ER22x_2013_Spring.feather")

In [12]:
%%time
# Tune and score the random forest for ER22x; the saved output records a wall time of 34min 23s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(er22x)



Wall time: 34min 23s


In [13]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'HarvardER22x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 4. PH207x - Health in Numbers: Quantitative Methods in Clinical & Public Health Research

In [14]:
# Load the HarvardX PH207x (2012 Fall) course data from its feather file.
ph207x = pd.read_feather("data/HarvardX_PH207x_2012_Fall.feather")

In [15]:
%%time
# Tune and score the random forest for PH207x; the saved output records a wall time of 26min 51s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(ph207x)



Wall time: 26min 51s


In [16]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'HarvardPH207x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 5. PH278x - Human Health and Global Environmental Change

In [17]:
# Load the HarvardX PH278x (2013 Spring) course data from its feather file.
ph278x = pd.read_feather("data/HarvardX_PH278x_2013_Spring.feather")

In [18]:
%%time
# Tune and score the random forest for PH278x; the saved output records a wall time of 26min 33s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(ph278x)



Wall time: 26min 33s


In [19]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'HarvardPH278x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 6. 6.002x (Fall) - Circuits and Electronics

In [20]:
# Load the MITx 6.002x (2012 Fall) course data from its feather file.
mit6002x = pd.read_feather("data/MITx_6_002x_2012_Fall.feather")

In [21]:
%%time
# Tune and score the random forest for 6.002x (Fall); the saved output records a wall time of 21min 30s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit6002x)



Wall time: 21min 30s


In [22]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT6002x_Fall',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 7. 6.002x (Spring) - Circuits and Electronics

In [23]:
# Load the MITx 6.002x (2013 Spring) course data from its feather file.
# NOTE: this rebinds `mit6002x`, which previously held the 2012 Fall data;
# the Fall frame is no longer reachable under this name after this cell runs.
mit6002x = pd.read_feather("data/MITx_6_002x_2013_Spring.feather")

In [24]:
%%time
# Tune and score the random forest for 6.002x (Spring); the saved output records a wall time of 9min 47s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit6002x)



Wall time: 9min 47s


In [25]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT6002x_Spring',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 8. 14.73x - The Challenges of Global Poverty

In [26]:
# Load the MITx 14.73x (2013 Spring) course data from its feather file.
mit1473x = pd.read_feather("data/MITx_14_73x_2013_Spring.feather")

In [27]:
%%time
# Tune and score the random forest for 14.73x; the saved output records a wall time of 16min 34s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit1473x)



Wall time: 16min 34s


In [28]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT1473x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 9. 2.01x - Elements of Structures

In [29]:
# Load the MITx 2.01x (2013 Spring) course data from its feather file.
mit201x = pd.read_feather("data/MITx_2_01x_2013_Spring.feather")

In [30]:
%%time
# Tune and score the random forest for 2.01x; the saved output records a wall time of 4min 9s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit201x)



Wall time: 4min 9s


In [31]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT201x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 10. 3.091x (Fall) - Introduction to Solid State Chemistry

In [32]:
# Load the MITx 3.091x (2012 Fall) course data from its feather file.
mit3091x = pd.read_feather("data/MITx_3_091x_2012_Fall.feather")

In [33]:
%%time
# Tune and score the random forest for 3.091x (Fall); the saved output records a wall time of 6min 51s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit3091x)



Wall time: 6min 51s


In [34]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT3091x_Fall',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 11. 3.091x (Spring) - Introduction to Solid State Chemistry

In [35]:
# Load the MITx 3.091x (2013 Spring) course data from its feather file.
# NOTE: this rebinds `mit3091x`, which previously held the 2012 Fall data;
# the Fall frame is no longer reachable under this name after this cell runs.
mit3091x = pd.read_feather("data/MITx_3_091x_2013_Spring.feather")

In [36]:
%%time
# Tune and score the random forest for 3.091x (Spring); the saved output records a wall time of 4min 43s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit3091x)



Wall time: 4min 43s


In [37]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT3091x_Spring',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 12. 6.00x (Fall) - Introduction to Computer Science and Programming

In [38]:
# Load the MITx 6.00x (2012 Fall) course data from its feather file.
mit600x = pd.read_feather("data/MITx_6_00x_2012_Fall.feather")

In [39]:
%%time
# Tune and score the random forest for 6.00x (Fall); the saved output records a wall time of 40min 20s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit600x)



Wall time: 40min 20s


In [40]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT600x_Fall',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 13. 6.00x (Spring) - Introduction to Computer Science and Programming

In [41]:
# Load the MITx 6.00x (2013 Spring) course data from its feather file.
# NOTE: this rebinds `mit600x`, which previously held the 2012 Fall data;
# the Fall frame is no longer reachable under this name after this cell runs.
mit600x = pd.read_feather("data/MITx_6_00x_2013_Spring.feather")

In [42]:
%%time
# Tune and score the random forest for 6.00x (Spring); the saved output records a wall time of 39min 2s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit600x)



Wall time: 39min 2s


In [43]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT600x_Spring',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 14. 8.02x - Electricity and Magnetism

In [44]:
# Load the MITx 8.02x (2013 Spring) course data from its feather file.
mit802x = pd.read_feather("data/MITx_8_02x_2013_Spring.feather")

In [45]:
%%time
# Tune and score the random forest for 8.02x; the saved output records a wall time of 15min 44s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit802x)



Wall time: 15min 44s


In [46]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT802x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 15. 7.00x - Introduction to Biology - The Secret of Life

In [47]:
# Load the MITx 7.00x (2013 Spring) course data from its feather file.
mit700x = pd.read_feather("data/MITx_7_00x_2013_Spring.feather")

In [48]:
%%time
# Tune and score the random forest for 7.00x; the saved output records a wall time of 11min 26s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit700x)



Wall time: 11min 26s


In [49]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT700x',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

### 16. 8.MReVx - Mechanics ReView

In [50]:
# Load the MITx 8.MReV (2013 Summer) course data from its feather file.
mit8mrevx = pd.read_feather("data/MITx_8_MReV_2013_Summer.feather")

In [51]:
%%time
# Tune and score the random forest for 8.MReVx; the saved output records a wall time of 6min 10s.
best_k_train, min_k_train, max_k_train, k_test = compute_kappa(mit8mrevx)



Wall time: 6min 10s


In [52]:
# Append this course's name and its four kappa values to the matching result columns.
for _column, _value in {
    "course_name": 'MIT8MReVx',
    "kappa_train": best_k_train,
    "min_kappa_train": min_k_train,
    "max_kappa_train": max_k_train,
    "kappa_test": k_test,
}.items():
    course_metrics[_column].append(_value)

In [53]:
# Build the results table: one row per course, one column per key of `course_metrics`.
# All lists must have the same length, i.e. every course cell above must have run exactly once.
course_kappa = pd.DataFrame(course_metrics)

In [54]:
# Persist the per-course kappa table next to the input data.
course_kappa.to_feather("data/course_kappa.feather")