In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

First, let's read the two datasets and merge them into a single dataframe.

In [None]:
# Load the two source tables.
df_mat = pd.read_csv("../input/student-mat.csv")
df_por = pd.read_csv("../input/student-por.csv")

# Tag every row with the table it came from, so the origin survives the merge
# (it is one-hot encoded together with the other categorical columns later).
df_mat["domain"] = "mat"
df_por["domain"] = "por"

# Stack both tables into a single dataframe.
df = pd.concat([df_mat, df_por], ignore_index=True)

# Shuffle the rows with a fixed seed. The train/test split further down uses
# shuffle=False, so this row order decides which rows land in the test set;
# without a seed every run would evaluate on different rows.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Every column except Walc, Dalc and the three G* columns is used as a
# feature; G3 is the regression target (kept as a single-column dataframe).
excluded_columns = ["Walc", "Dalc", "G1", "G2", "G3"]
df_x_raw = df.drop(columns=excluded_columns)
df_y = df.loc[:, ["G3"]]
df_x_raw.head()

In [None]:
# One-hot encode the non-numeric columns; drop_first=True omits the first
# level of each encoded column so the dummy columns are not redundant.
df_x_onehot = pd.get_dummies(data=df_x_raw, drop_first=True)
df_x_onehot.head(5)

In [None]:
from sklearn import preprocessing

# Rescale every encoded feature column with MinMaxScaler (default range) and
# wrap the result back into a dataframe with the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split, so the test rows influence the scaling - consider fitting on the
# training rows only.
min_max_scaler = preprocessing.MinMaxScaler()
x = df_x_onehot.values  # plain numpy array of the encoded features
x_scaled = min_max_scaler.fit_transform(x)
df_x = pd.DataFrame(data=x_scaled, columns=df_x_onehot.columns)
df_x.head(5)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Positional split (shuffle=False): the last 33% of the rows become the test
# set. The target is flattened to 1-D because df_y.values is an (n, 1) column
# vector; passing that to the regressors makes scikit-learn emit
# DataConversionWarning (SVR, RandomForest) and yields 2-D predictions for
# KNeighborsRegressor.
X_train, X_test, y_train, y_test = train_test_split(
    df_x.values, df_y.values.ravel(), test_size=0.33, shuffle=False)

In [None]:
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, mean_absolute_error

# Method used for getting a regression score
def regression_score(clf):
    """Print R2, explained variance, MSE and MAE of ``clf`` on the test split.

    ``clf`` must expose ``predict``; predictions are made once on the
    module-level ``X_test`` and compared against ``y_test``. One line is
    printed per metric as ``<metric name>:<TAB><value>``.
    """
    predictions = clf.predict(X_test)
    metrics = (r2_score, explained_variance_score, mean_squared_error, mean_absolute_error)
    for score_fn in metrics:
        value = score_fn(y_test, predictions)
        print("{}:\t{}".format(score_fn.__name__, value))
        
def get_score(clf, metric):
    """Return ``metric(y_test, predictions)`` for ``clf`` on the test split.

    ``metric`` is any callable taking ``(y_true, y_pred)``; the predictions
    come from ``clf.predict`` applied to the module-level ``X_test``.
    """
    return metric(y_test, clf.predict(X_test))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, make_scorer, mean_squared_error

def MSE(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``."""
    return mean_squared_error(y_true, y_pred)

def R2(y_true, y_pred):
    """Return the R2 (coefficient of determination) score of ``y_pred``."""
    return r2_score(y_true, y_pred)

# Scorer handed to GridSearchCV: MSE is an error, so it is registered with
# greater_is_better=False (lower values are better).
my_scorer = make_scorer(score_func=MSE, greater_is_better=False)

def grid_search(clf, params, cv=5):
    """Run a cross-validated grid search for ``clf`` on the training split.

    Parameters
    ----------
    clf : estimator to tune.
    params : parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    cv : value passed to ``GridSearchCV`` as ``cv`` (default 5).

    Returns
    -------
    The ``GridSearchCV`` object after fitting on the module-level
    ``X_train`` / ``y_train``, scored with ``my_scorer``.
    """
    search = GridSearchCV(
        estimator=clf,
        param_grid=params,
        cv=cv,
        scoring=my_scorer,
    )
    search.fit(X_train, y_train)
    return search

In [None]:
from sklearn.linear_model import ElasticNet

# Search grid for ElasticNet.
# The alpha grid is strictly positive: alpha == 0 removes the penalty entirely
# and scikit-learn advises against fitting ElasticNet/Lasso that way for
# numerical reasons (the resulting warnings are hidden by the global
# warnings filter). A log-spaced grid from 1e-3 to 1 keeps 20 candidates and
# the same upper bound while still covering near-unregularised fits.
params = {
    "alpha": np.logspace(-3, 0, 20),
    "l1_ratio": np.linspace(0, 1, 20),
    "fit_intercept": [True, False],
}
clf_en = grid_search(ElasticNet(), params, 5)

# Report test-set metrics and the selected hyper-parameters.
regression_score(clf_en)
print(clf_en.best_params_)

In [None]:
from sklearn.svm import SVR

# Search grid for SVR: four C values (1 .. 1000) and five log-spaced gamma
# values (1e-2 .. 1e2), evaluated with 10-fold cross-validation.
params = dict(
    C=[1e0, 1e1, 1e2, 1e3],
    gamma=np.logspace(-2, 2, 5),
)
clf_svr = grid_search(SVR(), params, cv=10)

# Report test-set metrics and the selected hyper-parameters.
regression_score(clf_svr)
print(clf_svr.best_params_)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Search grid for the random forest: max_depth in {5, 15, 25} and
# n_estimators in {100, 400, 700}.
params = {
    "max_depth": np.arange(5, 35, 10),
    "n_estimators": np.arange(100, 1000, 300),
}
# A fixed random_state makes the forest (and therefore the selected
# parameters and the reported scores) reproducible across runs.
clf_rfr = grid_search(RandomForestRegressor(random_state=42), params, 5)

# Report test-set metrics and the selected hyper-parameters.
regression_score(clf_rfr)
print(clf_rfr.best_params_)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Search grid for k-nearest neighbours: k = 2, 7, 12, ..., 97.
params = dict(n_neighbors=np.arange(2, 100, 5))

clf_knn = grid_search(KNeighborsRegressor(), params, cv=5)

# Report test-set metrics and the selected hyper-parameters.
regression_score(clf_knn)
print(clf_knn.best_params_)

In [None]:
def r2s(x):
    """Return the test-set R2 score of the fitted search ``x``."""
    return get_score(x, r2_score)


def maes(x):
    """Return the test-set mean absolute error of the fitted search ``x``."""
    return get_score(x, mean_absolute_error)


def bp(x):
    """Return the best parameter combination found by the fitted search ``x``."""
    return x.best_params_


# One (label, fitted search) pair per model, so labels and results cannot get
# out of sync the way four hand-maintained parallel lists can.
fitted_models = [
    ("ElasticNet", clf_en),
    ("SVR", clf_svr),
    ("RandomForestRegressor", clf_rfr),
    ("KNN", clf_knn),
]

# Comparison table, best R2 first.
scores = pd.DataFrame.from_dict({
    "model": [label for label, model in fitted_models],
    "best_params": [bp(model) for label, model in fitted_models],
    "R2": [r2s(model) for label, model in fitted_models],
    "MAE": [maes(model) for label, model in fitted_models],
}).sort_values(by=["R2"], ascending=False)

In [None]:
# Print the comparison table as plain text.
scores_text = scores.to_string()
print(scores_text)