In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [4]:
College = pd.read_csv('College.csv')
College.Private.replace({'Yes':1, 'No':0}, inplace=True)
y = College.Apps
# The third column is the response, no. of applications
# The fist column, the university name, is dropped
idx = [x for x in range(1, 19) if x != 2]

X = College.iloc[:, idx].to_numpy()
y = College.Apps.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [16]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train, y_train)
print("R^2 score: {}".format(reg.score(X_test, y_test)))

R^2 score: 0.9082978945824837


In [34]:
from sklearn.linear_model import RidgeCV

alphas = np.linspace(0.5, 5, 20)
reg = RidgeCV(alphas=alphas).fit(X_train, y_train)

print("R^2 score: {}".format(reg.score(X_test, y_test)))
print("alpha picked: {}".format(reg.alpha_))
# R^2 is improved slightly

R^2 score: 0.9084544694236504
alpha picked: 0.9736842105263157


In [36]:
from sklearn.linear_model import LassoCV

alphas = np.linspace(0.5, 10, 20)
reg = LassoCV(alphas=alphas).fit(X_train, y_train)

print("R^2 score: {}".format(reg.score(X_test, y_test)))
print("alpha picked: {}".format(reg.alpha_))
# R^2 is improved further

R^2 score: 0.909217952328365
alpha picked: 9.0




In [45]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# The grid search for composite model is found in the following link:
# https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
param_grid = {'pca__n_components': np.arange(1, 18)}

pipe = make_pipeline(PCA(), LinearRegression())

search = GridSearchCV(pipe, param_grid, cv=5, iid=False)
search.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'pca__n_components': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [47]:
search.best_params_
search.best_score_
# TODO: plot R^2 score vs n_components

0.8966118607574872