In [14]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error

input_file = "train.csv"
output_file = "submission.txt"

# Load the training data. read_csv is comma-delimited by default and
# header=0 takes the column names from the first row.
df = pd.read_csv(input_file, header=0)
# Column names as they appear in the file, before any columns are dropped.
original_headers = list(df.columns.values)
# Drop the non-numeric columns through the public select_dtypes API instead of
# the private DataFrame._get_numeric_data(). "bool" is listed explicitly
# because "number" on its own leaves boolean columns out.
df = df.select_dtypes(include=["number", "bool"])
# Names of the columns that survived the numeric filter.
numeric_headers = list(df.columns.values)
# Plain numpy array of the numeric values for input into scikit-learn.
data = df.values

# Features: every numeric column from the third one onwards.
x = data[:, 2:]
# Target: the second numeric column, sliced as 1:2 so it stays a 2-D
# (n_samples, 1) column. Column 0 is not used -- presumably a row id; verify
# against train.csv.
y = data[:, 1:2]

n = len(data)  # number of rows in the numeric data
folds = 10  # number of cross-validation folds
lambdas = [0.01, 0.1, 1, 10, 100]  # Ridge regularisation strengths to compare


In [15]:
# Wrap mean_squared_error as a scorer so cross_validate reports per-fold MSE.
mse = make_scorer(mean_squared_error)
# Collects one average RMSE per entry of `lambdas`, in the same order.
solution = []

# Sanity check on a single 60/40 hold-out split before cross-validating.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=0
)
clf = Ridge(1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# The same split with ordinary least squares, for reference.
reg = LinearRegression()
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))

# Cross-validate with `folds` folds and average the per-fold RMSE per lambda.
clf = Ridge()
for alpha in lambdas:
    print("lambda =", alpha, ":")
    clf.set_params(alpha=alpha)
    cv_results = cross_validate(
        clf, x, y, cv=folds, scoring=mse, return_estimator=True
    )

    # return_estimator=True keeps the estimator fitted on each fold so its
    # coefficients can be inspected.
    print("Coefficients:")
    for fold_model in cv_results['estimator']:
        print(fold_model.coef_)

    print("Test MSEs:")
    MSEs = cv_results['test_score']
    RMSEs = np.sqrt(MSEs)
    print("MSEs:", MSEs)
    print("RMSEs:", RMSEs)

    # Compute the mean once; it is both reported and stored.
    mean_rmse = RMSEs.mean()
    print("Average RMSE:", mean_rmse)
    print()

    solution.append(mean_rmse)

0.6639584854705927
0.6696740912486652
lambda = 0.01 :
Coefficients:
[[-1.13420792e-01  3.61779729e-02  6.39812676e-03  2.20046137e+00
  -1.58207058e+01  4.26312545e+00 -1.02129887e-02 -1.34528676e+00
   2.70319222e-01 -1.16372628e-02 -1.01673938e+00  9.91537722e-03
  -4.44710305e-01]]
[[-1.08099461e-01  5.60494564e-02  2.19502537e-02  2.79264344e+00
  -1.85041633e+01  3.23752441e+00  8.90081708e-03 -1.54014437e+00
   2.82436189e-01 -1.17736261e-02 -9.72562311e-01  7.70698938e-03
  -5.79284098e-01]]
[[-1.06666763e-01  4.53501029e-02  1.93076194e-02  2.82217596e+00
  -1.75658243e+01  3.89283271e+00 -3.95103704e-03 -1.55419559e+00
   3.17460642e-01 -1.26284752e-02 -9.35105969e-01  8.30246125e-03
  -5.39033090e-01]]
[[-1.02241810e-01  5.57426792e-02  5.93855883e-02  2.62447987e+00
  -1.95989824e+01  3.46935002e+00  3.97630833e-03 -1.53368242e+00
   3.13002034e-01 -1.25613589e-02 -9.96781005e-01  9.08018315e-03
  -5.54439136e-01]]
[[-1.19140238e-01  4.62432148e-02  1.59370720e-02  3.3490950

In [17]:
# Write the collected average RMSEs to the submission file, one value per
# line, formatted with two decimal places.
np.savetxt(fname=output_file, X=solution, fmt="%.2f")