In [None]:
import numpy as np
import util
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# Reproducibility: a single seed drives NumPy's global RNG and the split below.
seed = 229
np.random.seed(seed)

# util.fetch_data returns four values, unpacked here as header, ids, X, Y.
trainfile = 'data/train.csv'
header, ids, X, Y = util.fetch_data(trainfile)

# Hold out 20% of the rows; the held-out part is used by the evaluation cells.
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Ridge: Training

In [None]:
# Candidate regularization strengths: 100 log-spaced values from 1e-5 to 1e5.
alphalist = np.logspace(-5, 5, 100)

# RidgeCV selects the alpha that scores best under util.gini_scorer.
# NOTE(review): the search runs on all of X, Y, which includes the rows held
# out as X_test -- confirm that this is intended.
ridge_model = RidgeCV(scoring=util.gini_scorer, alphas=alphalist)
ridge_model.fit(X, Y)

In [None]:
# RidgeCV has no `best_estimator_` attribute (that belongs to GridSearchCV);
# the fitted RidgeCV object is itself the model refit with the selected alpha,
# so it is persisted directly. The `with` block closes the file even if the
# dump raises.
with open('models/ridge.pickle', 'wb') as ridge_out:
    pickle.dump(ridge_model, ridge_out)

# Ridge: Evaluation

In [None]:
# Pickle data is binary, so the file must be opened with 'rb' (text mode fails
# on Python 3); the context manager also closes the handle instead of leaking it.
# NOTE: only unpickle files this project wrote itself -- pickle.load can
# execute arbitrary code from an untrusted file.
with open('models/ridge.pickle', 'rb') as ridge_in:
    ridge_model = pickle.load(ridge_in)

In [None]:
# Compute the learning-curve data for the ridge model on the train/test split.
ridge_curves = util.learning_curves(ridge_model, X_train, X_test, y_train, y_test)
trainsizes, traingini, testgini = ridge_curves

# Print the train series, a blank line, then the test series.
for series in (traingini, '', testgini):
    print(series)

In [None]:
# Plot train and test Gini against training-set size and save the figure as EPS.
fig, ax = plt.subplots()
ax.plot(trainsizes, traingini, label='train gini')
ax.plot(trainsizes, testgini, label='test gini')
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('Ridge regression')
ax.legend()
fig.savefig('figures/learning_curves/ridge.eps', format='eps', dpi=1000)

In [None]:
# Display any open matplotlib figures (here, the ridge learning-curve plot).
plt.show()

# Ridge: Output

In [None]:
# Refit on the full data set and report the Gini score on that same data
# (an in-sample figure, not a held-out estimate).
ridge_model.fit(X, Y)
ridge_full_gini = util.gini_scorer(ridge_model, X, Y)
print(ridge_full_gini)

In [None]:
# Hand the ridge model to util.make_prediction together with the test-set path
# and an output path (presumably it reads the former and writes predictions to
# the latter -- confirm in util).
util.make_prediction(ridge_model, 'data/test.csv', 'predictions/ridge.csv')

# Lasso: Training

In [None]:
# Lasso accepts no `n_jobs` argument (passing it raises TypeError); the
# parallelism belongs on the grid search, which can fit candidates concurrently.
# NOTE(review): the search runs on all of X, Y, which includes the rows held
# out as X_test -- confirm that this is intended.
lasso_model = Lasso()
c_validator = GridSearchCV(lasso_model, param_grid={'alpha': alphalist},
                           scoring=util.gini_scorer, n_jobs=-1)
c_validator.fit(X, Y)

In [None]:
# Persist the best Lasso found by the grid search. The `with` block guarantees
# the file is closed even if pickling raises.
with open('models/lasso.pickle', 'wb') as lasso_out:
    pickle.dump(c_validator.best_estimator_, lasso_out)

# Lasso: Evaluation

In [None]:
# Pickle data is binary, so the file must be opened with 'rb' (text mode fails
# on Python 3); the context manager also closes the handle instead of leaking it.
# NOTE: only unpickle files this project wrote itself -- pickle.load can
# execute arbitrary code from an untrusted file.
with open('models/lasso.pickle', 'rb') as lasso_in:
    lasso_model = pickle.load(lasso_in)

In [None]:
# Compute the learning-curve data for the lasso model on the train/test split.
lasso_curves = util.learning_curves(lasso_model, X_train, X_test, y_train, y_test)
trainsizes, traingini, testgini = lasso_curves

# Print the train series, a blank line, then the test series.
for series in (traingini, '', testgini):
    print(series)

In [None]:
# Plot train and test Gini against training-set size and save the figure as EPS.
fig, ax = plt.subplots()
ax.plot(trainsizes, traingini, label='train gini')
ax.plot(trainsizes, testgini, label='test gini')
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('Lasso regression')
ax.legend()
fig.savefig('figures/learning_curves/lasso.eps', format='eps', dpi=1000)

In [None]:
# Display any open matplotlib figures (here, the lasso learning-curve plot).
plt.show()

# Lasso: Output

In [None]:
# Refit on the full data set and report the Gini score on that same data
# (an in-sample figure, not a held-out estimate).
lasso_model.fit(X, Y)
lasso_full_gini = util.gini_scorer(lasso_model, X, Y)
print(lasso_full_gini)

In [None]:
# Hand the lasso model to util.make_prediction together with the test-set path
# and an output path (presumably it reads the former and writes predictions to
# the latter -- confirm in util).
util.make_prediction(lasso_model, 'data/test.csv', 'predictions/lasso.csv')