In [1]:
import numpy as np
import util
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# Notebook-wide configuration: one seed drives both NumPy's global RNG and
# the train/test split so reruns produce the same partition.
seed = 229
test_size = 0.20
trainfile = 'data/train.csv'

np.random.seed(seed)

# util.fetch_data returns four values; presumably column names, row ids,
# the feature matrix and the targets -- confirm against util.
header, ids, X, Y = util.fetch_data(trainfile)

# Hold out `test_size` (20%) of the rows for the evaluation cells below.
split = train_test_split(X, Y, test_size=test_size, random_state=seed)
X_train, X_test, y_train, y_test = split

# Ridge: Training
Ridge linear regression uses an L2 regularization penalty.

In [None]:
# Candidate regularization strengths: 100 log-spaced values from 1e-5 to 1e5.
# `alphalist` is reused by the Lasso training cell further down.
alphalist = np.logspace(-5, 5, 100)

# Ridge takes no `n_jobs` argument; parallelism belongs to the grid search.
ridge = Ridge()

# GridSearchCV expects the grid under `param_grid`, keyed by the estimator's
# own parameter name, which for Ridge is `alpha`.
# NOTE(review): the search is fit on all of X, Y, which includes the rows held
# out as X_test above, so the later test scores are for hyperparameters that
# were selected with access to those rows. Consider fitting on X_train/y_train.
ridge_model = GridSearchCV(ridge, param_grid={'alpha': alphalist}, scoring=util.gini_scorer, n_jobs=-1)
ridge_model.fit(X, Y)

In [None]:
# Persist only the best estimator found by the grid search, not the search
# object itself. The context manager closes the file even if dump() raises.
with open('models/ridge.pickle', 'wb') as ridge_out:
    pickle.dump(ridge_model.best_estimator_, ridge_out)

# Ridge: Evaluation

In [None]:
# Reload the saved best estimator; from here on `ridge_model` is the plain
# estimator rather than the GridSearchCV wrapper.
# Pickles must be read in binary mode (text mode fails on Python 3), and the
# context manager makes sure the handle is closed.
# Only unpickle files produced by this notebook: pickle can execute code.
with open('models/ridge.pickle', 'rb') as ridge_in:
    ridge_model = pickle.load(ridge_in)

In [None]:
# Learning curves for the ridge model on the held-out split. The three
# returned values are unpacked as the training-set sizes and the matching
# train/test Gini scores (plotted in the next cell).
ridge_curves = util.learning_curves(ridge_model, X_train, X_test, y_train, y_test)
trainsizes, traingini, testgini = ridge_curves

# Train scores, a blank separator line, then test scores.
for line in (traingini, '', testgini):
    print(line)

In [None]:
# Train vs. test Gini as a function of training-set size for ridge,
# drawn on an explicit figure/axes pair and saved as EPS.
fig, ax = plt.subplots()
for scores, label in ((traingini, 'train gini'), (testgini, 'test gini')):
    ax.plot(trainsizes, scores, label=label)
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('Ridge regression')
ax.legend()
fig.savefig('figures/learning_curves/ridge.eps', format='eps', dpi=1000)

In [None]:
# Display the ridge learning-curve figure built in the previous cell.
# NOTE(review): with an inline backend the figure may already have been
# rendered when the previous cell finished, making this call a no-op -- confirm.
plt.show()

# Ridge: Output

In [None]:
# Refit ridge on the full training file before predicting, and report the
# Gini score on that same data (an in-sample figure, not a held-out one).
ridge_model.fit(X, Y)
ridge_full_gini = util.gini_scorer(ridge_model, X, Y)
print(ridge_full_gini)

In [None]:
# Produce the ridge submission from the model refit in the previous cell.
# NOTE(review): presumably reads 'data/test.csv' and writes
# 'predictions/ridge.csv' -- confirm against util.make_prediction.
util.make_prediction(ridge_model, 'data/test.csv', 'predictions/ridge.csv')

# Lasso: Training
Lasso regression uses an L1 regularization penalty.

In [None]:
# Lasso takes no `n_jobs` argument (passing it raises TypeError);
# parallelism is handled by the grid search instead.
lasso_model = Lasso()

# Search the same `alphalist` grid defined in the Ridge training cell.
# NOTE(review): as with ridge, the search sees all of X, Y, including the
# rows held out as X_test.
c_validator = GridSearchCV(lasso_model, param_grid={'alpha': alphalist}, scoring=util.gini_scorer, n_jobs=-1)
c_validator.fit(X, Y)

In [None]:
# Persist the best Lasso estimator found by the search. The context manager
# closes the file even if dump() raises.
with open('models/lasso.pickle', 'wb') as lasso_out:
    pickle.dump(c_validator.best_estimator_, lasso_out)

# Lasso: Evaluation

In [None]:
# Reload the saved best Lasso estimator. Pickles must be read in binary mode
# (text mode fails on Python 3); the context manager closes the handle.
# Only unpickle files produced by this notebook: pickle can execute code.
with open('models/lasso.pickle', 'rb') as lasso_in:
    lasso_model = pickle.load(lasso_in)

In [None]:
# Learning curves for the Lasso model on the held-out split. The three
# returned values are unpacked as the training-set sizes and the matching
# train/test Gini scores (plotted in the next cell).
lasso_curves = util.learning_curves(lasso_model, X_train, X_test, y_train, y_test)
trainsizes, traingini, testgini = lasso_curves

# Train scores, a blank separator line, then test scores.
for line in (traingini, '', testgini):
    print(line)

In [None]:
# Train vs. test Gini as a function of training-set size for Lasso,
# drawn on an explicit figure/axes pair and saved as EPS.
fig, ax = plt.subplots()
for scores, label in ((traingini, 'train gini'), (testgini, 'test gini')):
    ax.plot(trainsizes, scores, label=label)
ax.set_xlabel('Training set size')
ax.set_ylabel('Normalized Gini coefficient')
ax.set_title('Lasso regression')
ax.legend()
fig.savefig('figures/learning_curves/lasso.eps', format='eps', dpi=1000)

In [None]:
# Display the Lasso learning-curve figure built in the previous cell.
# NOTE(review): with an inline backend the figure may already have been
# rendered when the previous cell finished, making this call a no-op -- confirm.
plt.show()

# Lasso: Output

In [None]:
# Refit Lasso on the full training file before predicting, and report the
# Gini score on that same data (an in-sample figure, not a held-out one).
lasso_model.fit(X, Y)
lasso_full_gini = util.gini_scorer(lasso_model, X, Y)
print(lasso_full_gini)

In [None]:
# Produce the Lasso submission from the model refit in the previous cell.
# NOTE(review): presumably reads 'data/test.csv' and writes
# 'predictions/lasso.csv' -- confirm against util.make_prediction.
util.make_prediction(lasso_model, 'data/test.csv', 'predictions/lasso.csv')

# Elastic Net: Training
Elastic net regression uses both an L2- and an L1-norm penalty.

In [None]:
# Elastic net search grid: 8 l1_ratio values x 20 alphas = 160 candidates.
params={'l1_ratio': [0, .1, .5, .7, .9, .95, .99, 1], # mixing weight: share of the total penalty that is L1 (0 = pure L2, 1 = pure L1)
        'alpha': np.logspace(-5, 5, 20), # overall penalty strength: 20 log-spaced values from 1e-5 to 1e5
       }

# NOTE(review): in the recorded run below, each l1_ratio=0 fit took roughly
# 7-12 minutes while the other ratios finished in under a minute. Consider
# dropping 0 from the grid and covering the pure-L2 case with Ridge instead.
enet = ElasticNet()
# Rebinding `c_validator` replaces the Lasso search object from the earlier cell.
# verbose=2 is what produces the per-fit [CV] log lines in the output.
c_validator = GridSearchCV(enet, param_grid=params, scoring=util.gini_scorer, n_jobs=-1, verbose=2)
c_validator.fit(X, Y)

Fitting 3 folds for each of 160 candidates, totalling 480 fits
[CV] alpha=1e-05, l1_ratio=0 .........................................
[CV] alpha=1e-05, l1_ratio=0 .........................................
[CV] alpha=1e-05, l1_ratio=0 .........................................
[CV] alpha=1e-05, l1_ratio=0.1 .......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.1, total=  39.6s
[CV] alpha=1e-05, l1_ratio=0.1 .......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.1, total=  55.9s
[CV] alpha=1e-05, l1_ratio=0.1 .......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.1, total=  41.7s
[CV] alpha=1e-05, l1_ratio=0.5 .......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.5, total=  32.1s
[CV] alpha=1e-05, l1_ratio=0.5 .......................................
[CV] ........................ alpha=1e-05, l1_ratio=0.5, total=  37.3s
[CV] alpha=1e-



[CV] .......................... alpha=1e-05, l1_ratio=0, total=11.0min
[CV] .......................... alpha=1e-05, l1_ratio=0, total=11.0min
[CV] alpha=3.35981828628e-05, l1_ratio=0 .............................
[CV] alpha=3.35981828628e-05, l1_ratio=0 .............................




[CV] .......................... alpha=1e-05, l1_ratio=0, total=11.8min
[CV] alpha=3.35981828628e-05, l1_ratio=0.1 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.1, total=  16.6s
[CV] alpha=3.35981828628e-05, l1_ratio=0.1 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.1, total=  28.8s
[CV] alpha=3.35981828628e-05, l1_ratio=0.1 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.1, total=  22.8s
[CV] alpha=3.35981828628e-05, l1_ratio=0.5 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.5, total=  19.3s
[CV] alpha=3.35981828628e-05, l1_ratio=0.5 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.5, total=  23.9s
[CV] alpha=3.35981828628e-05, l1_ratio=0.5 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.5, total=  25.4s
[CV] alpha=3.35981828628e-05, l1_ratio=0.7 ...........................
[CV] .

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 15.6min


[CV] alpha=3.35981828628e-05, l1_ratio=0.9 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.9, total=  16.1s
[CV] alpha=3.35981828628e-05, l1_ratio=0.9 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.9, total=  14.8s
[CV] alpha=3.35981828628e-05, l1_ratio=0.9 ...........................
[CV] ............ alpha=3.35981828628e-05, l1_ratio=0.9, total=  14.5s
[CV] alpha=3.35981828628e-05, l1_ratio=0.95 ..........................
[CV] ........... alpha=3.35981828628e-05, l1_ratio=0.95, total=  14.6s
[CV] alpha=3.35981828628e-05, l1_ratio=0.95 ..........................
[CV] ........... alpha=3.35981828628e-05, l1_ratio=0.95, total=  14.8s
[CV] alpha=3.35981828628e-05, l1_ratio=0.95 ..........................
[CV] ........... alpha=3.35981828628e-05, l1_ratio=0.95, total=  14.3s
[CV] alpha=3.35981828628e-05, l1_ratio=0.99 ..........................
[CV] ........... alpha=3.35981828628e-05, l1_ratio=0.99, total=  14.7s
[CV] a



[CV] .............. alpha=3.35981828628e-05, l1_ratio=0, total= 8.4min
[CV] alpha=0.000112883789168, l1_ratio=0 .............................
[CV] .............. alpha=3.35981828628e-05, l1_ratio=0, total= 7.9min
[CV] alpha=0.000112883789168, l1_ratio=0 .............................
[CV] .............. alpha=3.35981828628e-05, l1_ratio=0, total= 7.9min
[CV] alpha=0.000112883789168, l1_ratio=0.1 ...........................
[CV] ............ alpha=0.000112883789168, l1_ratio=0.1, total=  14.4s
[CV] alpha=0.000112883789168, l1_ratio=0.1 ...........................
[CV] ............ alpha=0.000112883789168, l1_ratio=0.1, total=  15.1s
[CV] alpha=0.000112883789168, l1_ratio=0.1 ...........................
[CV] ............ alpha=0.000112883789168, l1_ratio=0.1, total=  14.3s
[CV] alpha=0.000112883789168, l1_ratio=0.5 ...........................
[CV] ............ alpha=0.000112883789168, l1_ratio=0.5, total=  12.1s
[CV] alpha=0.000112883789168, l1_ratio=0.5 ...........................
[CV] .

[CV] .............. alpha=0.0012742749857, l1_ratio=0.7, total=   9.5s
[CV] alpha=0.0012742749857, l1_ratio=0.7 .............................
[CV] .............. alpha=0.0012742749857, l1_ratio=0.7, total=   8.5s
[CV] alpha=0.0012742749857, l1_ratio=0.9 .............................
[CV] .............. alpha=0.0012742749857, l1_ratio=0.9, total=   8.5s
[CV] alpha=0.0012742749857, l1_ratio=0.9 .............................
[CV] .............. alpha=0.0012742749857, l1_ratio=0.9, total=   7.7s
[CV] alpha=0.0012742749857, l1_ratio=0.9 .............................
[CV] .............. alpha=0.0012742749857, l1_ratio=0.9, total=   7.9s
[CV] alpha=0.0012742749857, l1_ratio=0.95 ............................
[CV] ............. alpha=0.0012742749857, l1_ratio=0.95, total=   7.8s
[CV] alpha=0.0012742749857, l1_ratio=0.95 ............................
[CV] ............. alpha=0.0012742749857, l1_ratio=0.95, total=   7.1s
[CV] alpha=0.0012742749857, l1_ratio=0.95 ............................
[CV] .

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 47.7min


[CV] .............. alpha=0.0143844988829, l1_ratio=0.9, total=   7.6s
[CV] alpha=0.0143844988829, l1_ratio=0.9 .............................
[CV] .............. alpha=0.0143844988829, l1_ratio=0.9, total=   6.8s
[CV] alpha=0.0143844988829, l1_ratio=0.95 ............................
[CV] ............. alpha=0.0143844988829, l1_ratio=0.95, total=   6.6s
[CV] alpha=0.0143844988829, l1_ratio=0.95 ............................
[CV] ............. alpha=0.0143844988829, l1_ratio=0.95, total=   6.2s
[CV] alpha=0.0143844988829, l1_ratio=0.95 ............................
[CV] ............. alpha=0.0143844988829, l1_ratio=0.95, total=   5.9s
[CV] alpha=0.0143844988829, l1_ratio=0.99 ............................
[CV] ............. alpha=0.0143844988829, l1_ratio=0.99, total=   5.6s
[CV] alpha=0.0143844988829, l1_ratio=0.99 ............................
[CV] ............. alpha=0.0143844988829, l1_ratio=0.99, total=   6.5s
[CV] alpha=0.0143844988829, l1_ratio=0.99 ............................
[CV] .

[CV] ................. alpha=0.162377673919, l1_ratio=1, total=   6.3s
[CV] alpha=0.545559478117, l1_ratio=0 ................................
[CV] ................. alpha=0.162377673919, l1_ratio=0, total= 8.4min
[CV] alpha=0.545559478117, l1_ratio=0 ................................
[CV] ................. alpha=0.162377673919, l1_ratio=0, total= 8.4min
[CV] alpha=0.545559478117, l1_ratio=0 ................................
[CV] ................. alpha=0.162377673919, l1_ratio=0, total= 8.2min
[CV] alpha=0.545559478117, l1_ratio=0.1 ..............................
[CV] ............... alpha=0.545559478117, l1_ratio=0.1, total=   5.6s
[CV] alpha=0.545559478117, l1_ratio=0.1 ..............................
[CV] ............... alpha=0.545559478117, l1_ratio=0.1, total=   6.2s
[CV] alpha=0.545559478117, l1_ratio=0.1 ..............................
[CV] ............... alpha=0.545559478117, l1_ratio=0.1, total=   6.1s
[CV] alpha=0.545559478117, l1_ratio=0.5 ..............................
[CV] .

[CV] ................ alpha=6.15848211066, l1_ratio=0.7, total=   4.7s
[CV] alpha=6.15848211066, l1_ratio=0.7 ...............................
[CV] ................ alpha=6.15848211066, l1_ratio=0.7, total=   4.7s
[CV] alpha=6.15848211066, l1_ratio=0.7 ...............................
[CV] ................ alpha=6.15848211066, l1_ratio=0.7, total=   4.6s
[CV] alpha=6.15848211066, l1_ratio=0.9 ...............................
[CV] ................ alpha=6.15848211066, l1_ratio=0.9, total=   4.8s
[CV] alpha=6.15848211066, l1_ratio=0.9 ...............................
[CV] ................ alpha=6.15848211066, l1_ratio=0.9, total=   4.7s
[CV] alpha=6.15848211066, l1_ratio=0.9 ...............................
[CV] ................ alpha=6.15848211066, l1_ratio=0.9, total=   4.7s
[CV] alpha=6.15848211066, l1_ratio=0.95 ..............................
[CV] ............... alpha=6.15848211066, l1_ratio=0.95, total=   4.6s
[CV] alpha=6.15848211066, l1_ratio=0.95 ..............................
[CV] .

[CV] ............... alpha=69.5192796178, l1_ratio=0.99, total=   5.2s
[CV] alpha=69.5192796178, l1_ratio=0.99 ..............................
[CV] ............... alpha=69.5192796178, l1_ratio=0.99, total=   5.1s
[CV] alpha=69.5192796178, l1_ratio=1 .................................
[CV] .................. alpha=69.5192796178, l1_ratio=1, total=   4.7s
[CV] alpha=69.5192796178, l1_ratio=1 .................................
[CV] .................. alpha=69.5192796178, l1_ratio=1, total=   4.5s
[CV] alpha=69.5192796178, l1_ratio=1 .................................
[CV] .................. alpha=69.5192796178, l1_ratio=1, total=   4.6s
[CV] alpha=233.572146909, l1_ratio=0 .................................
[CV] .................. alpha=69.5192796178, l1_ratio=0, total= 6.7min
[CV] alpha=233.572146909, l1_ratio=0 .................................
[CV] .................. alpha=69.5192796178, l1_ratio=0, total= 6.7min
[CV] alpha=233.572146909, l1_ratio=0 .................................
[CV] .

[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 100.4min


[CV] .................. alpha=233.572146909, l1_ratio=0, total= 7.9min
[CV] alpha=784.759970351, l1_ratio=0 .................................


In [None]:
# Persist the best elastic net estimator found by the search. The context
# manager closes the file even if dump() raises.
with open('models/elastic_net.pickle', 'wb') as enet_out:
    pickle.dump(c_validator.best_estimator_, enet_out)

# Elastic Net: Evaluation

In [None]:
# Reload the saved best elastic net estimator. Pickles must be read in binary
# mode (text mode fails on Python 3); the context manager closes the handle.
# Only unpickle files produced by this notebook: pickle can execute code.
with open('models/elastic_net.pickle', 'rb') as enet_in:
    enet_model = pickle.load(enet_in)

In [None]:
# Learning curves for the elastic net model on the held-out split. The three
# returned values are unpacked as the training-set sizes and the matching
# train/test Gini scores (plotted in the next cell).
enet_curves = util.learning_curves(enet_model, X_train, X_test, y_train, y_test)
trainsizes, traingini, testgini = enet_curves

# Train scores, a blank separator line, then test scores.
for line in (traingini, '', testgini):
    print(line)

In [None]:
# Train vs. test Gini as a function of training-set size for elastic net.
plt.figure()
plt.plot(trainsizes, traingini, label='train gini')
plt.plot(trainsizes, testgini, label='test gini')
plt.xlabel('Training set size')
plt.ylabel('Normalized Gini coefficient')
# The title previously read 'Lasso regression' (copy-paste from the Lasso
# cell), which mislabelled the figure saved to enet.eps.
plt.title('Elastic net regression')
plt.legend()
plt.savefig('figures/learning_curves/enet.eps', format='eps', dpi=1000)

In [None]:
# Display the elastic net learning-curve figure built in the previous cell.
# NOTE(review): with an inline backend the figure may already have been
# rendered when the previous cell finished, making this call a no-op -- confirm.
plt.show()

# Elastic Net: Output

In [None]:
# Refit elastic net on the full training file and report its in-sample Gini.
# The score must come from `enet_model`, the model just fitted; the cell
# previously scored `lasso_model` and so printed the Lasso result here.
enet_model.fit(X, Y)
print(util.gini_scorer(enet_model, X, Y))

In [None]:
# Produce the elastic net submission. This previously passed `lasso_model`,
# so 'predictions/enet.csv' was written from the Lasso model instead.
# NOTE(review): presumably reads 'data/test.csv' and writes
# 'predictions/enet.csv' -- confirm against util.make_prediction.
util.make_prediction(enet_model, 'data/test.csv', 'predictions/enet.csv')