In [1]:
# CoEPrA Example
import os
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [3]:
# Build the path to the CoEPrA CSV relative to the working directory, then
# parse the comma-separated values into a 2-D NumPy array.
dataset_dir = os.path.join("book_code", "Section 4")
source_dataset_path = os.path.join(dataset_dir, "CoEPrA.csv")
with open(source_dataset_path) as raw_data:
    data = np.loadtxt(raw_data, delimiter=",")
print("---> Data Shape: {}".format(data.shape))

---> Data Shape: (89, 5788)


In [5]:
# Separate independent and dependent variables.
# The last column holds the target; every column before it is a feature.
# Slicing relative to the end of the row avoids hard-coding the column
# count (5787), so the cell keeps working if the number of features changes.
X = data[:, :-1]
y = data[:, -1]

In [7]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
print("---> Data Split for training and testing")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shape of each partition, one line per array.
shape_report = (
    ("\tX_training shape\t'{}'", X_train),
    ("\tX_test shape\t\t'{}'", X_test),
    ("\ty_training shape\t'{}'", y_train),
    ("\ty_test shape\t\t'{}'", y_test),
)
for line_template, partition in shape_report:
    print(line_template.format(partition.shape))

---> Data Split for training and testing
	X_training shape	'(71, 5787)'
	X_test shape		'(18, 5787)'
	y_training shape	'(71,)'
	y_test shape		'(18,)'


In [8]:
# Baseline: plain linear regression with no regularization term.
# The fit call is the cell's last expression so the estimator repr is shown.
regr = linear_model.LinearRegression()
regr.fit(X=X_train, y=y_train)

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# Score the model on the same samples it was fitted on (in-sample error).
y_train_pred = regr.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
print("---> Mean squared error on the training data: {:0.2f}".format(train_mse))

---> Mean squared error on the training data: 0.08


In [10]:
# A very low training error can be a sign of overfitting, so estimate the
# out-of-sample error with 5-fold cross validation on the training split.
# With 'neg_mean_squared_error' the scores are negated MSE values
# (closer to zero is better).
scores = cross_val_score(
    estimator=regr,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("---> Running a k-fold cross validation")
print("\tScores sample '{}'".format(scores[:10]))
print("\tMean score: '{}'".format(scores.mean()))

---> Running a k-fold cross validation
	Scores sample '[-1.58031398e+24 -6.28451759e+23 -5.89220228e+23 -1.03515026e+23
 -6.16618077e+23]'
	Mean score: '-7.036238136104907e+23'


In [11]:
# Evaluate the same fitted model on the held-out test split.
y_testing_pred = regr.predict(X_test)
test_mse = mean_squared_error(y_test, y_testing_pred)
print("---> Mean squared error on the test data: {:0.2f}".format(test_mse))

---> Mean squared error on the test data: 3583363366497778925568000.00


In [12]:
# The unregularized model performs very poorly on the test data, so refit
# with L1 (Lasso) regularization. alpha sets the penalty strength; max_iter
# is raised to give the solver a large iteration budget.
# The fit call stays last so the estimator repr is displayed.
lasso_settings = {"alpha": 0.3, "max_iter": 1000000}
regr_lasso = linear_model.Lasso(**lasso_settings)
regr_lasso.fit(X_train, y_train)

Lasso(alpha=0.3, copy_X=True, fit_intercept=True, max_iter=1000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
# Peek at the first 20 coefficients of the fitted Lasso model.
print("---> Lasso L1 model")
first_weights = regr_lasso.coef_[:20]
print("\tSample weights: '{}'".format(first_weights))

---> Lasso L1 model
	Sample weights: '[-0.  0. -0.  0.  0.  0. -0.  0. -0.  0.  0. -0.  0. -0. -0.  0. -0. -0.
  0. -0.]'


In [16]:
# np.nonzero returns a tuple with one index array per dimension, so the
# value printed below is a tuple wrapping the array of feature positions
# whose Lasso coefficient is not zero.
lasso_nonzero_coef_indexes = np.nonzero(regr_lasso.coef_)
index_report = "\tIndex of all non-zero coefficients:\n'{}'".format(lasso_nonzero_coef_indexes)
print(index_report)

	Index of all non-zero coefficients:
'(array([  64,  136,  445,  451,  653,  715,  760,  787,  858, 1236, 1358,
       1422, 1430, 1732, 1737, 1874, 1879, 2065, 2247, 2374, 2380, 2581,
       2644, 2689, 2708, 2890, 3224, 3351, 3666, 3931, 3994, 4002, 4221,
       4303, 4510, 4573, 4574, 4637, 4645, 4819, 4952, 5153, 5154, 5280,
       5589, 5595, 5648, 5732]),)'
