In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from numpy.linalg import matrix_rank
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LassoCV
from sklearn.kernel_ridge import KernelRidge

In [None]:
# Load in the data, drop specified columnn and apply log transformation

In [2]:
data = pd.read_csv('/Users/michaelguel/Desktop/CSE881/ProjectData/OnlineNewsPopularity/OnlineNewsPopularity.csv')

dropped = data.drop(columns=['url',' timedelta'])

dropped[' shares'] = np.log(dropped[' shares'])

xvars = dropped.iloc[:,:-1]

yvar = dropped.iloc[:,58]

xvars.head()


Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,0.0,4.913725,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,0.0,4.393365,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,0.0,4.404896,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,0.0,4.682836,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


In [None]:
# Use the rank command to calculate the rank of the matrix and confirm that it is 56

In [3]:
matrix_rank(xvars)

#q, r = np.linalg.qr(xvars.T)

#np.abs(np.diag(r)) > 2e-17




56

In [None]:
# We will identify the non-linear independent columns using QR decomposition

In [4]:
from scipy.linalg import qr

Q,R,P = qr(xvars,mode='economic',pivoting = True)
inv = P.argsort()
good_columns = (np.abs(np.diag(R)) > 1e-1) [inv]

fullrank = xvars.iloc[:,good_columns]

xvars.iloc[:,~good_columns].head()

Unnamed: 0,weekday_is_sunday,is_weekend
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [None]:
# We find that weekday_is_sunday and is_weekend are the linearly dependent columns which makes sense.
# Now we discard these below

In [5]:
xvarss = xvars.drop(columns=[' weekday_is_sunday',' is_weekend'])

xvarss.iloc[:,18:30].head()


Unnamed: 0,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,496.0,496.0,496.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,918.0,918.0,918.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,545.0,16000.0,3151.157895,1.0


In [6]:
matrix_rank(xvarss)

56

In [None]:
# We will use the first 2000 columns for training and the rest for testing below

In [None]:
# Here we define a function to standardize the data by subtracting their means and dividing by their
# standard deviations. 

In [7]:
def standardize(arr):
    result = arr.copy()
    m = arr.sum(axis = 0)/arr.shape[0]
    s = np.sqrt(np.sum((arr - m)**2, axis=0)/(arr.shape[0]-1))
    return (result - m)/s

In [None]:
# Standardize the predictor variables and add a columns of 1's

In [8]:
stand = standardize(xvarss)

onecol = pd.DataFrame(np.ones(len(stand)))

together = pd.concat([onecol,stand],axis=1)

onecolstand = together.rename(columns = {0:'int'})

In [9]:
trainx = onecolstand.iloc[:2001,]

trainy = yvar.iloc[:2001,]

testx = onecolstand.iloc[2001:,]

testy = yvar.iloc[2001:,]

In [10]:
trainx.head()


Unnamed: 0,int,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,1.0,0.757438,-0.695202,0.032771,0.000675,0.038657,-0.607455,-0.335562,-0.42652,-0.304264,...,0.237334,0.063864,-0.228938,-0.708361,-0.268891,-0.969874,0.671237,-0.97542,-1.810696,0.138918
1,1.0,-0.661648,-0.618786,0.016056,0.000675,0.031478,-0.6957,-0.594956,-0.42652,-0.304264,...,-0.640032,-0.870957,-0.228938,1.10216,1.367406,0.078641,-0.870796,-0.269073,0.837738,-0.689649
2,1.0,-0.661648,-0.712183,0.007644,0.000675,-0.007752,-0.6957,-0.594956,-0.42652,-0.304264,...,1.358384,0.063864,0.981786,-1.621776,-0.957859,-0.270864,-0.870796,-0.269073,0.837738,-0.689649
3,1.0,-0.661648,-0.032932,-0.012619,0.000675,-0.00721,-0.166227,-0.85435,-0.42652,-0.304264,...,0.307438,0.573766,0.174637,-0.862574,-0.268891,-0.620369,-0.870796,-0.269073,0.837738,-0.689649
4,1.0,1.230466,1.115425,-0.037654,0.000675,-0.045419,0.716228,4.074134,1.860037,-0.304264,...,0.548128,-0.870957,0.981786,0.30794,0.075593,0.602899,0.531052,0.244634,-1.569929,-0.087055


In [None]:
# MLR model and 10 Attributes

In [11]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(trainx, trainy)

mlrpred = regr.predict(testx)


In [12]:
mlrcoef = pd.DataFrame(zip(trainx.columns,regr.coef_))

mlrcoef[2] = abs(mlrcoef[1])

mlrcoef.sort_values(by=[2],ascending=False)[:10]

Unnamed: 0,0,1,2
39,LDA_03,-2312111000.0,2312111000.0
40,LDA_04,-2265058000.0,2265058000.0
38,LDA_02,-2209930000.0,2209930000.0
36,LDA_00,-2059775000.0,2059775000.0
37,LDA_01,-1720879000.0,1720879000.0
4,n_non_stop_words,-154656700.0,154656700.0
45,rate_positive_words,5623282.0,5623282.0
46,rate_negative_words,4616611.0,4616611.0
0,int,-8090.477,8090.477
3,n_unique_tokens,-2.704661,2.704661


In [None]:
# CVLasso and 10 Attributes

In [13]:
from sklearn.linear_model import LassoCV

reg = LassoCV(cv=10,random_state=0).fit(trainx, trainy)

lassopred = reg.predict(testx)

In [14]:
lassocoef = pd.DataFrame(zip(trainx.columns,reg.coef_))

lassocoef[2] = abs(lassocoef[1])

lassocoef.sort_values(by=[2],ascending=False)[:10]

Unnamed: 0,0,1,2
27,self_reference_min_shares,0.219876,0.219876
16,data_channel_is_tech,0.090342,0.090342
31,weekday_is_tuesday,-0.088514,0.088514
30,weekday_is_monday,-0.086417,0.086417
35,weekday_is_saturday,0.086072,0.086072
15,data_channel_is_socmed,0.081243,0.081243
14,data_channel_is_bus,-0.080756,0.080756
32,weekday_is_wednesday,-0.079584,0.079584
56,abs_title_sentiment_polarity,0.069437,0.069437
33,weekday_is_thursday,-0.068193,0.068193


In [None]:
# Correlation of Pred and Actual for both models

In [15]:
print('MLR',np.corrcoef(mlrpred,testy))

print('Lasso',np.corrcoef(lassopred,testy))

MLR [[ 1.         -0.00663121]
 [-0.00663121  1.        ]]
Lasso [[1.         0.23265819]
 [0.23265819 1.        ]]


In [None]:
# Kernel Ridge

In [16]:
from sklearn.kernel_ridge import KernelRidge

krr = KernelRidge(alpha = 0.001, kernel='rbf', gamma = 1e-7)

krr.fit(trainx,trainy)

krrpred = krr.predict(testx)

In [None]:
# 10 Attributes

In [17]:
kridgecoef = pd.DataFrame(zip(trainx.columns,krr.dual_coef_))

kridgecoef[2] = abs(kridgecoef[1])

kridgecoef.sort_values(by=[2],ascending=False)[:10]

Unnamed: 0,0,1,2
19,kw_max_min,2397.642035,2397.642035
16,data_channel_is_tech,2246.651228,2246.651228
13,data_channel_is_entertainment,1829.577904,1829.577904
35,weekday_is_saturday,-1564.315563,1564.315563
36,LDA_00,1543.256566,1543.256566
41,global_subjectivity,-1390.472347,1390.472347
51,min_negative_polarity,-1389.961605,1389.961605
52,max_negative_polarity,-1325.371439,1325.371439
18,kw_min_min,1284.383884,1284.383884
29,self_reference_avg_sharess,-1237.47581,1237.47581


In [151]:
# Correlation between pred and actual

In [18]:
print('Kernel Ridge',np.corrcoef(krrpred,testy))

Kernel Ridge [[1.         0.26398433]
 [0.26398433 1.        ]]


In [None]:
# Compare all three correlations

In [19]:
print('MLR',np.corrcoef(mlrpred,testy))

print('Lasso',np.corrcoef(lassopred,testy))

print('Kernel Ridge',np.corrcoef(krrpred,testy))

MLR [[ 1.         -0.00663121]
 [-0.00663121  1.        ]]
Lasso [[1.         0.23265819]
 [0.23265819 1.        ]]
Kernel Ridge [[1.         0.26398433]
 [0.26398433 1.        ]]


In [None]:
# It would seem that the KernelRidge works the best based on correlation

In [20]:
xvars2 = xvarss.copy()

xvars2.loc[xvars2[' kw_min_min'] < 0, ' kw_min_min'] = 0

xvars2.loc[xvars2[' kw_avg_min'] < 0, ' kw_avg_min'] = 0

xvars2.loc[xvars2[' kw_min_avg'] < 0, ' kw_min_avg'] = 0

xvars2.iloc[:,17:29] = xvars2.iloc[:,17:29] + 1e-8

xvars2.iloc[:,17:29] = np.log(xvars2.iloc[:,17:29])

xvars2.iloc[:,17:29].head()

Unnamed: 0,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess
0,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,6.206576,6.206576,6.206576
1,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681
2,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,6.822197,6.822197,6.822197
3,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681
4,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,-18.420681,6.300786,9.680344,8.055525


In [21]:
xxx = standardize(xvars2)

xxx

onecol2 = pd.DataFrame(np.ones(len(xxx)))

together2 = pd.concat([onecol2,xxx],axis=1)

onecolstand2 = together2.rename(columns = {0:'int'})

onecolstand2.head()

Unnamed: 0,int,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,1.0,0.757438,-0.695202,0.032771,0.000675,0.038657,-0.607455,-0.335562,-0.42652,-0.304264,...,0.237334,0.063864,-0.228938,-0.708361,-0.268891,-0.969874,0.671237,-0.97542,-1.810696,0.138918
1,1.0,-0.661648,-0.618786,0.016056,0.000675,0.031478,-0.6957,-0.594956,-0.42652,-0.304264,...,-0.640032,-0.870957,-0.228938,1.10216,1.367406,0.078641,-0.870796,-0.269073,0.837738,-0.689649
2,1.0,-0.661648,-0.712183,0.007644,0.000675,-0.007752,-0.6957,-0.594956,-0.42652,-0.304264,...,1.358384,0.063864,0.981786,-1.621776,-0.957859,-0.270864,-0.870796,-0.269073,0.837738,-0.689649
3,1.0,-0.661648,-0.032932,-0.012619,0.000675,-0.00721,-0.166227,-0.85435,-0.42652,-0.304264,...,0.307438,0.573766,0.174637,-0.862574,-0.268891,-0.620369,-0.870796,-0.269073,0.837738,-0.689649
4,1.0,1.230466,1.115425,-0.037654,0.000675,-0.045419,0.716228,4.074134,1.860037,-0.304264,...,0.548128,-0.870957,0.981786,0.30794,0.075593,0.602899,0.531052,0.244634,-1.569929,-0.087055


In [23]:
trainx2 = onecolstand2.iloc[:2001,]

trainy2 = yvar.iloc[:2001,]

testx2 = onecolstand2.iloc[2001:,]

testy2 = yvar.iloc[2001:,]

In [None]:
# MLR2

In [24]:
from sklearn import linear_model

regr2 = linear_model.LinearRegression()

regr2.fit(trainx2, trainy2)

mlrpred2 = regr2.predict(testx2)

In [None]:
# CVLasso2

In [25]:
from sklearn.linear_model import LassoCV

reg2 = LassoCV(cv=10,random_state=0,tol = 1).fit(trainx2, trainy2)

lassopred2 = reg2.predict(testx2)

In [None]:
# KRidge2

In [26]:
from sklearn.kernel_ridge import KernelRidge

krr2 = KernelRidge(alpha = 0.001, kernel='rbf', gamma = 1e-7)

krr2.fit(trainx2,trainy2)

krrpred2 = krr2.predict(testx2)

In [27]:
print('MLR',np.corrcoef(mlrpred2,testy2))

print('Lasso',np.corrcoef(lassopred2,testy2))

print('Kernel Ridge',np.corrcoef(krrpred2,testy2))

MLR [[ 1.         -0.00663298]
 [-0.00663298  1.        ]]
Lasso [[1.         0.20796671]
 [0.20796671 1.        ]]
Kernel Ridge [[1.         0.23611211]
 [0.23611211 1.        ]]


In [None]:
# It looks like the results became worse under the transformation as the correlation went down
# Kernel gives the best results

In [28]:
spamdata = pd.read_csv('/Users/michaelguel/Desktop/ProjectData/spambase/spambase.data',header = None)

spamdata.head()

use = spamdata.copy()


use.shape


(4601, 58)

In [29]:
shuffled = use.sample(frac = 1)

xs = shuffled.iloc[:,0:56]

ys = shuffled.iloc[:,57]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(xs,ys,test_size = 0.2,random_state = 0)

In [31]:
lamb = list(np.logspace(-4,3,num=11))

lamb

[0.0001,
 0.0005011872336272725,
 0.0025118864315095794,
 0.012589254117941661,
 0.0630957344480193,
 0.31622776601683794,
 1.584893192461111,
 7.943282347242805,
 39.81071705534969,
 199.52623149688787,
 1000.0]

In [32]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import SGDClassifier

param_grid = {'alpha':lamb}
model_to_tune = SGDClassifier()

inner_cv = KFold(n_splits=5,shuffle=False)
outer_cv = KFold(n_splits=10,shuffle=False)



In [33]:
model = GridSearchCV(estimator = model_to_tune, param_grid=param_grid,cv = inner_cv, n_jobs=2)

model.fit(X_train,y_train)

print(model.best_params_)

{'alpha': 0.0005011872336272725}


In [34]:
model.best_params_['alpha']

0.0005011872336272725

In [35]:
clf = SGDClassifier(alpha = model.best_params_['alpha'])

clf.fit(X_train,y_train)

In [36]:
test_pred = cross_val_predict(clf,X_test,y_test,cv=outer_cv,n_jobs=2)

#print(test_score.mean(),test_score.std())

cmat = confusion_matrix(y_test,test_pred)

acc = accuracy_score(y_test,test_pred)

print(cmat)

print(1 - acc)

[[448  90]
 [135 248]]
0.24429967426710097


In [None]:
# Non-Linear SVM

In [37]:
lamb = list(np.logspace(-3,3,num=11))
sig = list(np.logspace(-3,3,num=11))

param_grid = {'C':lamb,'gamma':sig}
model_to_tune = SVC()


In [38]:
model = GridSearchCV(estimator = model_to_tune, param_grid=param_grid,cv = inner_cv, n_jobs=2)

model.fit(X_train,y_train)

print(model.best_params_)

{'C': 15.848931924611142, 'gamma': 0.015848931924611134}


In [39]:
model.best_params_

{'C': 15.848931924611142, 'gamma': 0.015848931924611134}

In [40]:
clf = SVC(C = model.best_params_['C'],gamma=model.best_params_['gamma'])

clf.fit(X_train,y_train)

In [41]:
test_pred = cross_val_predict(clf,X_test,y_test,cv=outer_cv,n_jobs=2)

#print(test_score.mean(),test_score.std())

cmat = confusion_matrix(y_test,test_pred)

acc = accuracy_score(y_test,test_pred)

print(cmat)

print(1 - acc)

[[491  47]
 [ 57 326]]
0.1129207383279045


In [None]:
# If we look back to the previous confusion matrix, we can see that there is a clear change in test error.
# We were able to halve the test error!