In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

In [32]:
df = pd.read_csv("../data/OnlineNewsPopularity.csv").reset_index(drop = True)

In [33]:
df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505


In [34]:
df.columns = [x.strip() for x in df.columns]

In [35]:
df["success"] = (df["shares"] > 6000)*1

In [36]:
df["success"]

0        0
1        0
2        0
3        0
4        0
        ..
39639    0
39640    0
39641    0
39642    0
39643    0
Name: success, Length: 39644, dtype: int64

In [37]:
df['success'].value_counts(1)

0    0.896504
1    0.103496
Name: success, dtype: float64

In [8]:
ls_cont = ['n_tokens_title', 'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words', 
           'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos', 
           'average_token_length', 'num_keywords', 'kw_min_min', 'kw_max_min', 'kw_avg_min', 
           'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'kw_avg_avg', 
           'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess', 
           'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity', 
           'global_sentiment_polarity', 'global_rate_positive_words', 'global_rate_negative_words',
           'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
           'max_positive_polarity', 'avg_negative_polarity', 'min_negative_polarity', 
           'max_negative_polarity', 'title_subjectivity', 'title_sentiment_polarity', 
           'abs_title_subjectivity', 'abs_title_sentiment_polarity']
target = "shares"
target_disc = "success"

In [38]:
X = df[ls_cont]
yr = df[target]
yc = df[target_disc]

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, yc)

## Análisis Discriminante 

In [40]:
lda = LinearDiscriminantAnalysis()

In [41]:
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis()

In [42]:
ls_res = cross_val_score(estimator = lda, X=X_train, y=y_train, cv=4, n_jobs=-1, scoring="roc_auc")

In [43]:
np.mean(ls_res), np.std(ls_res)

(0.6951649762300707, 0.00796387730463065)

In [15]:
LinearDiscriminantAnalysis?

In [44]:
lda.get_params()

{'n_components': None,
 'priors': None,
 'shrinkage': None,
 'solver': 'svd',
 'store_covariance': False,
 'tol': 0.0001}

In [45]:
param_grid = {"solver": ["svd", "lsqr", "eigen"],
              "shrinkage": [None, "auto"] + [x/100 for x in range(100)]+[x for x in range(100)]}

In [46]:
np.prod(list(map(len, param_grid.values())))

606

In [47]:
kb = SelectKBest(k = 5, score_func=f_classif)

In [48]:
sc = MinMaxScaler()

In [49]:
Xs = sc.fit_transform(X_train)

In [50]:
Xb = kb.fit_transform(Xs, y_train)

In [51]:
Xb

array([[0.01801802, 0.01331008, 0.07599314, 0.04348301, 0.49647353],
       [0.00900901, 0.01587026, 0.08624515, 0.0543486 , 0.05461114],
       [0.00900901, 0.01275184, 0.06717924, 0.04376836, 0.61298823],
       ...,
       [0.00900901, 0.0220811 , 0.08720475, 0.33824059, 0.02408856],
       [0.00900901, 0.03805828, 0.0717198 , 0.62890587, 0.38309445],
       [0.00900901, 0.01494392, 0.06985126, 0.69552728, 0.04322473]])

In [52]:
ls_best = [x for x, y in zip(ls_cont, kb.get_support()) if y]


In [53]:
ls_best

['num_imgs', 'kw_max_avg', 'kw_avg_avg', 'LDA_02', 'LDA_03']

In [54]:
rs = GridSearchCV(cv=4, error_score=-1000, estimator=lda, n_jobs=-1, scoring="roc_auc", param_grid=param_grid, verbose=True)

In [55]:

rs.fit(X=Xb, y=y_train)

Fitting 4 folds for each of 606 candidates, totalling 2424 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 608 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 1608 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 2424 out of 2424 | elapsed:   12.0s finished


GridSearchCV(cv=4, error_score=-1000, estimator=LinearDiscriminantAnalysis(),
             n_jobs=-1,
             param_grid={'shrinkage': [None, 'auto', 0.0, 0.01, 0.02, 0.03,
                                       0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
                                       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17,
                                       0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24,
                                       0.25, 0.26, 0.27, ...],
                         'solver': ['svd', 'lsqr', 'eigen']},
             scoring='roc_auc', verbose=True)

In [56]:
rs.best_score_

0.6682334460779451

In [57]:
pd.DataFrame(zip(ls_best, rs.best_estimator_.coef_[0])).sort_values(by = 1)

Unnamed: 0,0,1
1,kw_max_avg,-14.013427
3,LDA_02,-0.369492
4,LDA_03,0.40952
0,num_imgs,1.620094
2,kw_avg_avg,18.8708


In [30]:
pd.DataFrame(zip(ls_cont, kb.scores_)).sort_values(by = 1)

Unnamed: 0,0,1
30,global_rate_positive_words,0.004317
6,num_self_hrefs,0.025099
0,n_tokens_title,0.066879
24,LDA_01,0.08946
1,n_tokens_content,0.131819
2,n_unique_tokens,0.190076
3,n_non_stop_words,0.264535
42,abs_title_subjectivity,0.369267
4,n_non_stop_unique_tokens,0.376844
11,kw_min_min,0.603552
