In [2]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from LinearModel_exercise_pipes import *

In [3]:
file = r'D:\AI_ML\AI\Machine Learning in Python\data\data\facebook_comments.csv'

In [4]:
fb=pd.read_csv(file)

In [5]:
cyclic_feat=['Post Published Weekday','Base Date Time Weekday']
cat_feat=['page_category']
target=['Comments_in_next_H_hrs']
num_feat=[_ for _ in fb.columns if _ not in cyclic_feat+cat_feat+target]

In [6]:
p1=pdPipeline([
    ('select_cyclic',VarSelector(cyclic_feat)),
    ('cyclic_feat',custom_cyclic())
])
p2=pdPipeline([
    ('select_cat',VarSelector(cat_feat)),
    ('missing_treat',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(200))
])
p3=pdPipeline([
    ('select_num',VarSelector(num_feat)),
    ('missing_treat',DataFrameImputer())
])

data_pipe=FeatureUnion([
    ('p1',p1),
    ('p2',p2),
    ('p3',p3)
])

In [7]:
data_pipe.fit(fb)

FeatureUnion(transformer_list=[('p1',
                                pdPipeline(steps=[('select_cyclic',
                                                   VarSelector(feature_names=['Post '
                                                                              'Published '
                                                                              'Weekday',
                                                                              'Base '
                                                                              'Date '
                                                                              'Time '
                                                                              'Weekday'])),
                                                  ('cyclic_feat',
                                                   custom_cyclic())])),
                               ('p2',
                                pdPipeline(steps=[('select_cat',
                                     

In [8]:
x_train=pd.DataFrame(data=data_pipe.transform(fb),
                    columns=data_pipe.get_feature_names())
y_train=fb[target]

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [10]:
lr=LinearRegression()

In [13]:
cvmae=-np.array(cross_val_score(lr,x_train,y_train,cv=10,
                      scoring='r2',
                     n_jobs=-1))

In [14]:
cvmae.mean()

-0.29759765287536666

In [15]:
cvmae.std()

0.09660287990951694

In [16]:
from sklearn.linear_model import Lasso,Ridge
from sklearn.model_selection import GridSearchCV

In [17]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [18]:
### lasso
# this will take longer time to finish in comparison to ridge because of not having a closed form solution

model=Lasso(fit_intercept=True)
params ={'alpha': np.linspace(1,100,100)}
gs=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring='r2',
                        verbose=20,n_jobs=-1)
gs.fit(x_train,y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


GridSearchCV(cv=10, estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100.])},
             scoring='r2', verbose=20)

In [19]:
report(gs.cv_results_,5)

Model with rank: 1
Mean validation score: 0.301228 (std: 0.096899)
Parameters: {'alpha': 1.0}

Model with rank: 2
Mean validation score: 0.300527 (std: 0.096313)
Parameters: {'alpha': 3.0}

Model with rank: 3
Mean validation score: 0.300492 (std: 0.096546)
Parameters: {'alpha': 4.0}

Model with rank: 4
Mean validation score: 0.300292 (std: 0.096818)
Parameters: {'alpha': 5.0}

Model with rank: 5
Mean validation score: 0.300270 (std: 0.096132)
Parameters: {'alpha': 2.0}



In [21]:
lasso_model=gs.best_estimator_
lasso_model

Lasso()

In [22]:
lasso_model.fit(x_train,y_train)

Lasso()

In [23]:
list(zip(data_pipe.get_feature_names(),lasso_model.coef_))

[('p1__Post Published Weekday_sin', -0.0),
 ('p1__Post Published Weekday_cos', -0.0),
 ('p1__Base Date Time Weekday_sin', 0.0),
 ('p1__Base Date Time Weekday_cos', -0.0),
 ('p2__page_category_Professional sports team', -0.0),
 ('p2__page_category_Musician/band', -0.0),
 ('p2__page_category_Artist', -0.0),
 ('p2__page_category_Political party', -0.0),
 ('p2__page_category_Community', -0.0),
 ('p2__page_category_Movie', 0.0),
 ('p2__page_category_Actor/director', -0.0),
 ('p2__page_category_University', 0.0),
 ('p2__page_category_Athlete', 0.0),
 ('p2__page_category_Public figure', -0.0),
 ('p2__page_category_Product/service', 0.0),
 ('p2__page_category_Non-profit organization', 0.0),
 ('p2__page_category_Church/religious organization', 0.0),
 ('p2__page_category_Record label', 0.0),
 ('p2__page_category_News/media website', -0.0),
 ('p2__page_category_Company', 0.0),
 ('p2__page_category_Entertainer', 0.0),
 ('p2__page_category_Personal blog', -0.0),
 ('p2__page_category_Education', 0.0

In [24]:
(lasso_model.coef_==0).sum()

47

In [25]:
from sklearn.metrics import r2_score

In [26]:
y_train_pred = lasso_model.predict(x_train)
# y_test_pred = lasso_model.predict(x_test)

In [27]:
train_score1 = r2_score(y_train, y_train_pred)
# test_score1 = r2_score(y_test, y_test_pred)

In [28]:
print (f'training data prediction score for r2 is = {train_score1}') # = 0.3202592463942481

training data prediction score for r2 is = 0.32282827538124603


In [None]:
### ridge

model=Ridge(fit_intercept=True)
params ={'alpha': np.linspace(1e7,1e9,100)}
gs=GridSearchCV(model,
                         param_grid=params,
                         cv=10,
                         scoring='neg_mean_absolute_error',
                        verbose=20,n_jobs=-1)
gs.fit(x_train,y_train)

In [None]:
report(gs.cv_results_,5)