# Imports

In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings ( "ignore" )
from tqdm import tqdm

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , roc_auc_score , confusion_matrix , classification_report
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

In [22]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier

# Data

In [103]:
train = pd.read_csv ( "train.csv" )
test = pd.read_csv ( "test.csv" )

X_train = train.drop ( "Popularity" , axis = 1 )
y_train = train [ "Popularity" ]
X_test = test.drop ( "Popularity" , axis = 1 )
y_test = test [ "Popularity" ]

# Trying out all models

In [66]:
models = {
    "LogisticRegression":LogisticRegression(random_state=5,n_jobs=-1),
    "SGDClassifier":SGDClassifier(random_state=5,n_jobs=-1),
    "DecisionTreeClassifier":DecisionTreeClassifier(random_state=5),
    "RandomForestClassifier":RandomForestClassifier(random_state=5,n_jobs=-1),
    "GaussianNB":GaussianNB(random_state=5),
    "KNeighborsClassifier":KNeighborsClassifier(random_state=5,n_jobs=-1),
    "AdaBoostClassifier":AdaBoostClassifier(random_state=5),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "XGBClassifier":XGBClassifier(random_state=5),
    "BaggingClassifier":BaggingClassifier(n_jobs=-1)
}

In [104]:
def compare_models(X_train,y_train,models):
    train_results ,test_results = [],[]
    print(f'Using {len(X_train.columns)} features...')
    for _,model in tqdm(models.items()):
        model.fit(X_train,y_train)
        train_predict = model.predict ( X_train )
        test_predict = model.predict ( X_test )
        train_results.append ( accuracy_score ( y_train , train_predict ) )
        test_results.append ( accuracy_score ( y_test , test_predict ) )
        
    comparison = pd.DataFrame ( )
    comparison ['model'] = models.keys()
    comparison [ "Train Score" ] = train_results
    comparison [ "Test Score" ] = test_results
    print(comparison.sort_values(by="Test Score",ascending=False))

In [105]:
compare_models(X_train,y_train,models)

Using 58 features...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:33<00:00,  3.38s/it]


                        model  Train Score  Test Score
8               XGBClassifier     0.684541    0.656970
7  GradientBoostingClassifier     0.684901    0.655288
6          AdaBoostClassifier     0.664937    0.649655
3      RandomForestClassifier     0.983820    0.620397
9           BaggingClassifier     0.984973    0.619472
0          LogisticRegression     0.607892    0.609887
2      DecisionTreeClassifier     1.000000    0.577266
4                  GaussianNB     0.572577    0.577098
5        KNeighborsClassifier     0.718739    0.567261
1               SGDClassifier     0.518054    0.522953


# Feature selection

In [126]:
# Removing highly correlated values as per the insights from EDA
# n_non_stop_words, n_non_stop_unique_tokens, n_unique_token
# is_weekend
# kw_max_avg
highcorr=['n_non_stop_words','n_non_stop_unique_tokens','is_weekend','kw_max_avg']

In [133]:
%%time
def run_rfe(X_train , y_train, model):
    rfc=model.fit(X_train,y_train)
    rfe = RFE ( rfc , n_features_to_select = 3, verbose=2 )
    rfe.fit ( X_train , y_train )
    return rfe
    
rfe = run_rfe(X_train , y_train,XGBClassifier(random_state=5))

Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 fe

In [96]:
d = pd.DataFrame ( { "Columns" : X_train.columns , 
               "Support" : rfe.support_  , 
             "Ranking" : rfe.ranking_ } )
d.sort_values(by="Ranking",inplace=True)

n = 20
d['Columns'][:n]

14            data_channel_is_world
10    data_channel_is_entertainment
13             data_channel_is_tech
15                       kw_min_min
22                       kw_avg_avg
23        self_reference_min_shares
25       self_reference_avg_sharess
21                       kw_min_avg
45            min_positive_polarity
5                          num_imgs
17                       kw_avg_min
18                       kw_min_max
3                         num_hrefs
19                       kw_max_max
20                       kw_avg_max
29              weekday_is_thursday
27               weekday_is_tuesday
28             weekday_is_wednesday
35                           LDA_02
2                   n_unique_tokens
Name: Columns, dtype: object

In [99]:
selected = d['Columns'][:n]

X_train = train[selected]
y_train = train [ "Popularity" ]
X_test = test[selected]
y_test = test [ "Popularity" ]

In [102]:
compare_models(X_train,y_train,models)

Using 20 features.


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.36s/it]


                        model  Train Score  Test Score
7  GradientBoostingClassifier     0.675315    0.654868
8               XGBClassifier     0.674919    0.654700
6          AdaBoostClassifier     0.658054    0.647637
3      RandomForestClassifier     0.983495    0.610896
9           BaggingClassifier     0.982126    0.605095
0          LogisticRegression     0.602919    0.601648
4                  GaussianNB     0.582198    0.581722
2      DecisionTreeClassifier     1.000000    0.564907
5        KNeighborsClassifier     0.721586    0.564486
1               SGDClassifier     0.495928    0.491172


In [108]:
n = 15
# print(d['Columns'][:n])
selected = d['Columns'][:n]

X_train = train[selected]
y_train = train [ "Popularity" ]
X_test = test[selected]
y_test = test [ "Popularity" ]
compare_models(X_train,y_train,models)

Using 15 features...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.21s/it]


                        model  Train Score  Test Score
8               XGBClassifier     0.665189    0.652262
7  GradientBoostingClassifier     0.665838    0.646797
6          AdaBoostClassifier     0.651784    0.637885
3      RandomForestClassifier     0.982919    0.607870
9           BaggingClassifier     0.982775    0.606524
0          LogisticRegression     0.604216    0.601564
4                  GaussianNB     0.582018    0.581638
2      DecisionTreeClassifier     1.000000    0.568438
5        KNeighborsClassifier     0.721586    0.564486
1               SGDClassifier     0.495928    0.491172


# Features engineering

#### Experiment 1

In [None]:
# Change is_monday, etc to days

def week_encode(msg):
    if msg.endswith('monday'):
        return 1
    elif msg.endswith('tuesday'):
        return 2
    elif msg.endswith('wednesday'):
        return 3
    elif msg.endswith('thursday'):
        return 4
    elif msg.endswith('friday'):
        return 5
    elif msg.endswith('saturday'):
        return 6
    elif msg.endswith('sunday'):
        return 7
    else: 
        return 0
    

In [None]:
# feature - Day
train1 = train.copy()
test1 = test.copy()

weeks = ["weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
"weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday",
"weekday_is_sunday"]


train1['day'] = train1.apply(lambda x: week_encode(np.argmax(x[weeks])),axis=1)
test1['day'] = test1.apply(lambda x: week_encode(np.argmax(x[weeks])),axis=1)

train1.drop(columns = highcorr+weeks,inplace=True)
test1.drop(columns = highcorr+weeks,inplace=True)

X_train = train1.drop ( "Popularity" , axis = 1 )
y_train = train1 [ "Popularity" ]
X_test = test1.drop ( "Popularity" , axis = 1 )
y_test = test1 [ "Popularity" ]

In [151]:
rfe=run_rfe(X_train,y_train,XGBClassifier(random_state=5))

Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 fe

#### Top N features

In [155]:
d = pd.DataFrame ( { "Columns" : X_train.columns , 
               "Support" : rfe.support_  , 
             "Ranking" : rfe.ranking_ } )
d.sort_values(by="Ranking",inplace=True)

n = 20
print(d['Columns'][:n])
selected = d['Columns'][:n]


14            data_channel_is_world
13             data_channel_is_tech
10    data_channel_is_entertainment
15                       kw_min_min
22                       kw_avg_avg
23        self_reference_min_shares
47                              day
25       self_reference_avg_sharess
2                   n_unique_tokens
38            min_positive_polarity
3                         num_hrefs
20                       kw_avg_max
17                       kw_avg_min
19                       kw_max_max
18                       kw_min_max
21                       kw_min_avg
28                           LDA_02
31              global_subjectivity
27                           LDA_01
5                          num_imgs
Name: Columns, dtype: object


In [156]:
X_train = train1[selected]
y_train = train1 [ "Popularity" ]
X_test = test1[selected]
y_test = test1 [ "Popularity" ]
compare_models(X_train,y_train,models)

Using 20 features...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.83s/it]


                        model  Train Score  Test Score
8               XGBClassifier     0.676685    0.650748
7  GradientBoostingClassifier     0.676144    0.650244
6          AdaBoostClassifier     0.659856    0.645788
9           BaggingClassifier     0.984649    0.610224
0          LogisticRegression     0.605297    0.604170
3      RandomForestClassifier     0.984324    0.601900
4                  GaussianNB     0.582523    0.580881
1               SGDClassifier     0.577261    0.580209
5        KNeighborsClassifier     0.721586    0.564486
2      DecisionTreeClassifier     1.000000    0.558265


### Experiment 2

In [161]:
channels = train.loc[:,"data_channel_is_lifestyle":"data_channel_is_world"].columns
print(channels)

def channel_encode(msg):
    if msg.endswith('lifestyle'):
        return 1
    elif msg.endswith('entertainment'):
        return 2
    elif msg.endswith('bus'):
        return 3
    elif msg.endswith('socmed'):
        return 4
    elif msg.endswith('tech'):
        return 5
    elif msg.endswith('world'):
        return 6
    else: 
        return 0

Index(['data_channel_is_lifestyle', 'data_channel_is_entertainment',
       'data_channel_is_bus', 'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world'],
      dtype='object')


In [None]:
# feature - Day
train1['channels'] = train1.apply(lambda x: channel_encode(np.argmax(x[channels])),axis=1)
test1['channels'] = test1.apply(lambda x: channel_encode(np.argmax(x[channels])),axis=1)

# drop various channels
for df in [train1,test1]:
    df.drop(columns=channels,inplace=True)

In [189]:
X_train = train1.drop ( "Popularity" , axis = 1 )
y_train = train1 [ "Popularity" ]
X_test = test1.drop ( "Popularity" , axis = 1 )
y_test = test1 [ "Popularity" ]

In [177]:
rfe=run_rfe(X_train,y_train,XGBClassifier(random_state=5))

Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 fe

In [191]:
d = pd.DataFrame ( { "Columns" : X_train.columns , 
               "Support" : rfe.support_  , 
             "Ranking" : rfe.ranking_ } )
d.sort_values(by="Ranking",inplace=True)

n = 20
print(d['Columns'][:n])
selected = d['Columns'][:n]

X_train = train1[selected]
y_train = train1 [ "Popularity" ]
X_test = test1[selected]
y_test = test1 [ "Popularity" ]

42                      channels
17     self_reference_min_shares
16                    kw_avg_avg
41                           day
9                     kw_min_min
19    self_reference_avg_sharess
24                        LDA_04
32         min_positive_polarity
15                    kw_min_avg
13                    kw_max_max
2                n_unique_tokens
3                      num_hrefs
21                        LDA_01
20                        LDA_00
14                    kw_avg_max
11                    kw_avg_min
22                        LDA_02
12                    kw_min_max
5                       num_imgs
25           global_subjectivity
Name: Columns, dtype: object


In [192]:
compare_models(X_train,y_train,models)

Using 20 features...




  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

 10%|████████▎                                                                          | 1/10 [00:00<00:07,  1.15it/s]

 20%|████████████████▌                                                                  | 2/10 [00:02<00:07,  1.01it/s]

 30%|████████████████████████▉                                                          | 3/10 [00:02<00:06,  1.14it/s]

 40%|█████████████████████████████████▏                                                 | 4/10 [00:03<00:04,  1.34it/s]

 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:05<00:03,  1.21it/s]

 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:08<00:04,  1.45s/it]

 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:11<00:04,  2.13s/it]

 90%|█████████████████████████

                        model  Train Score  Test Score
8               XGBClassifier     0.673910    0.648730
7  GradientBoostingClassifier     0.675568    0.645367
6          AdaBoostClassifier     0.657766    0.637296
3      RandomForestClassifier     0.984000    0.610896
0          LogisticRegression     0.605910    0.605179
9           BaggingClassifier     0.983243    0.604086
4                  GaussianNB     0.582162    0.580797
2      DecisionTreeClassifier     1.000000    0.568438
5        KNeighborsClassifier     0.721586    0.564486
1               SGDClassifier     0.503387    0.497057


# Parameter tuning

In [77]:
compare_models(X_train,y_train,models)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.58s/it]


                        model  Train Score  Test Score
7  GradientBoostingClassifier     0.636360    0.613334
8               XGBClassifier     0.632973    0.613250
6          AdaBoostClassifier     0.616973    0.608626
0          LogisticRegression     0.592757    0.591307
3      RandomForestClassifier     0.982559    0.571885
4                  GaussianNB     0.559495    0.562637
1               SGDClassifier     0.565117    0.560955
9           BaggingClassifier     0.981045    0.560367
5        KNeighborsClassifier     0.710126    0.553809
2      DecisionTreeClassifier     1.000000    0.541365


Removing highly correlated values as per the insights from EDA

X_train 

In [None]:
X_train 