In [11]:
import Utils as utils
import TFIDF_Classification_Utils as cls_utils

import pandas as pd
import numpy as np
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package; fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, classification_report

import matplotlib
matplotlib.use('MacOSX')
%matplotlib inline
from matplotlib import pyplot as plt

In [12]:
###### TO LOAD PIPELINE ########
# Uncomment the line below to reuse a previously saved pipeline.
# Only load pickles you trust: joblib.load can execute code embedded in the file.
# pipeline = joblib.load('/home/Virality_Predictor/models/Classification_EN_pipeline.pkl') 

In [13]:
# Shared articles, restricted to the 'en' entry of the language list
shared_articles_df = utils.load_shared_articles(
    'shared_articles.csv',
    ['en'],
)

# User interactions for those articles, with virality values left raw (to_normalize=False)
user_interactions_df_raw = utils.load_user_interactions(
    'users_interactions.csv',
    shared_articles_df,
    to_normalize=False,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['domain_name'] = df['url'].apply(split_url)


In [14]:
# Calculate virality classes from the raw user interactions; the returned frame
# exposes the `virality` column inspected in the following cells.
# NOTE(review): this rebinds shared_articles_df, so re-running the cell passes the
# already-processed frame back in — confirm calculate_virality tolerates that.
shared_articles_df = cls_utils.calculate_virality(shared_articles_df, user_interactions_df_raw)


In [15]:
# Number of articles that carry a virality value
shared_articles_df['virality'].shape

(2149,)

In [16]:
# Article count per virality class (before any resampling)
shared_articles_df.groupby("virality").size()

virality
1    2124
2      21
3       3
9       1
dtype: int64

In [17]:
# Upsample the minority virality classes, then re-check the per-class counts
shared_articles_df_upsampled = cls_utils.upsample_minority_classes(shared_articles_df)
shared_articles_df_upsampled.groupby("virality").size()

virality
1    2124
2    2103
3    2121
9    2123
dtype: int64

In [18]:
# Create train and test sets
# NOTE(review): the split is taken from the already-upsampled frame. Class 9 has a
# single original article, yet the test set holds 418 class-9 rows, so copies of the
# same article presumably end up in both train and test and the test metrics may not
# reflect performance on unseen articles. Consider splitting before upsampling —
# TODO confirm how upsample_minority_classes generates the extra rows.
articles_train, labels_train, articles_test, labels_test = cls_utils.get_train_test_datasets(shared_articles_df_upsampled)

Dataset shapes: Train data 6776 , Test data 1695 , Train labels  6776 , Test labels 1695


In [19]:
# Cleaned article text for both splits, loaded from /datasets (load=True)
train_df = cls_utils.clean_all_text(
    load=True,
    df=articles_train,
    name='articles_train_EN_upsampled',
    lang='en',
)
test_df = cls_utils.clean_all_text(
    load=True,
    df=articles_test,
    name='articles_test_EN_upsampled',
    lang='en',
)

In [20]:
# Per-class row counts of the training labels
x = labels_train.to_frame(name='virality')
x.groupby('virality').size()

virality
1    1703
2    1670
3    1698
9    1705
dtype: int64

In [21]:
# Per-class row counts of the test labels
x = labels_test.to_frame(name='virality')
x.groupby('virality').size()

virality
1    421
2    433
3    423
9    418
dtype: int64

In [22]:
# Grid explored by the hyper-parameter search. The 'model__' prefix targets the
# pipeline step named 'model' (the KNN classifier).
parameters = dict(
    model__n_neighbors=(5, 10, 40),
    model__weights=('uniform', 'distance'),
)

In [23]:
# Search pipeline: TF-IDF features feeding a KNN classifier with default settings
# (the grid search varies the 'model' step's parameters).
pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            max_features=50,     # vocabulary capped at 50 terms
            ngram_range=(1, 1),  # unigrams only
            min_df=3,            # integer => absolute document count
            max_df=200,          # integer => absolute document count
        ),
    ),
    ('model', KNeighborsClassifier()),
])

In [24]:
# Run grid search for the given model: searches the `parameters` grid over `pipeline`
# using the cleaned training text and labels. The output below lists the selected
# parameter values followed by the per-candidate results.
cls_utils.hyperparameter_search(parameters, pipeline, train_df, labels_train)

model__n_neighbors: 5
model__weights: 'distance'
{'mean_fit_time': array([12.86554758, 13.78421696, 15.40431341, 18.5750649 , 19.12839087,
       15.51068338]), 'std_fit_time': array([0.04760908, 0.65515824, 1.3782171 , 0.28405418, 0.04351134,
       3.37013161]), 'mean_score_time': array([ 8.24771452,  8.20206873, 10.50406122, 13.24259241, 14.27251466,
       10.50388495]), 'std_score_time': array([0.04806108, 0.54700922, 2.09741859, 0.38145993, 0.11187266,
       0.98283225]), 'param_model__n_neighbors': masked_array(data=[5, 5, 10, 10, 40, 40],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_model__weights': masked_array(data=['uniform', 'distance', 'uniform', 'distance',
                   'uniform', 'distance'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'model__n_neighbors': 5, 'model__weights': 'uniform'}, {'model__n_neighbors': 

In [25]:
# New pipeline with optimum parameters from grid search

In [26]:
# Final pipeline: same TF-IDF settings as the search pipeline, with the KNN step
# fixed to the values reported by the grid search (n_neighbors=5, weights='distance').
pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            max_features=50,
            ngram_range=(1, 1),
            min_df=3,
            max_df=200,
        ),
    ),
    (
        'model',
        KNeighborsClassifier(n_neighbors=5, weights='distance'),
    ),
])

In [27]:
# Fit the TF-IDF step and the KNN step on the cleaned training text
pipeline.fit(X=train_df, y=labels_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=200, max_features=50, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...i',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='distance'))])

In [28]:
# Predict virality classes for the held-out test articles
labels_pred = pipeline.predict(X=test_df)

In [29]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_true=labels_test, y_pred=labels_pred))

              precision    recall  f1-score   support

           1       1.00      0.63      0.78       421
           2       0.99      0.70      0.82       433
           3       0.38      1.00      0.55       423
           9       0.00      0.00      0.00       418

   micro avg       0.58      0.58      0.58      1695
   macro avg       0.59      0.58      0.53      1695
weighted avg       0.60      0.58      0.54      1695



  'precision', 'predicted', average, warn_for)


In [30]:
# Single summary number: F1 averaged over classes, weighted by class support
f1_score(y_true=labels_test, y_pred=labels_pred, average='weighted')

  'precision', 'predicted', average, warn_for)


0.5379195267337858

In [31]:
# Show the vocabulary terms retained by the fitted TF-IDF step.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when available and fall back to the
# old one so the cell works on either version. Both branches display a plain list.
tfidf_step = pipeline.named_steps['tfidf']
if hasattr(tfidf_step, 'get_feature_names_out'):
    feature_names = tfidf_step.get_feature_names_out().tolist()
else:
    feature_names = tfidf_step.get_feature_names()
feature_names

['ad',
 'afraid',
 'ambiguity',
 'analytics',
 'angular',
 'assert',
 'assertion',
 'bdd',
 'bitcoin',
 'blockchain',
 'bock',
 'callback',
 'changelog',
 'compete',
 'couchdb',
 'deepequal',
 'destination',
 'drupal',
 'electron',
 'elixir',
 'enforce',
 'fixture',
 'frontend',
 'growth',
 'handler',
 'humility',
 'icon',
 'independent',
 'jquery',
 'kurzweil',
 'le',
 'liquibase',
 'logical',
 'martin',
 'meeting',
 'mysql',
 'npm',
 'override',
 'parent',
 'procedure',
 'progressive',
 'qunit',
 'responsible',
 'ruby',
 'science',
 'superior',
 'synchronous',
 'typescript',
 'vr',
 'xml']

In [32]:
import os

# Persist the fitted pipeline to disk.
MODEL_PATH = '/home/Virality_Predictor/models/Classification_EN_pipeline.pkl'
# joblib.dump does not create missing directories, so make sure the target folder
# exists first instead of failing after the whole training run.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)

['Classification_EN_pipeline.pkl']