In [18]:
import Utils as utils
import TFIDF_Regression_Utils as reg_utils

import pandas as pd
import numpy as np
from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

import matplotlib
matplotlib.use('MacOSX')
%matplotlib inline
from matplotlib import pyplot as plt

In [19]:
###### TO LOAD PIPELINE ########
# pipeline = joblib.load('/home/Virality_Predictor/models/Regression_EN_pipeline.pkl') 

In [2]:
# Load articles
shared_articles_df = utils.load_shared_articles('shared_articles.csv', ['en'])

# Load users with raw virality values
user_interactions_df_raw = utils.load_user_interactions('users_interactions.csv', shared_articles_df, to_normalize=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['domain_name'] = df['url'].apply(split_url)


In [3]:
# Calculate normalized virality 
shared_articles_df = reg_utils.calculate_virality(shared_articles_df, user_interactions_df_raw)

In [4]:
# Create train and test sets
articles_train, labels_train, articles_test, labels_test = reg_utils.get_train_test_datasets(shared_articles_df)

Dataset shapes: Train data 1719 , Test data 430 , Train labels  1719 , Test labels 430


In [5]:
# Load cleaned text from /datasets
train_df = reg_utils.clean_all_text(load=True, df=articles_train, name='articles_train_EN', lang='en')
test_df = reg_utils.clean_all_text(load=True, df=articles_test, name='articles_test_EN', lang='en')

In [6]:
# Parameters set for grid search

parameters = {
'model__learning_rate': ('constant', 'optimal', 'adaptive'),
'model__eta0': (0.1, 0.01, 0.001),  
}

In [7]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 50,
                        ngram_range = (1, 1),
                        min_df = 3,
                        max_df = 200)),
     ('model', SGDRegressor())
   ])

In [8]:
# Run grid search for the given model

reg_utils.hyperparameter_search(parameters, pipeline, train_df, labels_train)

model__eta0: 0.01
model__learning_rate: 'constant'
{'mean_fit_time': array([2.39180334, 2.13651745, 2.09034864, 2.27103209, 2.45637441,
       2.58608135, 2.70478233, 2.68191973, 2.0196677 ]), 'std_fit_time': array([0.05697837, 0.12000692, 0.1927234 , 0.07343953, 0.10380488,
       0.18528218, 0.26356858, 0.29042156, 0.02068962]), 'mean_score_time': array([0.92423217, 1.19902293, 1.26872333, 1.00620929, 1.09202131,
       1.07623299, 1.03056113, 1.30037705, 0.78072262]), 'std_score_time': array([0.08482565, 0.13842939, 0.25081099, 0.09718332, 0.08914669,
       0.13846503, 0.05530157, 0.25807994, 0.07162826]), 'param_model__eta0': masked_array(data=[0.1, 0.1, 0.1, 0.01, 0.01, 0.01, 0.001, 0.001, 0.001],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_model__learning_rate': masked_array(data=['constant', 'optimal', 'adaptive', 'constant',
                   'optimal', 'adaptive



In [9]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 50,
                        ngram_range = (1, 1),
                        min_df = 3,
                        max_df = 200)),
     ('model', SGDRegressor(learning_rate= 'constant',
                            eta0= 0.01))
   ])

In [10]:
pipeline.fit(train_df, labels_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=200, max_features=50, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...m_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False))])

In [11]:
pipeline.score(test_df, labels_test)

0.037858791306834116

In [12]:
labels_pred = pipeline.predict(test_df)

In [13]:
pipeline.named_steps['tfidf'].get_feature_names()

['ad',
 'ai',
 'algorithm',
 'analytics',
 'architecture',
 'artificial',
 'aws',
 'bank',
 'bitcoin',
 'blockchain',
 'bot',
 'brand',
 'browser',
 'button',
 'car',
 'card',
 'class',
 'command',
 'compute',
 'container',
 'database',
 'docker',
 'drupal',
 'element',
 'enterprise',
 'error',
 'financial',
 'growth',
 'host',
 'input',
 'java',
 'javascript',
 'layer',
 'leader',
 'load',
 'map',
 'marketing',
 'module',
 'neural',
 'node',
 'pattern',
 'percent',
 'query',
 'rule',
 'sale',
 'science',
 'storage',
 'stream',
 'study',
 'window']

In [14]:
joblib.dump(pipeline, '/home/Virality_Predictor/models/Regression_EN_pipeline.pkl')

['Regression_EN_pipeline.pkl']