In [14]:
import Utils as utils
import TFIDF_Regression_Utils as reg_utils

import pandas as pd
import numpy as np
from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

import matplotlib
matplotlib.use('MacOSX')
%matplotlib inline
from matplotlib import pyplot as plt

In [30]:
###### TO LOAD PIPELINE ########
# pipeline = joblib.load('/home/Virality_Predictor/models/Regression_PT_pipeline.pkl') 

In [16]:
# Load articles
shared_articles_df = utils.load_shared_articles('shared_articles.csv', ['pt'])

# Load users with raw virality values
user_interactions_df_raw = utils.load_user_interactions('users_interactions.csv', shared_articles_df, to_normalize=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['domain_name'] = df['url'].apply(split_url)


In [17]:
# Calculate virality classses
shared_articles_df = reg_utils.calculate_virality(shared_articles_df, user_interactions_df_raw)

In [18]:
# Create train and test sets
articles_train, labels_train, articles_test, labels_test = reg_utils.get_train_test_datasets(shared_articles_df)

Dataset shapes: Train data 658 , Test data 165 , Train labels  658 , Test labels 165


In [19]:
# Load cleaned text from /datasets
train_df = reg_utils.clean_all_text(load=True, df=articles_train, name='articles_train_PT', lang='pt')
test_df = reg_utils.clean_all_text(load=True, df=articles_test, name='articles_test_PT', lang='pt')

In [20]:
# Parameters set for grid search

parameters = {
'model__learning_rate': ('constant', 'optimal', 'adaptive'),
'model__eta0': (0.1, 0.01, 0.001),  
}

In [21]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 50,
                        ngram_range = (1, 1),
                        min_df = 3,
                        max_df = 200)),
     ('model', SGDRegressor())
   ])

In [22]:
# Run grid search for the given model

reg_utils.hyperparameter_search(parameters, pipeline, train_df, labels_train)



model__eta0: 0.01
model__learning_rate: 'constant'
{'mean_fit_time': array([0.80932895, 0.7436436 , 0.91354545, 1.19212723, 0.99094447,
       0.8367641 , 0.79306777, 1.09811393, 0.70732498]), 'std_fit_time': array([0.0196803 , 0.0627941 , 0.3081662 , 0.04540936, 0.08224536,
       0.16580063, 0.08934602, 0.07202808, 0.01401557]), 'mean_score_time': array([0.26346238, 0.22894001, 0.25885757, 0.34275015, 0.21876391,
       0.24876817, 0.33586971, 0.35275412, 0.24687433]), 'std_score_time': array([0.01826992, 0.03894604, 0.02152423, 0.07957141, 0.0447089 ,
       0.04107295, 0.07330255, 0.05964201, 0.00336226]), 'param_model__eta0': masked_array(data=[0.1, 0.1, 0.1, 0.01, 0.01, 0.01, 0.001, 0.001, 0.001],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_model__learning_rate': masked_array(data=['constant', 'optimal', 'adaptive', 'constant',
                   'optimal', 'adaptive



In [23]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features = 50,
                        ngram_range = (1, 1),
                        min_df = 3,
                        max_df = 200)),
     ('model', SGDRegressor(learning_rate= 'adaptive',
                            eta0= 0.01))
   ])

In [24]:
pipeline.fit(train_df, labels_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=200, max_features=50, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...m_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False))])

In [25]:
pipeline.score(test_df, labels_test)

0.05613324851212265

In [26]:
labels_pred = pipeline.predict(test_df)

In [27]:
mean_squared_error(labels_pred, labels_test)

0.0007041780291948364

In [28]:
pipeline.named_steps['tfidf'].get_feature_names()

['agora',
 'alguns',
 'ante',
 'aqui',
 'caso',
 'cliente',
 'clientes',
 'coisas',
 'criar',
 'dado',
 'dentro',
 'desenvolvimento',
 'deve',
 'digital',
 'diz',
 'então',
 'gestão',
 'google',
 'grande',
 'hoje',
 'informações',
 'internet',
 'menos',
 'milhões',
 'negócio',
 'negócios',
 'novo',
 'onde',
 'outras',
 'parte',
 'paulo',
 'plataforma',
 'possível',
 'primeiro',
 'processo',
 'produtos',
 'projeto',
 'projetos',
 'qualquer',
 'rede',
 'segundo',
 'seguro',
 'sempre',
 'serviços',
 'sistema',
 'trabalho',
 'tudo',
 'usuário',
 'valor',
 'vida']

In [29]:
joblib.dump(pipeline, '/home/Virality_Predictor/models/Regression_PT_pipeline.pkl')

['Regression_PT_pipeline.pkl']