In [98]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
import pandas as pd
import pickle
import sklearn

In [99]:
# Load the cleaned tweet dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='data/clean_data.csv', index_col=0)

In [100]:
# Preview the three rows with the highest like counts.
df.sort_values(by='likes_count', ascending=False).head(3)

Unnamed: 0,id,date,user_id,username,name,tweet,replies_count,retweets_count,likes_count
256,1542206287915585536,2022-06-29,1255794072280842240,heavymental_es,Heavy Mental,El sentido de la vida ya no es 42 -- Light Men...,2,3,23
244,1541678686751133698,2022-06-28,1162694149956603904,thebridge_tech,The Bridge,Gracias @HeavyMental_es por la tarde de ayer y...,0,4,22
195,1541046264539893760,2022-06-26,1255794072280842240,heavymental_es,Heavy Mental,"🕣 TIC, TAC: MAÑANA os vemos a todos a las 7.30...",6,7,17


In [101]:
# Take the text of the three most-liked tweets by rank rather than by
# hard-coded index labels, so the cell still selects the right rows if the
# CSV is regenerated. The frame is sorted once instead of once per tweet, and
# unpacking the column avoids positional `[0]` lookups on a label-indexed
# Series.
top_liked_tweets = df.sort_values(by='likes_count', ascending=False)['tweet'].head(3)
tweet_1, tweet_2, tweet_3 = top_liked_tweets

In [102]:
def spanish_stemmer(x):
    """Stem each whitespace-separated token of ``x`` with the Spanish Snowball stemmer.

    Args:
        x: Text to process (a string).

    Returns:
        The stemmed tokens joined by single spaces.
    """
    snowball = SnowballStemmer('spanish')
    stems = map(snowball.stem, x.split())
    return " ".join(stems)

def remove_stopwords(df):
    """Remove Spanish stopwords from a text.

    Args:
        df: The text to filter. Despite the parameter name this is a plain
            string (it is split on whitespace), not a DataFrame; the name is
            kept so existing callers keep working.

    Returns:
        The remaining tokens joined by single spaces. Matching is exact and
        case-sensitive, so the text should already be lower-cased.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every token.
    spanish_stopwords = set(stopwords.words('spanish'))
    return " ".join(word for word in df.split() if word not in spanish_stopwords)

def remove_links(df):
    """Mask links in a text.

    Every whitespace-separated token that contains the substring ``http`` is
    replaced by the literal placeholder ``{link}``; other tokens are kept.

    Args:
        df: The text to process (a string, despite the parameter name).

    Returns:
        The processed tokens joined by single spaces.
    """
    masked_tokens = []
    for token in df.split():
        if 'http' in token:
            masked_tokens.append('{link}')
        else:
            masked_tokens.append(token)
    return " ".join(masked_tokens)

def signs_tweets(tweet):
    """Lower-case a tweet and strip punctuation signs and digits.

    The characters ``. ; : ! ? ¿ @ , " ( ) [ ]`` and every run of digits are
    deleted (not replaced by a space), so surrounding whitespace is left
    untouched. Other symbols, including the opening ``¡``, are kept.

    Args:
        tweet: The raw tweet text.

    Returns:
        The lower-cased text with the signs and digits removed.
    """
    # Raw string + character class: matches the same characters as the former
    # escaped alternation without relying on invalid string escapes such as
    # "\." or "\d", which emit SyntaxWarning on recent Python versions.
    signos = re.compile(r'[.;:!?¿@,"()\[\]]|\d+')
    return signos.sub('', tweet.lower())

In [103]:
# Load the pickled model that the cells below use for predict_proba.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source. Presumably this is a
# scikit-learn estimator; such pickles are only dependable with the library
# version that wrote them — confirm the environment matches.
filename = 'finished_model.model'
with open(filename, 'rb') as archivo_entrada:
    model = pickle.load(archivo_entrada)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [105]:
tweets = [tweet_1, tweet_2, tweet_3]

# Build the result frame in one step instead of appending one row per tweet.
df_tweets = pd.DataFrame({'content': tweets})

# Cleaning pipeline, applied element-wise in this order:
#   1. strip punctuation signs and digits (also lower-cases the text)
#   2. mask links
#   3. drop stopwords
#   4. stem the remaining tokens
df_tweets['content_clean'] = (
    df_tweets['content']
    .apply(signs_tweets)
    .apply(remove_links)
    .apply(remove_stopwords)
    .apply(spanish_stemmer)
)

# A single batched call: row i of `probabilities` belongs to tweet i.
probabilities = model.predict_proba(df_tweets['content_clean'])

# NOTE(review): predict_proba columns follow the order of model.classes_.
# Column 0 is labelled "positive" here exactly as the original cell did —
# confirm against model.classes_ that the labels are not swapped.
df_tweets['Polarity_Pos'] = probabilities[:, 0]
df_tweets['Polarity_Neg'] = probabilities[:, 1]
    

#  ¿Cuáles son las predicciones? Interpreta los resultados.

In [114]:
# Display each tweet's original text, cleaned text and the two predicted probabilities.
df_tweets

Unnamed: 0,content,content_clean,Polarity_Pos,Polarity_Neg
0,El sentido de la vida ya no es 42 -- Light Men...,vid -- light mental edicion especial thebridge...,0.69511,0.30489
1,Gracias @HeavyMental_es por la tarde de ayer y...,graci heavymental_ tard ayer asistent ¡exit ab...,0.971576,0.028424
2,"🕣 TIC, TAC: MAÑANA os vemos a todos a las 7.30...",🕣 tic tac mañan vem pm event direct madr thebr...,0.754592,0.245408


# ¿Qué variables han podido influir más en las predicciones del modelo? 

In [155]:
# Inspect the fitted coefficients of the element at position 1 of `model`
# (presumably the final estimator of a pipeline — confirm).
model[1].coef_

array([[-1.61173785e-01,  5.82203991e-01,  4.98376702e-01,
        -6.15336877e-02,  1.95989011e-01, -4.69166313e-01,
         4.57428613e-01, -6.43551319e-01, -9.27232445e-01,
        -3.46981965e-01, -3.28751733e-01, -4.75327048e-02,
         9.01718129e-01,  1.80381663e-01, -6.56063195e-01,
        -8.96960588e-01,  1.30397517e-01,  2.55499683e-01,
        -1.64743026e-01,  2.63246950e-01, -8.72093881e-02,
         7.85050090e-02, -6.90934888e-02,  7.42152959e-01,
         1.26495361e-01,  3.15766193e-02, -3.93879914e-02,
         5.30747943e-02,  7.06051685e-01,  2.43757248e-01,
         4.48428396e-01,  9.16904155e-01,  4.84511393e-02,
        -1.65807955e-01,  1.27586721e-02,  1.65764831e-01,
         8.06189775e-01, -3.16105481e-01, -3.54163018e-01,
        -5.89289951e-01, -1.25421614e+00,  3.52686718e-01,
         1.22434243e-01, -2.60070483e-01, -6.95110494e-02,
         5.07409328e-02,  9.99339372e-02,  5.16715141e-02,
         8.07647959e-01,  9.42101428e-01, -1.03390589e+0