# Gift Recommender Engine: Evaluation

Use celebrities' tweets to infer their interests.

## Import Libraries

In [31]:
import numpy as np
import pandas as pd

In [32]:
# Location of the exported tweet CSV, relative to the notebook's working directory.
TWEETS_CSV = 'datasets/twitter-profiles/report_barackobama/tweets.csv'
df = pd.read_csv(TWEETS_CSV)

In [33]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Tweet Id,Tweet URL,Tweet Posted Time,Tweet Content,Tweet Type,Client,Retweets received,Likes received,User Id,Name,Username,Verified or Non-Verified,Profile URL,Protected or Not Protected
0,"""1219267208250941440""",https://twitter.com/barackobama/status/1219267...,2020-01-20 14:35:29,"""Every so often, I re-read Dr. King’s Letter f...",Tweet,Twitter for iPhone,50301,239729,"""813286""","""Barack Obama""",barackobama,Verified,https://twitter.com/barackobama,Not Protected
1,"""1218174463046553600""",https://twitter.com/barackobama/status/1218174...,2020-01-17 14:13:19,"""In every scene, you are my star, @MichelleOba...",Tweet,Twitter for iPhone,394988,2758964,"""813286""","""Barack Obama""",barackobama,Verified,https://twitter.com/barackobama,Not Protected
2,"""1216725015695183872""",https://twitter.com/barackobama/status/1216725...,2020-01-13 14:13:44,"""Glad to see American Factory’s Oscar nod for ...",Tweet,Twitter for iPhone,11017,127215,"""813286""","""Barack Obama""",barackobama,Verified,https://twitter.com/barackobama,Not Protected
3,"""1216428587328458754""",https://twitter.com/barackobama/status/1216428...,2020-01-12 18:35:50,"""Our fellow Americans in Puerto Rico can use o...",Tweet,Twitter for iPhone,31512,135165,"""813286""","""Barack Obama""",barackobama,Verified,https://twitter.com/barackobama,Not Protected
4,"""1215378018790707205""",https://twitter.com/barackobama/status/1215378...,2020-01-09 21:01:14,"""Here’s the thing: Even with problems of this ...",Reply,Twitter for iPhone,6389,36772,"""813286""","""Barack Obama""",barackobama,Verified,https://twitter.com/barackobama,Not Protected


## Importing Models

In [30]:
import pickle

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes, even if unpickling raises.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
    file, so only load artifacts that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Naive Bayes Model
nb = _load_pickle('models/nb_baseline2.sav')

# Support Vector Classifier Model
ovr_svc = _load_pickle('models/linear_svc_baseline2.sav')

# Import Vectorizer
tfidf_model = _load_pickle('models/tfidf_vectorizer2.sav')

# Import Reference Dictionary (used later to turn a predicted label into a topic name)
ref = _load_pickle('models/reference-dict.pickle')

In [34]:
import re
import string
import nltk
import spacy
from nltk.probability import FreqDist

# NLTK's English stopword list plus a few informal fillers.
stopwords = nltk.corpus.stopwords.words('english') + [
    'im', "oh", "i'm", "lol", "gonna", 'ill',
]

# spaCy pipeline used for lemmatization.
nlp = spacy.load('en_core_web_sm')

def spacy_lemmatize(text):
    """Return the spaCy lemma of every token in ``text``.

    Args:
        text: Either a string, or a list of tokens. A list is joined with
            single spaces before being parsed.

    Returns:
        list: One ``token.lemma_`` string per token produced by the
        module-level ``nlp`` pipeline, in document order.
    """
    # isinstance, unlike an exact type comparison, also accepts list subclasses.
    if isinstance(text, list):
        text = ' '.join(text)
    doc = nlp("{}".format(text))
    return [token.lemma_ for token in doc]

def deEmojify(text):
    """Return ``text`` with every run of characters in four emoji blocks removed.

    Only the code-point ranges listed below are stripped; symbols outside
    them are left untouched.
    """
    emoji_runs = (
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+"
    )
    return re.sub(emoji_runs, '', text, flags=re.UNICODE)

def preprocess(text):
    """Normalize a raw tweet into a space-separated string of lemmas.

    Steps, in order: remove links, @mentions and HTML-escaped tags; strip
    punctuation and emoji; lowercase; blank out digits; drop one-character
    tokens and stopwords; lemmatize with ``spacy_lemmatize``; finally keep
    only lemmas longer than two characters.

    Args:
        text (str): Raw tweet text.

    Returns:
        str: The cleaned tweet, with lemmas joined by single spaces (empty
        string when nothing survives the filters).
    """
    text = re.sub(r'http\S+', '', text)  # links
    # Raw string literal: '\s' inside a plain string is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r'@[^\s]+', '', text)  # @mentions
    text = re.sub(r'&lt;/?[a-z]+&gt;', '', text)  # HTML-escaped tags such as &lt;br&gt;
    text = text.replace('&amp', '&')
    text = re.sub(r"[^\w\s]", "", text)  # punctuation
    text = deEmojify(text)

    tokens = [token.lower() for token in text.split()]
    # Digits are replaced by spaces, so a token containing digits may break
    # into several pieces; re-tokenize after the substitution.
    tokens = [re.sub(r'[0-9]', ' ', token) for token in tokens]
    tokens = ' '.join(tokens).split()
    # Drop one-character tokens and stopwords before lemmatizing.
    tokens = [token for token in tokens if len(token) >= 2 and token not in stopwords]

    lemmatized = ' '.join(spacy_lemmatize(tokens))
    # Lemmatization can shorten tokens, so apply a stricter length filter afterwards.
    return ' '.join([lemma for lemma in lemmatized.split() if len(lemma) > 2])

In [35]:
# Add the cleaned text to the raw frame, then keep only the raw/cleaned pair
# of columns under shorter names for the rest of the analysis.
df['clean-tweets'] = df['Tweet Content'].map(preprocess)
tweets = (
    df.loc[:, ['Tweet Content', 'clean-tweets']]
    .rename(columns={'Tweet Content': 'tweet'})
)

In [36]:
# Flat list of every token across all cleaned tweets.
all_words = ' '.join(tweets['clean-tweets'].to_list()).split()

# Whitespace-delimited word counts, before and after cleaning.
tweet_length = [len(raw.split()) for raw in tweets['tweet']]
clean_tweet_length = [len(cleaned.split()) for cleaned in tweets['clean-tweets']]

tweets['tweet-len'] = tweet_length
tweets['clean-len'] = clean_tweet_length

In [37]:
# Keep tweets with at least five tokens left after cleaning.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise pandas' SettingWithCopyWarning.
tweets = tweets[tweets['clean-len'] >= 5].copy()

In [38]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Score the raw (uncleaned) tweet text; each cell holds the dict returned by
# polarity_scores.
tweets['vader-sentiment'] = tweets['tweet'].apply(analyzer.polarity_scores)

# Unpack each score from the dict into its own 'vader-<key>' column.
for score_key in ('pos', 'neu', 'neg', 'compound'):
    tweets['vader-' + score_key] = tweets['vader-sentiment'].apply(
        lambda scores, key=score_key: scores[key]
    )

In [39]:
# Keep only tweets whose VADER compound score is at least 0.6.
tweets_filtered = tweets.loc[tweets['vader-compound'].ge(0.6)]
tweets_filtered.shape

(679, 9)

In [40]:
clean_tweets = tweets_filtered['clean-tweets'].to_list()

if clean_tweets:
    # Vectorize the whole batch once and share the features between both
    # classifiers, instead of re-running the TF-IDF transform twice per tweet.
    # NOTE(review): assumes scikit-learn-style estimators whose predict()
    # returns one label per input row -- confirm against the pickled models.
    tweet_features = tfidf_model.transform(clean_tweets)
    # `ref` maps each predicted label to its topic name.
    nb_topic_pred = [ref[label] for label in nb.predict(tweet_features)]
    svc_topic_pred = [ref[label] for label in ovr_svc.predict(tweet_features)]
else:
    # Nothing passed the filters: keep both result lists empty rather than
    # asking the models to predict on zero rows.
    nb_topic_pred = list()
    svc_topic_pred = list()

In [41]:
# Three most frequently predicted topics from the Naive Bayes model.
nb_topic_counts = pd.Series(nb_topic_pred).value_counts()
nb_series = nb_topic_counts.head(3)
nb_series

Books       269
Sports       82
Business     74
dtype: int64

In [43]:
# Three most frequently predicted topics from the support vector classifier.
svc_topic_counts = pd.Series(svc_topic_pred).value_counts()
svc_series = svc_topic_counts.head(3)
svc_series

Self-care    181
Sports        85
Nature        83
dtype: int64

In [None]:
# TODO: train an LDA model on the Amazon dataset -> apply it to the tweets to identify keywords -> feed those keywords to the classifier