# Gift Recommender Engine: Obtain Twitter Users

## Import Libraries and Scrape User Twitter

In [9]:
import tweepy
import pickle
from scripts.keys import *

def get_user_tweets(user_id, consumer_key, consumer_secret, access_token, access_token_secret):
    """Fetch one page of a user's timeline through tweepy.

    Args:
        user_id: Passed to the API as ``screen_name``.
        consumer_key, consumer_secret: Credentials for ``tweepy.OAuthHandler``.
        access_token, access_token_secret: Credentials for ``set_access_token``.

    Returns:
        Whatever ``api.user_timeline`` returns for ``count=200``,
        ``include_rts=True`` and ``tweet_mode='extended'``.
    """
    oauth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    oauth.set_access_token(access_token, access_token_secret)
    client = tweepy.API(oauth)

    return client.user_timeline(
        screen_name=user_id,
        count=200,
        include_rts=True,
        tweet_mode='extended',
    )

In [10]:
# Read the four OAuth credentials off the project's TwitterKeys holder.
# `access_secret` is what later cells pass as `access_token_secret`.
twitter = TwitterKeys()
consumer_key, consumer_secret = twitter.consumer_key, twitter.consumer_secret
access_token, access_secret = twitter.access_token, twitter.access_secret

In [26]:
import json
# Screen name to scrape. It is an empty string here.
# NOTE(review): presumably blanked before sharing — confirm.
user_id = ''

# The fetch below is disabled in this cell.
#tweets = get_user_tweets(user_id, consumer_key, consumer_secret, access_token, access_secret)

In [188]:
# Screen name to scrape. It is an empty string here.
# NOTE(review): presumably blanked before sharing — confirm.
user_id = ''

tweets = get_user_tweets(user_id, consumer_key, consumer_secret, access_token, access_secret)

# Pickle the raw timeline so later cells can rebuild DataFrames without
# calling the API again. The context manager closes (and flushes) the file.
with open('nisha.sav', 'wb') as tweet_file:
    pickle.dump(tweets, tweet_file)

## Data Compilation

In [98]:
import pandas as pd

def user_tweet_df(filename, username=None):
    """Build a DataFrame of tweet texts from a pickled timeline.

    Args:
        filename: Path to a pickle holding a sequence of objects whose
            ``_json`` dict has ``'full_text'`` and ``['user']['screen_name']``.
        username: Fallback value for the ``user`` column. It is only used when
            the pickle holds no tweets; otherwise the screen name stored on
            the first tweet is used.

    Returns:
        DataFrame with columns ``'user'`` and ``'Tweet Content'``, one row per
        tweet (zero rows for an empty pickle).
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this project wrote itself.
    with open(filename, 'rb') as tweet_file:
        tweets = pickle.load(tweet_file)

    all_tweets = [tweet._json['full_text'] for tweet in tweets]
    if all_tweets:
        username = tweets[0]._json['user']['screen_name']

    # Repeat the user explicitly so an empty timeline still yields both columns.
    return pd.DataFrame({'user': [username] * len(all_tweets), 'Tweet Content': all_tweets})

In [63]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
# The rename only has an effect if the loaded frame has a 'tweets' column.
sohaib_df = user_tweet_df(filename='sohaib.sav', username='rb').rename(columns={'tweets': 'Tweet Content'})

In [99]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
carr1eg_df = user_tweet_df(filename='carr.sav', username='rb')

In [113]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
user3_df = user_tweet_df(filename='bintur.sav', username='rb')

In [142]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
sarah_df = user_tweet_df(filename='sarah.sav', username='rb')

In [153]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
kyle_df = user_tweet_df(filename='kyle.sav', username='rb')

In [162]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
may_df = user_tweet_df(filename='may.sav', username='rb')

In [168]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
marie_df = user_tweet_df(filename='marie.sav', username='rb')

In [189]:
# NOTE(review): 'rb' is bound to the `username` parameter, not to a file mode — confirm the intended value.
nisha_df = user_tweet_df(filename='nisha.sav', username='rb')

## Data Cleaning

In [149]:
import numpy as np
import re
import string
import nltk
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# English stop words from NLTK, plus a few informal tokens. Both "im" and
# "i'm" are listed because tweet_preprocess strips punctuation before it
# filters stop words.
stopwords = nltk.corpus.stopwords.words('english')
stopwords += ['im', "oh", "i'm", "lol", "gonna", 'ill']

# spaCy pipeline used by spacy_lemmatize.
nlp = spacy.load('en_core_web_sm')

def spacy_lemmatize(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Args:
        text: A ``list`` of strings (joined with single spaces first) or any
            other value, which is formatted into a string as-is.

    Returns:
        List containing ``token.lemma_`` for every token spaCy produces.
    """
    joined = ' '.join(text) if type(text) == list else text
    return [token.lemma_ for token in nlp(u"{}".format(joined))]

def deEmojify(text):
    """Return ``text`` with every character in four emoji ranges removed.

    Only the code-point ranges listed in the pattern are stripped; characters
    outside them are left untouched.
    """
    emoji_ranges = (
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+"
    )
    return re.sub(emoji_ranges, r'', text, flags=re.UNICODE)


def tweet_preprocess(text):
    """Normalize one raw tweet into a space-separated string of lemmas.

    Steps, in order:
      1. Drop anything starting with ``http`` up to the next whitespace,
         ``@mentions``, and ``&lt;tag&gt;``-style escaped HTML tags.
      2. Turn ``&amp`` into ``&``, then strip every character that is neither
         a word character nor whitespace, then strip emoji via ``deEmojify``.
      3. Lower-case each token and replace ASCII digits with spaces.
      4. Drop tokens shorter than 2 characters and tokens in ``stopwords``.
      5. Lemmatize with ``spacy_lemmatize`` and keep lemmas longer than
         2 characters.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned tweet; an empty string when nothing survives filtering.
    """
    text = re.sub(r'http\S+', '', text)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub('&lt;/?[a-z]+&gt;', '', text)
    text = text.replace('&amp', '&')
    text = re.sub(r"[^\w\s]", "", text)
    text = deEmojify(text)

    # Digits become spaces, so a token such as "abc123def" can split in two;
    # re-splitting after the join picks up those new boundaries.
    tokens = [re.sub(r'[0-9]', ' ', s.lower()) for s in text.split()]
    tokens = [s for s in ' '.join(tokens).split() if len(s) >= 2]
    tokens = [s for s in tokens if s not in stopwords]

    lemmas = ' '.join(spacy_lemmatize(tokens))
    return ' '.join([s for s in lemmas.split() if len(s) > 2])


class TweetCategory:
    """Score a user's tweets for sentiment and predict their top categories.

    Args:
        model: Classifier exposing ``predict`` and ``decision_function``.
        vectorizer: Object exposing ``transform`` for an iterable of strings.
        tweet_data: DataFrame with a ``'Tweet Content'`` column.
        reference: Mapping from the model's predicted labels to display names.

    Attributes set later:
        data: Replaced by ``process_user_tweets`` with the processed frame.
        predict_df: Set by ``predict_topics``; the rows that were classified.
    """

    def __init__(self, model, vectorizer, tweet_data, reference):
        self.data = tweet_data
        self.model = model
        self.vectorizer = vectorizer
        self.ref = reference
        self.analyzer = SentimentIntensityAnalyzer()

    def process_user_tweets(self):
        """Clean every tweet and attach VADER sentiment scores.

        Replaces ``self.data`` with a new frame whose columns are ``tweet``,
        ``clean-tweet``, ``vader-sentiment`` (the raw score dict) and
        ``vader-pos`` / ``vader-neu`` / ``vader-neg`` / ``vader-compound``.
        The frame passed to the constructor is left untouched.
        """
        # rename() returns a new frame, so the assignments below never write
        # into the caller's DataFrame.
        frame = self.data[['Tweet Content']].rename(columns={'Tweet Content': 'tweet'})
        frame['clean-tweet'] = frame['tweet'].map(tweet_preprocess)

        frame['vader-sentiment'] = frame['tweet'].apply(self.analyzer.polarity_scores)
        for part in ('pos', 'neu', 'neg', 'compound'):
            frame['vader-' + part] = frame['vader-sentiment'].apply(
                lambda scores, part=part: scores[part]
            )

        self.data = frame

    def predict_topics(self, sentiment_thresh, confidence_thresh):
        """Classify sufficiently positive tweets and count the top categories.

        Args:
            sentiment_thresh: Minimum ``vader-compound`` a tweet needs to be
                classified at all.
            confidence_thresh: Minimum softmax score a prediction needs to be
                counted in the result.

        Returns:
            Series of counts for at most the three most frequent predicted
            categories; empty when no tweet qualifies.

        Side effects:
            Sets ``self.predict_df`` to the classified rows with ``predicted``
            and ``probability`` columns added.

        NOTE(review): the softmax runs over axis 1, so ``decision_function``
        is assumed to return a 2-D (samples x classes) array — confirm for the
        model in use.
        """
        keep = (self.data['vader-compound'] >= sentiment_thresh) & (self.data['clean-tweet'] != '')
        # Explicit copy: columns are added below, and writing to a filtered
        # slice triggers pandas' SettingWithCopyWarning.
        candidates = self.data[keep].copy()

        if candidates.empty:
            # Nothing to classify; skip the vectorizer and model entirely.
            candidates['predicted'] = np.array([], dtype=object)
            candidates['probability'] = np.array([], dtype=float)
            self.predict_df = candidates
            return candidates['predicted'].value_counts().head(3)

        tweets_transformed = self.vectorizer.transform(candidates['clean-tweet'])
        predicted_category = self.model.predict(tweets_transformed)

        scores = np.asarray(self.model.decision_function(tweets_transformed), dtype=float)
        # Subtract each row's maximum before exponentiating: the softmax is
        # mathematically unchanged, but large scores no longer overflow to
        # inf and turn the probabilities into NaN.
        weights = np.exp(scores - scores.max(axis=1, keepdims=True))
        probability = weights / weights.sum(axis=1, keepdims=True)

        candidates['predicted'] = [self.ref[label] for label in predicted_category]
        candidates['probability'] = probability.max(axis=1)
        self.predict_df = candidates

        confident = candidates[candidates['probability'] >= confidence_thresh]
        top_categories = confident['predicted'].value_counts().head(3)

        return top_categories

In [66]:
import pickle

# Each artifact is loaded inside a context manager so its file handle is
# closed as soon as unpickling finishes.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# project produced itself.

# Naive Bayes Model
with open('models/nb_baseline2.sav', 'rb') as model_file:
    nb = pickle.load(model_file)

# Support Vector Classifier Model
with open('models/linear_svc_baseline2.sav', 'rb') as model_file:
    ovr_svc = pickle.load(model_file)

# Import Vectorizer
with open('models/tfidf_vectorizer2.sav', 'rb') as model_file:
    tfidf_model = pickle.load(model_file)

# Import Reference Dictionary
with open('models/reference-dict.pickle', 'rb') as model_file:
    ref = pickle.load(model_file)

In [76]:
# Invert `ref` so its values become the lookup keys. `key` is what the cells
# below pass to TweetCategory as `reference`, which is indexed by the model's
# predicted labels.
key = dict(zip(ref.values(), ref.keys()))

## User 1

Expected: Art, Books, Food, Household/Decor/Cooking, Movies, Music, Nature/Animals/Green, Sports, Travel

In [82]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
sohaib_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=sohaib_df, reference=key)
sohaib_class.process_user_tweets()
top_topics = sohaib_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

In [93]:
# NOTE(review): `top_topics` is reassigned for every user in this notebook, so
# this displays the result of whichever predict cell ran most recently.
top_topics

Sports                 4
Nature                 2
Electronics/Gadgets    2
Name: predicted, dtype: int64

## User 2

Expected: Travel, Work, Self-care, Tech, Books

In [106]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
carr_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=carr1eg_df, reference=key)
carr_class.process_user_tweets()
top_topics = carr_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

In [107]:
# NOTE(review): `top_topics` is reassigned for every user in this notebook, so
# this displays the result of whichever predict cell ran most recently.
top_topics

Electronics/Gadgets    1
Name: predicted, dtype: int64

## User 3

Expected: Food, Tech, Business, Household/Cooking, Self-Care

In [150]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
user3_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=user3_df, reference=key)
user3_class.process_user_tweets()
top_topics = user3_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

In [151]:
# NOTE(review): `top_topics` is reassigned for every user in this notebook, so
# this displays the result of whichever predict cell ran most recently.
top_topics

Food                   2
Alcohol                2
Electronics/Gadgets    2
Name: predicted, dtype: int64

In [141]:
# Show the rows predicted as 'Food' that clear the 0.2 confidence cut-off.
user3 = user3_class.predict_df
is_food = user3['predicted'] == 'Food'
is_confident = user3['probability'] >= 0.2
user3.loc[is_food & is_confident]

Unnamed: 0,tweet,clean-tweet,vader-sentiment,vader-pos,vader-neu,vader-neg,vader-compound,predicted,probability
38,I use it to cook everything https://t.co/4qZOW...,use cook everything,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,Food,0.275017
123,Coconut breaded shrimp 😍😍\nAlhamdulilaah for food,coconut bread shrimp alhamdulilaah food,"{'neg': 0.0, 'neu': 0.667, 'pos': 0.333, 'comp...",0.333,0.667,0.0,0.7184,Food,0.688882


## User 4

Expected: Music (likes to sing, great taste in music), Self-care (into make-up and fashion and things like that), Business, Art

In [160]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
user4_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=sarah_df, reference=key)
user4_class.process_user_tweets()
top_topics = user4_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

In [161]:
# NOTE(review): `top_topics` is reassigned for every user in this notebook, so
# this displays the result of whichever predict cell ran most recently.
top_topics

Business     3
Self-care    2
Music        2
Name: predicted, dtype: int64

In [187]:
#user4_class.predict_df[-10:]['tweet'].to_list()

## User 5

Expected: Art, Tech, Food, Coffee, Gaming, Household, Movies, Music, Sports, Travel

In [154]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
user5_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=kyle_df, reference=key)
user5_class.process_user_tweets()
top_topics = user5_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

In [155]:
# NOTE(review): `top_topics` is reassigned for every user in this notebook, so
# this displays the result of whichever predict cell ran most recently.
top_topics

Coffee    13
Sports    11
Nature     4
Name: predicted, dtype: int64

## User 6

Expected: Music (likes to sing), Self-care (always stressed), Work or Business (very work oriented), Art (likes to draw). Bad topics: books, sports, alcohol, coffee, household.

In [164]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
user6_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=may_df, reference=key)
user6_class.process_user_tweets()
top_topics = user6_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

In [193]:
# NOTE(review): this rebinds `may_df`, the name the User 6 TweetCategory was
# built from, to its prediction frame. Re-running that earlier cell afterwards
# would hand it this frame instead of the loaded tweets.
may_df = user6_class.predict_df
may_df.loc[may_df['probability'] >= 0.2]

Unnamed: 0,tweet,clean-tweet,vader-sentiment,vader-pos,vader-neu,vader-neg,vader-compound,predicted,probability
0,RT @ltc_angel: If #bitcoin keeps dropping like...,bitcoin keep drop like may create onlyfan,"{'neg': 0.0, 'neu': 0.753, 'pos': 0.247, 'comp...",0.247,0.753,0.0,0.5574,Business,0.423039
66,RT @jaxkkkie: I stopped fuckin with a lot of p...,stop fuckin lot people life great lil boring g...,"{'neg': 0.11, 'neu': 0.601, 'pos': 0.29, 'comp...",0.29,0.601,0.11,0.7964,Self-care,0.208391
75,RT @majestcbitch: SOMEONE SAID “WAIT START AGA...,someone say wait start record lmaoooooo,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,Music,0.400637
86,Your mind is always eavesdropping on your self...,mind always eavesdrop self talk think positive...,"{'neg': 0.0, 'neu': 0.644, 'pos': 0.356, 'comp...",0.356,0.644,0.0,0.802,Self-care,0.307861
94,Accept that you will not find your comfort zon...,accept find comfort zone people people change ...,"{'neg': 0.068, 'neu': 0.78, 'pos': 0.151, 'com...",0.151,0.78,0.068,0.6237,Self-care,0.288829


## User 7

Expected: Tech/Work/Business (she likes psychology), Nature (she likes animals and she's vegan), Books (she's really smart)

In [169]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
user7_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=marie_df, reference=key)
user7_class.process_user_tweets()
top_topics = user7_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

In [177]:
# NOTE(review): `top_topics` is reassigned for every user in this notebook, so
# this displays the result of whichever predict cell ran most recently.
top_topics

Electronics/Gadgets    3
Work                   2
Music                  1
Name: predicted, dtype: int64

In [175]:
# Show the rows predicted as 'Electronics/Gadgets' that clear the 0.2 cut-off.
marie = user7_class.predict_df
is_gadget = marie['predicted'] == 'Electronics/Gadgets'
is_confident = marie['probability'] >= 0.2
marie.loc[is_gadget & is_confident]

Unnamed: 0,tweet,clean-tweet,vader-sentiment,vader-pos,vader-neu,vader-neg,vader-compound,predicted,probability
45,RT @metzpsych: SO MANY OPEN DATA SETS FOR TEAC...,many open datum set teach research awesome,"{'neg': 0.0, 'neu': 0.788, 'pos': 0.212, 'comp...",0.212,0.788,0.0,0.7034,Electronics/Gadgets,0.221494
56,😍 Great opportunity to get into #dataanalysis ...,great opportunity get dataanalysis without pro...,"{'neg': 0.0, 'neu': 0.541, 'pos': 0.459, 'comp...",0.459,0.541,0.0,0.8805,Electronics/Gadgets,0.318611
73,@chartgerink *googles pull requests* 😂,google pull request,"{'neg': 0.15, 'neu': 0.551, 'pos': 0.299, 'com...",0.299,0.551,0.15,0.4404,Electronics/Gadgets,0.279361


In [178]:
# Every row predicted as 'Nature'; no confidence cut-off is applied here.
marie.loc[marie['predicted'] == 'Nature']

Unnamed: 0,tweet,clean-tweet,vader-sentiment,vader-pos,vader-neu,vader-neg,vader-compound,predicted,probability
9,RT @StudentIOS: Calling #students of the #Neth...,call student netherlandsrecent graduate dutch ...,"{'neg': 0.0, 'neu': 0.892, 'pos': 0.108, 'comp...",0.108,0.892,0.0,0.3182,Nature,0.093215
20,RT @jtrialerror: You have ONE WEEK left to reg...,one week leave register openscience ranking de...,"{'neg': 0.0, 'neu': 0.889, 'pos': 0.111, 'comp...",0.111,0.889,0.0,0.3595,Nature,0.095111
41,"RT @womensart1: 'Smoothie and Pegasus', two of...",smoothie pegasus two artist helga stentzel clo...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,Nature,0.145242
52,RT @ChelseaParlett: your model isn’t revolutio...,model revolutionary overfit,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,Nature,0.093911
72,RT @SAPoliceNews: ‼️New Dog Operations initiat...,new dog operation initiative announce small ar...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,Nature,0.207651
133,RT @chrisdc77: This is remarkable. I look forw...,remarkable look forward see response one,"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",0.194,0.806,0.0,0.5574,Nature,0.093791


## User 8

In [190]:
# Classify tweets with VADER compound >= 0; count predictions scoring >= 0.2.
user8_class = TweetCategory(model=ovr_svc, vectorizer=tfidf_model, tweet_data=nisha_df, reference=key)
user8_class.process_user_tweets()
top_topics = user8_class.predict_topics(sentiment_thresh=0, confidence_thresh=0.2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [191]:
# NOTE(review): `top_topics` is reassigned for every user in this notebook, so
# this displays the result of whichever predict cell ran most recently.
top_topics

Books     6
Nature    3
Work      2
Name: predicted, dtype: int64