In [32]:
import pandas as pd
import pickle
import os
import glob
import re
import string
import emoji
import numpy as np


from bs4 import BeautifulSoup

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [2]:
def create_dictionaries_of_tweets():
    '''Creates a dictionary of tweets for each politician'''
    
    out_dict = {}
    
    for filename in glob.glob('data/*.pkl'):
        politician_handle = filename.split('/')[1].split('_')[0]
        with open(filename, 'rb') as file:    
            tweet_df = pickle.load(file)
            out_dict[politician_handle] = tweet_df['tweet'].drop_duplicates().to_frame()
    return(out_dict)

politician_dict = create_dictionaries_of_tweets()

In [3]:
def clean_tweet_sentiment_analysis(tweet):
    
    #converts html to text
    tweet = BeautifulSoup(tweet, 'lxml').text
    
    #removes links
    tweet = re.sub(r'http\S+', '', tweet)
    
    #removes twitter usernames
    tweet = re.sub(r'(\s)?@\w+', '', tweet)
    
    return(tweet)

for df in politician_dict.values():
    df['tweet'] = df['tweet'].apply(lambda x: clean_tweet_sentiment_analysis(x))

In [4]:
sentiment_analyzer = SentimentIntensityAnalyzer()

def calc_tweet_sentiments(tweet):    
    sentiment_scores = sentiment_analyzer.polarity_scores(tweet)
            
    return sentiment_scores

for df in politician_dict.values():
    df['sentiment_scores'] = df['tweet'].apply(lambda x: calc_tweet_sentiments(x))
    
for df in politician_dict.values():
    df['compound_score'] = df['sentiment_scores'].apply(lambda x: x['compound'])


In [41]:
custom_stop_words = ['rt', 'amp', 'u', 'w', 'im', 'live', 'must', 'join', 'tune', 'pm', 'et', 
                         'year', 'say', 'get', 'it']
stop_words = custom_stop_words + stopwords.words('english')

punctuation = string.punctuation + "'"

def clean_tweet_tfidf(tweet):
    
    tweet = tweet.lower()
    
    #removing punctuation
    cleaned_tweet = re.sub('[%s]' % re.escape(punctuation), '', tweet)
    
    #removing emojis
    cleaned_tweet = re.sub(emoji.get_emoji_regexp(), r"", cleaned_tweet)
        
    #remove stop words
    cleaned_tweet = ' '.join([item for item in cleaned_tweet.split() if item not in stop_words])
        
    return(cleaned_tweet)

for df in politician_dict.values():
    df['cleaned_tweets'] = df['tweet'].apply(lambda x: clean_tweet_tfidf(x))

In [42]:
politician_dict['AndrewYang']

Unnamed: 0,tweet,sentiment_scores,compound_score,cleaned_tweets,lemmatized_tweets
0,You know who else gets sworn in today? Jon Oss...,"{'neg': 0.0, 'neu': 0.867, 'pos': 0.133, 'comp...",0.5673,know else gets sworn today jon ossoff reverend...,know else get sworn today jon ossoff reverend ...
1,Awful and unacceptable.,"{'neg': 0.857, 'neu': 0.143, 'pos': 0.0, 'comp...",-0.7184,awful unacceptable,awful unacceptable
2,I enjoyed the Asian American Secret Service ag...,"{'neg': 0.0, 'neu': 0.633, 'pos': 0.367, 'comp...",0.7003,enjoyed asian american secret service agent be...,enjoy asian american secret service agent behi...
3,Amanda Gorman.,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,amanda gorman,amanda gorman
4,The 46th President of the United States. 👍🇺🇸🎉,"{'neg': 0.0, 'neu': 0.621, 'pos': 0.379, 'comp...",0.6705,46th president united states,46th president united state
...,...,...,...,...,...
2994,Another 1.9 million unemployed - 25% of the wo...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,another 19 million unemployed 25 workforce work,another 19 million unemployed 25 workforce work
2995,the scale of the protests is enormous,"{'neg': 0.241, 'neu': 0.759, 'pos': 0.0, 'comp...",-0.2263,scale protests enormous,scale protest enormous
2997,Thank you Marshall! We will get it out to peo...,"{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'comp...",0.4738,thank marshall people,thank marshall people
2998,Thanks Erik. 👍,"{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'comp...",0.4404,thanks erik,thanks erik


In [43]:
#lemmatization function taken from Selva Prabhakaran's post on Machine Learning Plus
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_tweets(tweets):
    #initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    #lemmatize the words
    return [' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(tweet)]) for tweet in tweets]

for df in politician_dict.values():
    df['lemmatized_tweets'] = lemmatize_tweets(df['cleaned_tweets'])


In [69]:
def get_politician_vectors(df, twitter_handle, n_words = 250):
    
    #create the document-term matrix
    tfidf = TfidfVectorizer(stop_words = custom_stop_words, 
                            ngram_range = (1, 3), min_df = 5, max_df = .9, binary = True)
    doc_word = tfidf.fit_transform(df['lemmatized_tweets'])
    
    word_scores = pd.DataFrame(doc_word.toarray(), columns = tfidf.get_feature_names()).sum()
    
    politician_word_vector = pd.DataFrame(word_scores).transpose()   
    
    top_words = word_scores.nlargest(n_words).index
    for word in top_words:
        
     
    #get sentiment for top words and add them to the dataframe
    if top_words == True:
        top_words = word_scores.nlargest(top_words_amount).index
        for word in top_words:
            mini_df = pd.DataFrame(result,columns=['text'])
            tweets_for_this_word = mini_df[mini_df.text.str.contains(word)].text.to_list()
            if len(tweets_for_this_word) != 0:
                senti_df = create_sentiment_vectors(tweets_for_this_word,twitter_handle,word)
                for column in senti_df.columns:
                    final[column] = senti_df.at[twitter_handle,column]

    #return final, number_of_tweets
    
get_politician_vectors(politician_dict['AndrewYang']['lemmatized_tweets'], 'AndrewYang')

thanks


NameError: name 'result' is not defined

In [79]:
word_sent = 0
for i, row in politician_dict['AndrewYang'].iterrows():
    if 'thank' in row['lemmatized_tweets']:
        word_sent += row['compound_score']
word_sent

271.28020000000066

In [None]:
politician_dict = {key: clean_tweets_tfidf(value).to_frame() for (key, value) in politician_dict.items()}