In [1]:
import pandas as pd
import pickle
import os
import glob
import re
import string
import emoji
import numpy as np
import matplotlib.pyplot as plt

import twint
import nest_asyncio
nest_asyncio.apply()

from bokeh.plotting import figure, output_file, save
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, Legend, HoverTool
from bokeh.embed import components
output_notebook()

from bs4 import BeautifulSoup

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

In [2]:
with open('data/word_matrices/nyc_mayor.pkl', 'rb') as file:
    nyc_mayor_word_matrix = pickle.load(file)

In [3]:
def scrape_user(username):
    if username[0] == '@':
        username = username[1:]
    
    c = twint.Config()
    c.Hide_output = True
    c.Username = username
    c.Pandas = True
    c.Limit = 500
    c.Filter_retweets = False
    
    twint.run.Search(c)
    
    tweets_df = twint.storage.panda.Tweets_df
    
    columns_wanted = ['date', 'tweet']
    tweets_df_filtered = tweets_df[columns_wanted]
        
    return tweets_df_filtered

def clean_tweet_sentiment_analysis(tweet):
    
    #converts html to text
    tweet = BeautifulSoup(tweet, 'lxml').text
    
    #removes links
    tweet = re.sub(r'http\S+', '', tweet)
    
    #removes twitter usernames
    tweet = re.sub(r'(\s)?@\w+', '', tweet)
    
    return(tweet)

def calc_tweet_sentiment(tweet):    
    sentiment_analyzer = SentimentIntensityAnalyzer()
    return sentiment_analyzer.polarity_scores(tweet)['compound']

def clean_tweet_tfidf(tweet):
    
    tweet = tweet.lower()
    
    #removing punctuation
    cleaned_tweet = re.sub('[%s]' % re.escape(custom_punctuation), '', tweet)
    
    #removing emojis
    cleaned_tweet = re.sub(emoji.get_emoji_regexp(), r"", cleaned_tweet)
        
    #remove stop words
    cleaned_tweet = ' '.join([item for item in cleaned_tweet.split() if item not in custom_stop_words])
        
    return(cleaned_tweet)

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_tweets(tweets):
    #initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    #lemmatize the words
    return [' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(tweet)]) for tweet in tweets]

def calc_word_vectors(df, words):        
    word_scores = []
    for word in words:
        word_df = df[df['lemmatized_tweets'].str.contains(word)]
        if word_df.empty:
            word_scores.append(0)
        else:
            word_scores.append(word_df['sentiment'].mean())
    return(word_scores)

def create_similarity_matrix(similarity_tool, df_vectors):
        
    similarities = np.zeros((len(df_vectors), len(df_vectors)))
    
    for i in range(len(df_vectors)):
        politician_1 = df_vectors.iloc[i,:].values.reshape(1, -1)
        for j in range(i, len(df_vectors)):
            politician_2 = df_vectors.iloc[j, :].values.reshape(1, -1)
            similarities[i][j] = similarities[j][i] = similarity_tool(politician_1,politician_2)
                
    df = pd.DataFrame(similarities, index = df_vectors.index, columns = df_vectors.index)      
    
    return df

def determine_user_similarity(username, reference_matrix):
    matrix_copy = reference_matrix.copy()
    
    #scrape profile of entered user
    user_df = scrape_user(username)
    
    #clean tweets for sentiment analysis
    user_df['tweet'] = user_df['tweet'].apply(lambda x: clean_tweet_sentiment_analysis(x))
    
    #perform sentiment analysis on user's tweets
    user_df['sentiment'] = user_df['tweet'].apply(lambda x: calc_tweet_sentiment(x))
    
    #prepare tweets for tf-idf vectorization
    user_df['cleaned_tweets'] = user_df['tweet'].apply(lambda x: clean_tweet_tfidf(x))
    user_df['lemmatized_tweets'] = lemmatize_tweets(user_df['cleaned_tweets'])
    
    #determine word vectors for entered user
    user_word_vector = calc_word_vectors(user_df, reference_matrix.columns.to_list())
    
    matrix_copy.loc[username] = user_word_vector
        
    cos_sim_df = create_similarity_matrix(cosine_similarity, matrix_copy)
    
    similarity_vector = cos_sim_df[username].sort_values(ascending=False).iloc[1:].to_frame()
    
    return(similarity_vector)

In [4]:
%%time

custom_stop_words = [word.replace("'", "") for word in stopwords.words('english')] + ['rt', 'amp', 'u', 'w', 'im', 'live', 'must', 'join', 'tune', 'pm', 'et', 
                         'year', 'say', 'get', 'it']
custom_punctuation = '!"#&\'()*+,-./:;<=>?@[\\]^_`{|}~\''
    
a = determine_user_similarity('joebiden', nyc_mayor_word_matrix)

print(a.sort_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

<bound method DataFrame.sort_values of                            joebiden
Andrew Yang                0.529395
Joycelyn Taylor            0.477954
Barbara Kavovit            0.477477
Carlos Menchaca            0.471327
Dianne Morales             0.463020
Christopher S. Krietchman  0.445554
Eric Adams                 0.445111
Scott Stringer             0.443579
Paperboy Prince            0.436959
Maya Wiley                 0.397343
Cleopatra Fitzgerald       0.368175
Abbey Laurel Smith         0.330906
Stacey Prussman            0.261853
Angelo Pinto               0.126468
Bill Pepitone              0.085179>
CPU times: user 9.77 s, sys: 399 ms, total: 10.2 s
Wall time: 14.9 s


In [5]:
with open('data/nyc_mayor_info_all.csv', 'rb') as f:
    mayor_info = pd.read_csv(f)
    
mayor_info.head()

Unnamed: 0.1,Unnamed: 0,name,party,website,twitter_handles
0,0,Eric Adams,Democrat,https://www.ericadams2021.com/,ericadamsfornyc
1,1,Art Chang,Democrat,https://www.chang.nyc,achangnyc
2,2,Eddie Cullen,Democrat,https://www.eddiecullen.com/,eddiecullen4nyc
3,3,Shaun Donovan,Democrat,https://www.shaunfornyc.com/,ShaunDonovanNYC
4,4,Thomas Downs,Other,https://www.downsformayor.nyc/,DownsforMayor


In [None]:
c1 = RdBu3[2] # red
c2 = RdBu3[0] # blue
source = ColumnDataSource(dict(
    xs=[[1,2,2], [1,2,2], [3,4,4], [3,4,4]],
    ys=[[3,3,4], [1,1,2], [3,3,4], [1,1,2]],
    color=[c1, c2, c1, c2],
    label=['hi', 'lo', 'hi', 'lo']
))

p = figure(x_range=(0, 7), y_range=(0, 5), plot_height=300)

# Note legend field matches the column in `source`
p.patches( xs='xs', ys='ys', color='color', legend='label', source=source)
show(p)


In [37]:
def make_sim_bar_chart(matrix, info_df):
    
    merged = matrix.merge(info_df, left_index = True, right_on='name')
    
    color_dict = {'Democrat': 'royalblue', 'Republican': 'firebrick', 'Other': 'gray'}
    colors = [color_dict[party] for party in merged['party']]
    
    source = ColumnDataSource(data={
        'x': [i.replace("-"," ") for i in matrix.index],
        'top': matrix.values.flatten(),
        'color': colors,
        'label': merged['party'].to_list(),
        'width': [0.9]*len(merged)})
    
    p = figure(x_range=matrix.index.to_list(), plot_height=400, plot_width=700, toolbar_location=None)
    p.add_layout(Legend(), 'right')
    
    p.vbar(x='x', top='top', width='width', color='color', legend_field='label', source=source)

    p.xgrid.grid_line_color = None
    p.yaxis.axis_label = 'Similarity'
    p.xaxis.major_label_orientation = 45
    p.xaxis.major_label_text_font_size = "12pt"
    p.axis.axis_label_text_font_size = "12pt"
    p.add_tools(HoverTool(tooltips=[("Politician", "@x"), ("Similarity", "@top")]))
    
    return(p)

fig = make_sim_bar_chart(a, mayor_info)
show(fig)

In [16]:
def make_sim_bar_chart(matrix, info_df):
    
    merged = matrix.merge(info_df, left_index = True, right_on='name')
    
    color_dict = {'Democrat': 'royalblue', 'Republican': 'firebrick', 'Other': 'gray'}
    
    colors = [color_dict[party] for party in merged['party']]
    
    p = figure(x_range=matrix.index.to_list(), plot_height=400,
               plot_width=600, toolbar_location=None)

    p.vbar(x=[i.replace("-"," ") for i in matrix.index], top=matrix.values.flatten(), 
           width=0.9, color=colors, label=merged['party'].to_list())

    p.xgrid.grid_line_color = None
    p.yaxis.axis_label = 'Similarity'
    p.xaxis.major_label_orientation = 45
    p.xaxis.major_label_text_font_size = "12pt"
    p.axis.axis_label_text_font_size = "12pt"
    p.add_tools(HoverTool(tooltips=[("Politician", "@x"), ("Similarity", "@top")]))
    
    return(p)

fig = make_sim_bar_chart(a, mayor_info)
show(fig)

AttributeError: unexpected attribute 'label' to VBar, possible attributes are bottom, fill_alpha, fill_color, hatch_alpha, hatch_color, hatch_extra, hatch_pattern, hatch_scale, hatch_weight, js_event_callbacks, js_property_callbacks, line_alpha, line_cap, line_color, line_dash, line_dash_offset, line_join, line_width, name, subscribed_events, tags, top, width or x

In [7]:
def plot_wordcloud(my_dict, politician):
    '''Plots a word cloud for text of given politicians Tweets'''
    
    wordcloud = WordCloud(width = 800, height = 400, 
                          background_color ='white', 
                          min_font_size = 10).generate(''.join(my_dict[politician]))

    fig, ax = plt.subplots(figsize = (8, 8), facecolor = None)
    plt.title(f'{politician} word cloud', fontsize=18)
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 
    plt.show() 
    
    return(fig)

