The purpose of this notebook is to build a bag-of-words representation of the tweets and then match each distinct word against the emotion dictionary only once, in order to minimize the workload.

In [1]:
# load packages
import pandas as pd
import matplotlib.pyplot as plt
from google.cloud import bigquery
import os
import numpy as np
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Never truncate long text columns when displaying DataFrames.
# `None` means "no limit"; the former `-1` spelling has been deprecated since
# pandas 1.0 and is no longer accepted by pandas 2.
pd.set_option('display.max_colwidth', None)
plt.style.use('fivethirtyeight')
%load_ext google.cloud.bigquery

In [2]:
# Load the NRC word-level emotion lexicon: a tab-separated file read into the
# columns word / emotion / association.
# NOTE(review): the first 45 lines are skipped -- presumably a header/licence
# preamble; confirm against the lexicon file in use.
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine where this file exists at this location.
filepath = "/Users/shimengfeng/Documents/Master_Columbia/Fall 2019/capstone/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
emolex_df = pd.read_csv(
    filepath,
    sep='\t',
    skiprows=45,
    names=["word", "emotion", "association"],
)
# emolex_df.head()

In [3]:
# Keep only the lexicon words that carry at least one emotion association.
# Only the numeric `association` flag is summed per word: aggregating the whole
# frame would also try to "sum" the string `emotion` column, whose handling
# differs between pandas versions (dropped vs. string concatenation).
df_count = emolex_df.groupby('word')['association'].sum().reset_index()
# Words whose flags sum to zero have no emotion attached and are left out.
word_with_emotion_list = df_count[df_count.association != 0].word.tolist()

Load a sample of the data for testing

In [4]:
# Export the path of the JSON credentials file (presumably a service-account
# key -- confirm) under GOOGLE_APPLICATION_CREDENTIALS for this process.
# NOTE(review): absolute, user-specific path.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = (
    '/Users/shimengfeng/Documents/Master_Columbia/Fall 2019/capstone/'
    'dsi-capstone-f19-group-1-6c986cf239c5.json'
)

In [5]:
%%bigquery valid_tweets
-- Fetch distinct (id, full_text_cleaned) pairs for English tweets; the result
-- is stored in the notebook variable `valid_tweets`.
-- NOTE(review): LIMIT without ORDER BY does not pin which rows come back, so
-- the sample may differ from run to run.
SELECT DISTINCT id, full_text_cleaned
FROM tweets.all_valid_tweets
where lang = 'en'
limit 1000

In [8]:
# preview the raw tweets before the bag-of-words step is applied
# valid_tweets.head()

Unnamed: 0,id,full_text_cleaned
0,501537798110072800,send shockwaves w hands up in regards to the death of michael brown ynt dkp http …
1,499265945873838100,stop being surprised that president hasnt spoken about michaelbrown but spoke on robinwilliams
2,504061933772095500,melissa harris perry destroys time’s joe klein’s attempt to malign michael brown $URL$ tcot
3,504415277790949400,has the cdc learned nothing from ferguson? cdcwhistleblower fraud is like africanamericans being targeted by police …
4,499762501454544900,blessings and safety to the people on the front line capturing situation in furguson your words are our strongest weapon! mikebrown


In [6]:
# clean additional information from the tweet

# Remove digit runs and the literal '$URL$' placeholder.
# `regex=` is spelled out on purpose: the default of Series.str.replace changed
# from True to False in pandas 2.0, and without it the digit pattern would be
# searched for as literal text, leaving every number in place. The placeholder
# contains regex metacharacters ('$'), so it is matched literally.
valid_tweets['final_text'] = (
    valid_tweets['full_text_cleaned']
    .str.replace(r"[0-9]+", "", regex=True)
    .str.replace('$URL$', '', regex=False)
)

# Remove English stop words: tokenize each tweet, keep the tokens that are not
# stop words, and join them back with single spaces.
stop_words = set(stopwords.words('english'))
valid_tweets['final_text'] = valid_tweets['final_text'].apply(
    lambda text: ' '.join(token for token in word_tokenize(text) if token not in stop_words)
)
  
# valid_tweets.head(5)

In [7]:
# Bag of words: learn the vocabulary of the cleaned tweets and count how often
# each term occurs in each tweet (rows = tweets, columns = vocabulary terms).
from sklearn.feature_extraction.text import CountVectorizer

cleaned_corpus = valid_tweets['final_text']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_corpus)
# print(vectorizer.get_feature_names())
# print(X.toarray())  

In [15]:
# Preview a few rows of the raw document-term matrix.
# The emotion-column selection `X[:, unique_word_list]` cannot run at this point
# on a fresh kernel, because `unique_word_list` is only built in a later cell;
# that selection is done there, when `tweets` is created. Only the first rows
# are densified here so the output stays small.
X[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
# Keep only the vocabulary columns whose term appears in the emotion lexicon.

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    word_list = vectorizer.get_feature_names_out().tolist()
else:
    word_list = vectorizer.get_feature_names()

# A set gives constant-time membership tests; scanning the lexicon list and
# calling `word_list.index` for every term made this step quadratic.
_emotion_vocabulary = set(word_with_emotion_list)

# Column positions (in X / word_list) of the terms found in the lexicon.
unique_word_list = [position for position, word in enumerate(word_list) if word in _emotion_vocabulary]
matrix_word_header = np.array(word_list)[unique_word_list] # get the header
#get the words that are non-zero

In [17]:
# Restrict the document-term matrix to the lexicon-matched columns and densify
# it; the columns line up with `matrix_word_header`.
emotion_columns = X[:, unique_word_list]
tweets = emotion_columns.toarray()
tweets.shape

(10000, 1791)

In [None]:
#generate a dict that with each emotion as key
# df_dict = emolex_df[emolex_df.association!=0][['word','emotion']]
# emotion_dict= df_dict.groupby('word').apply(lambda x: x.drop('word', axis=1).to_dict('list')).to_dict()
# emotion_dict            

In [None]:
#create new dictionary - position-word-emotion
# tweets_emotion_dict={}
# keys = list(np.array(word_list)[unique_word_list])
# for word in keys:
#     tweets_emotion_dict.update({word:emotion_dict[word]['emotion']})
#     # obtain all the emotions associate with the word
# tweets_emotion_dict

In [20]:
# Map every emotion label in the lexicon to the list of words flagged
# (association == 1) for that emotion.
emotions = emolex_df.emotion.unique()
is_associated = emolex_df.association == 1
emotion_dict = {}
for emotion in emotions:
    has_emotion = emolex_df.emotion == emotion
    emotion_dict[emotion] = emolex_df.loc[has_emotion & is_associated, 'word'].tolist()

In [21]:
# Show the vocabulary terms that matched the emotion lexicon; these are the
# labels of the columns selected into `tweets`.
matrix_word_header

array(['aberration', 'abort', 'abuse', 'account', 'accountable',
       'accounts', 'action', 'actual', 'advice', 'aggressive', 'agree',
       'aid', 'alive', 'ambulance', 'amen', 'amnesty', 'amour', 'anchor',
       'angel', 'anger', 'angry', 'anonymous', 'armed', 'armor',
       'armored', 'arrest', 'art', 'ashamed', 'ass', 'assassination',
       'assault', 'atrocious', 'attack', 'attacking', 'attempt',
       'attention', 'attorney', 'august', 'autopsy', 'avoid', 'awful',
       'axiom', 'baby', 'bad', 'badly', 'bankrupt', 'beautiful', 'beauty',
       'bee', 'believed', 'birth', 'bitch', 'black', 'blackness', 'blame',
       'bless', 'blind', 'bloody', 'blurred', 'bomber', 'bounty', 'boy',
       'boycott', 'break', 'breakdown', 'brilliant', 'broke', 'brother',
       'brutality', 'brute', 'budget', 'bully', 'bury', 'busted', 'calls',
       'calm', 'camouflage', 'candidate', 'cannon', 'captain', 'case',
       'center', 'change', 'chaos', 'child', 'choir', 'chorus', 'church',
  

In [22]:
# select one emotion and get the matrix relates to the emotion
def search_emotion_matrix(emotion_dict, emotion, matrix_word_header, tweets):
    """Locate, per tweet, the occurrences of words tied to one emotion.

    Parameters
    ----------
    emotion_dict : mapping of emotion name -> collection of words for it.
    emotion : key of `emotion_dict` to look up.
    matrix_word_header : 1-D array of the words labelling the columns of `tweets`.
    tweets : 2-D count array; columns are selected by position in
        `matrix_word_header`.

    Returns
    -------
    (non_zero_count, tweets_emotion_words)
        `non_zero_count` is the `np.nonzero` result on the emotion-only
        sub-matrix: a pair of index arrays (row in `tweets`, column within the
        sub-matrix). `tweets_emotion_words` holds the header words of that
        sub-matrix, so the column indices point into it.
    """
    # Header positions whose word belongs to the requested emotion.
    is_emotion_word = np.isin(matrix_word_header, np.array(emotion_dict[emotion]))
    column_positions = np.flatnonzero(is_emotion_word)

    # Keep only those columns, then report where the counts are non-zero.
    emotion_only = tweets[:, column_positions]
    return np.nonzero(emotion_only), matrix_word_header[column_positions]


In [23]:
# Tweet ids in row order, converted to strings.
ids = list(map(str, valid_tweets.id.tolist()))

In [24]:
# Create the final output: for every emotion, a dict mapping tweet id -> list of
# the emotion words found in that tweet, saved as one pickle file per emotion.

# Make sure the target directory exists; `open(..., 'wb')` does not create it.
output_dir = 'BOG_results'
os.makedirs(output_dir, exist_ok=True)

# Built once: it does not change between emotions.
id_array = np.array(ids)

for emotion in emotions:
    non_zero_count, tweets_emotion_words = search_emotion_matrix(emotion_dict, emotion, matrix_word_header, tweets)
    # Row indices select the tweet ids, column indices select the matched words.
    id_keys = id_array[non_zero_count[0]]
    words_value = tweets_emotion_words[non_zero_count[1]]

    # `.tolist()` turns numpy string scalars into plain `str`, so the pickles
    # hold only built-in types.
    new_dict = {}
    for key, value in zip(id_keys.tolist(), words_value.tolist()):
        new_dict.setdefault(key, []).append(value)

    # save the created dictionary file as pickle
    with open(os.path.join(output_dir, 'NRC_output_{}.pickle'.format(emotion)), 'wb') as f:
        pickle.dump(new_dict, f)

NameError: name 'tweets' is not defined