In [277]:
# Prepare environment and load data
import pandas as pd
import nltk
import feather
import re, math
from collections import Counter

event_dict = feather.read_dataframe('../parsed_data/parsed_dict.feather')
email_df = feather.read_dataframe('../parsed_data/simplified_email.feather')

In [278]:
# Helper function to extract nouns from a Python string object
def extract_nouns(txt):
    nouns = []

    # create list of words in a text, taking out punctuations, symbols etc.
    words = nltk.word_tokenize(txt)
    # categorise all words in text with tags
    tags = nltk.pos_tag(words)

    # select all words categorised as nouns
    for word, pos in tags:
        if (pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS'):
            nouns.append(word.lower())

    return nouns

# Helper function to convert text to vector
WORD = re.compile(r'\w+')

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)

# Helper function to compute cosine similarity
def cosine_sim(vect1, vect2):
    a = set(vect1)
    b = set(vect2)
    top = len(a & b)
    bottom = len(a | b)
    if not bottom:
        sim = 0
    else:
        sim = top / float(bottom)

    return sim

In [279]:
# Testing for regular expression
import re
text='xd8, xa7, xd9, xd8, xb1, xd8, expression'
t = re.sub(r'(^|\s)x(\w+,)', r'', text)
t
# re.sub(r'(^|\s),', r'\1', text1)

' expression'

In [280]:
## Prepare event dataframe
# type(event_dict.dictionary[0])
# event_dict.columns.values.tolist()
event_dict.drop(['NA'], inplace=True, axis=1)
event_dict = event_dict.transpose()
event_dict.columns = event_dict.loc['event']
event_dict = event_dict.reindex(event_dict.index.drop(['event']))
event_dict = event_dict.applymap(lambda z: re.sub(r'(^|\s)x(\w+,)', r'', z))
event_dict.head()

event,benghazi,iran_deal,hillary,doctrine,arab_spring,benghazi_committe
dictionary,"benghazi, attack, took, place, even, septemb, ...","joint, comprehens, plan, action, jcpoa, persia...","hillari, dian, rodham, clinton, da, born, octo...","hillari, doctrin, term, us, describ, agenda, f...","arab, spring, arab, ar, rab, al, revolutionari...","unit, state, hous, select, committe, event, su..."


In [281]:
# Only keep nouns in event_dict
df_dict1 = event_dict.applymap(extract_nouns)

In [282]:
email_df.columns.tolist()
#email_df.DocNumber

['DocNumber', 'date', 'edited', 'email_raw']

In [283]:
## Event_dict matrix for cosine similarities
# event_mat = event_dict.values
# e, d = event_dict.shape
# event_mat.shape
# event_mat

In [284]:
## Prepare email dataframe
# type(email_df.email_raw[1])
# email_df.columns.values.tolist()
## Convert string in email_raw column to list of strings
# email_df.email_raw = email_df.email_raw.apply(lambda x: x.split(","))

## New dataframe for analysis
email_df1 = email_df[['DocNumber', 'email_raw']]
email_df1.set_index(['DocNumber'], inplace=True)

In [285]:
# email_df1.columns.values.tolist()
type(email_df1.email_raw[1])

str

In [286]:
# Only keep nouns in emails
email_df1 = email_df1.head()
email_df1.email_raw = email_df1.email_raw.apply(lambda x: extract_nouns(x))

In [287]:
type(email_df1.email_raw[1])

list

In [289]:
# Create new dataframe to contain cosine similarities
# cosim_df = pd.DataFrame.from_dict(dict([for l in list_events ]))
cosim_df = email_df.head()[['DocNumber', 'date']]

beng = text_to_vector(event_dict.benghazi[0])
iran = text_to_vector(event_dict.iran_deal[0])
hill = text_to_vector(event_dict.hillary[0])
doct = text_to_vector(event_dict.doctrine[0])
spring = text_to_vector(event_dict.arab_spring[0])
comm = text_to_vector(event_dict.benghazi_committe[0])
# print cosine_sim(beng, hill)

benghazi = []
iran_deal = []
hillary = []
doctrine = []
arab_spring = []
benghazi_committe = []

# email_df1 = email_df1.email_raw.apply(lambda x: text_to_vector(x))

for index, emails in email_df1.iterrows():
    for email in emails:
        benghazi.append(round(cosine_sim(beng, email), 3))
        iran_deal.append(round(cosine_sim(iran, email), 3))
        hillary.append(round(cosine_sim(hill, email), 3))
        doctrine.append(round(cosine_sim(doct, email), 3))
        arab_spring.append(round(cosine_sim(spring, email), 3))
        benghazi_committe.append(round(cosine_sim(comm, email), 3))
        
cosim_df['benghazi'] = benghazi
cosim_df['iran_deal'] = iran_deal
cosim_df['hillary'] = hillary
cosim_df['doctrine'] = doctrine
cosim_df['arab_spring'] = arab_spring
cosim_df['benghazi_committe'] = benghazi_committe
cosim_df

Unnamed: 0,DocNumber,date,benghazi,iran_deal,hillary,doctrine,arab_spring,benghazi_committe
0,C05739545,09/12/2012,0.011,0.008,0.007,0.009,0.005,0.015
1,C05739546,03/03/2011,0.098,0.062,0.072,0.081,0.095,0.099
2,C05739547,09/12/2012,0.011,0.008,0.008,0.014,0.007,0.019
3,C05739550,09/12/2012,0.021,0.013,0.013,0.023,0.016,0.028
4,C05739554,03/11/2011,0.095,0.061,0.07,0.075,0.09,0.096


In [None]:
# cosine_sim(vect1, vect2):

data = {"A": "hi there look at me", "B": "i like the way you look"}
vocab = {"C": "no you look at me", "D": "i said look at me first"}
a_df = pd.DataFrame(data.items(), columns=['A', 'B'])
b_df = pd.DataFrame(vocab.items(), columns=['C', 'D'])
# a_df = pd.DataFrame.from_dict({"A": "hi there look at me", "B": "no you look at me"}, index=np.array())
# a_new_df = cosine_sim(a_df['B'], b_df.loc[, x])
# np.multiply(df['A'], df['B'])
# a_new_df


In [None]:
# new_df = email_df.DocNumber
new_df = pd.DataFrame(columns=b_df.columns.values.tolist())


from collections import defaultdict
dates_dict = defaultdict(list)

for event, vaocab in cur:
    dates_dict[key].append(date)
C = []
D = []
for index1, row1 in a_df.iterrows():
    for index2, row2 in b_df.iterrows():
#         print cosine_sim(row1.B, row2.D)
        C.append(cosine_sim(row1.B, row2.C))
        D.append(cosine_sim(row1.B, row2.D))
        
# new_df.columns.values.tolist()
new_df['C'] = C
new_df['D'] = D
new_df

In [None]:
type(event_dict.head())

In [None]:
# Create new dataframe containing similarities
sim_df = pd.DataFrame(columns=set(event_dict.event.tolist()))
sim_df['date'] = email_df.date
# sim_df

In [None]:
# data = df.values
# m, k = data.shape

# mat = np.zeros((m, m))

# for i in xrange(m):
#     for j in xrange(m):
#         if i != j:
#             mat[i][j] = cosine(data[i,:], data[j,:])
#         else:
#             mat[i][j] = 0.