# Bag of words 


Data pre-processsing for text analysis and pairwise word overlap

* 1) Load libraries and define functions
* 2) Import data: parsed email data and path length data
* 3) Process and store each user's vocabulary 
* 4) Calcualte pairwise % overlaps 
* 5) Create pairwise lists of  non-overlapping words 

## 1. Loading libraries and define functions

In [18]:
# NLP
from subprocess import check_output
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer 
from nltk.util import ngrams 
from nltk.probability import FreqDist
import string
import os
import pandas as pd
import re
import nltk

In [2]:
def email_clean(text):
    text = re.sub(r'\n--.*?\n', '', text, flags=re.DOTALL)
    text = re.sub(r'Forwarded by.*?Subject:', '', text, flags=re.DOTALL) 
    text = re.sub(r'Fwd:.*?Subject:', '', text, flags=re.DOTALL) 
    text = re.sub(r'Fw:.*?Subject:', '', text, flags=re.DOTALL)     
    text = re.sub(r'FW:.*?Subject:', '', text, flags=re.DOTALL)         
    text = re.sub(r'Forwarded:.*?Subject:', '', text, flags=re.DOTALL)         
    text = re.sub(r'From:.*?Subject:', '', text, flags=re.DOTALL)
    text = re.sub(r'PM', '', text, flags=re.DOTALL)
    text = re.sub(r'AM', '', text, flags=re.DOTALL)
    
    return text

def clean(text):
    stop = set(stopwords.words('english'))
    stop.update(("to","cc","subject","http","from","sent",
                 "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
                 "enron america corp", "enron", "etc", "na", firstname, lastname))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()
    porter= PorterStemmer()
    
    text=text.rstrip()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    stop_free = " ".join([i for i in text.lower().split() if((i not in stop) and (not i.isdigit()))])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    #stem = " ".join(porter.stem(token) for token in normalized.split())
    
    return normalized

## 2. Importing data 
 
I will use parsed email data; path length data; user-address key data

In [7]:
email_path = 'C:/Users/Margeum/Dropbox/DS projects/05. Email data/emails_in_csv'
os.chdir(email_path)

emails_df = pd.read_csv('./emails_parsed.csv')
pw_path_length_df = pd.read_csv('./pw_path_length_df.csv')
address_user_df = pd.read_csv('./address_user_df.csv')

In [9]:
pw_paths_df = pw_path_length_df[pw_path_length_df['s_path_length'] > 0]
print ("Total number of pairs:" + str(len(pw_paths_df)))

Total number of pairs:18091


### Cleaning text for each user

Clean text and store text frequency as a list of lists 
This will be used in TF-IDF as well

In [19]:
# On a CPU, it takes about 30 minutes

tf_list = []

i = 0

for index, row in address_user_df.iterrows():
    
    text_cleaned_i = []

    user_i = row['user']
    print user_i
    
    lastname = row['address'].split('@')[0].split('.')[-1]
    firstname = str(row['address'].split('.')[0])

    text_to_clean_df_i = emails_df[emails_df["user"] == user_i][["content", "user"]].reset_index()
        
    for text in text_to_clean_df_i['content']:
        text_cleaned_i.append(clean(email_clean(text)).split())

    unlisted_text_cleaned_i = [item for sublist in text_cleaned_i for item in sublist]
    freqdist_user_i = nltk.FreqDist(ngrams(unlisted_text_cleaned_i, 1))

    tf_list.append(freqdist_user_i)    



allen-p
arnold-j
arora-h
badeer-r
bailey-s
bass-e
baughman-d
beck-s
blair-l
brawner-s
buy-r
campbell-l
carson-m
cash-m
causholli-m
corman-s
cuilla-m
dasovich-j
davis-d
dean-c
delainey-d
derrick-j
dickson-s
donoho-l
donohoe-t
dorland-c
ermis-f
farmer-d
fischer-m
forney-j
fossum-d
gang-l
gay-r
geaccone-t
germany-c
giron-d
griffith-j
grigsby-m
guzman-m
haedicke-m
hain-m
harris-s
hayslett-r
heard-m
hendrickson-s
hernandez-j
hodge-j
holst-k
horton-s
hyatt-k
hyvl-d
jones-t
kaminski-v
kean-s
keavey-p
keiser-k
king-j
kitchen-l
kuykendall-t
lavorato-j
lay-k
lenhart-m
lewis-a
lokay-m
lokey-t
love-p
lucci-p
maggi-m
mann-k
martin-t
may-l
mccarty-d
mcconnell-m
mckay-b
mckay-j
mclaughlin-e
meyers-a
motley-m
neal-s
nemec-g
panus-s
parks-j
pereira-s
perlingiere-d
pimenov-v
platter-p
presto-k
quenet-j
quigley-d
rapp-b
reitmeyer-j
richey-c
ring-a
ring-r
rogers-b
ruscitti-k
sager-e
saibi-e
salisbury-h
sanchez-m
sanders-r
scholtes-d
schoolcraft-d
schwieger-j
scott-s
semperger-c
shackleton-s
shankman-j
sha

In [26]:
bow_list = []

i = 0

for index, row in address_user_df.iterrows():
 # Convert frequency into indicator (from frequency dict to bag of words)
    bow_i = []
    user_i = row['user']
    print user_i
    
    freqdist_user_i = tf_list[index]
    
    for k in range(len(freqdist_user_i)):
        bow_i.append(str(list(freqdist_user_i)[k][0]))
    bow_list.append(bow_i)
            

allen-p
arnold-j
arora-h
badeer-r
bailey-s
bass-e
baughman-d
beck-s
blair-l
brawner-s
buy-r
campbell-l
carson-m
cash-m
causholli-m
corman-s
cuilla-m
dasovich-j
davis-d
dean-c
delainey-d
derrick-j
dickson-s
donoho-l
donohoe-t
dorland-c
ermis-f
farmer-d
fischer-m
forney-j
fossum-d
gang-l
gay-r
geaccone-t
germany-c
giron-d
griffith-j
grigsby-m
guzman-m
haedicke-m
hain-m
harris-s
hayslett-r
heard-m
hendrickson-s
hernandez-j
hodge-j
holst-k
horton-s
hyatt-k
hyvl-d
jones-t
kaminski-v
kean-s
keavey-p
keiser-k
king-j
kitchen-l
kuykendall-t
lavorato-j
lay-k
lenhart-m
lewis-a
lokay-m
lokey-t
love-p
lucci-p
maggi-m
mann-k
martin-t
may-l
mccarty-d
mcconnell-m
mckay-b
mckay-j
mclaughlin-e
meyers-a
motley-m
neal-s
nemec-g
panus-s
parks-j
pereira-s
perlingiere-d
pimenov-v
platter-p
presto-k
quenet-j
quigley-d
rapp-b
reitmeyer-j
richey-c
ring-a
ring-r
rogers-b
ruscitti-k
sager-e
saibi-e
salisbury-h
sanchez-m
sanders-r
scholtes-d
schoolcraft-d
schwieger-j
scott-s
semperger-c
shackleton-s
shankman-j
sha

In [21]:
# For path_length of 3+, pairwise 
distant_pairs = pw_paths_df[pw_paths_df['s_path_length'] > 2]
len(distant_pairs)  #9085

9085

For each pair < 3, let's create overlap scores;

In [114]:
# For pw_paths_df, run 1) overlap; union; intersect  --> a list
# For pw_paths_df, run 2) i_minus_j (we have both ways --> so, I only take care about i here)


user_i_list = []
user_j_list = []
bow_count_i_list = []
bow_count_j_list = []
count_overlap_list = []
count_union_list = []
count_intersect_list = []
non_overlap_list = []

for index, row in pw_paths_df.iterrows():
    
    ## Need to check this step 
    try: 
        index_i = address_user_df.index[address_user_df["address"]==row["node_i"]].tolist()[0]
        try:
            index_j = address_user_df.index[address_user_df["address"]==row["node_j"]].tolist()[0]

            user_i_list.append(address_user_df.loc[address_user_df["address"]==row["node_i"], 'user'].item()) 
            user_j_list.append(address_user_df.loc[address_user_df["address"]==row["node_j"], 'user'].item())

            bow_i = bow_list[index_i]   # Bag of words for user i
            bow_j = bow_list[index_j]    # Bag of words for user j

            union = list(set().union(bow_i, bow_j))
            intersect = list(set(bow_i) & set(bow_j))

            bow_count_i_list.append(float(len(bow_i)))
            bow_count_j_list.append(float(len(bow_j)))
            count_overlap_list.append((float(len(intersect)))/(float(len(union))))
            count_union_list.append(float(len(union)))
            count_intersect_list.append(float(len(intersect)))

            if row['s_path_length' > 2]:
                bow_i_df = pd.DataFrame(bow_i, columns = ["word"])
                bow_i_df["intersect"] = pd.DataFrame((bow_i_df["word"].isin(bow_j)))  # if intersect = False, then, complement
                non_overlap = list(bow_i_df[bow_i_df["intersect"]==False]["word"])
                non_overlap_list.append(non_overlap)

        except: 
            print ("At " + str(index) + " , node:" + str(node_j))
            pass
    except: 
        print ("At " + str(index) + " , node:" + str(node_i))
        pass
        


At 7 , node:dana.davis@enron.com
At 11 , node:dana.davis@enron.com
At 28 , node:dana.davis@enron.com
At 69 , node:dana.davis@enron.com
At 71 , node:dana.davis@enron.com
At 77 , node:dana.davis@enron.com
At 97 , node:dana.davis@enron.com
At 116 , node:dana.davis@enron.com
At 140 , node:dana.davis@enron.com
At 143 , node:dana.davis@enron.com
At 144 , node:dana.davis@enron.com
At 159 , node:dana.davis@enron.com
At 218 , node:dana.davis@enron.com
At 221 , node:dana.davis@enron.com
At 246 , node:dana.davis@enron.com
At 252 , node:dana.davis@enron.com
At 291 , node:dana.davis@enron.com
At 317 , node:dana.davis@enron.com
At 334 , node:dana.davis@enron.com
At 336 , node:dana.davis@enron.com
At 376 , node:dana.davis@enron.com
At 399 , node:dana.davis@enron.com
At 403 , node:dana.davis@enron.com
At 404 , node:dana.davis@enron.com
At 421 , node:dana.davis@enron.com
At 448 , node:dana.davis@enron.com
At 456 , node:dana.davis@enron.com
At 458 , node:dana.davis@enron.com
At 523 , node:dana.davis@enr

At 1878 , node:albert.meyers@enron.com
At 1879 , node:albert.meyers@enron.com
At 1880 , node:albert.meyers@enron.com
At 1881 , node:albert.meyers@enron.com
At 1882 , node:albert.meyers@enron.com
At 1884 , node:albert.meyers@enron.com
At 1885 , node:albert.meyers@enron.com
At 1886 , node:albert.meyers@enron.com
At 1887 , node:albert.meyers@enron.com
At 1888 , node:albert.meyers@enron.com
At 1889 , node:albert.meyers@enron.com
At 1893 , node:dana.davis@enron.com
At 1894 , node:dana.davis@enron.com
At 1903 , node:dana.davis@enron.com
At 1918 , node:dana.davis@enron.com
At 1959 , node:dana.davis@enron.com
At 1972 , node:dana.davis@enron.com
At 1978 , node:dana.davis@enron.com
At 2004 , node:dana.davis@enron.com
At 2039 , node:dana.davis@enron.com
At 2066 , node:dana.davis@enron.com
At 2082 , node:dana.davis@enron.com
At 2090 , node:dana.davis@enron.com
At 2091 , node:dana.davis@enron.com
At 2129 , node:dana.davis@enron.com
At 2153 , node:dana.davis@enron.com
At 2157 , node:dana.davis@enron

At 3799 , node:dana.davis@enron.com
At 3840 , node:dana.davis@enron.com
At 3842 , node:dana.davis@enron.com
At 3864 , node:dana.davis@enron.com
At 3892 , node:dana.davis@enron.com
At 3896 , node:dana.davis@enron.com
At 3898 , node:dana.davis@enron.com
At 3934 , node:dana.davis@enron.com
At 3958 , node:dana.davis@enron.com
At 3977 , node:dana.davis@enron.com
At 3979 , node:dana.davis@enron.com
At 4016 , node:dana.davis@enron.com
At 4041 , node:dana.davis@enron.com
At 4045 , node:dana.davis@enron.com
At 4046 , node:dana.davis@enron.com
At 4062 , node:dana.davis@enron.com
At 4066 , node:dana.davis@enron.com
At 4082 , node:dana.davis@enron.com
At 4093 , node:dana.davis@enron.com
At 4123 , node:dana.davis@enron.com
At 4125 , node:dana.davis@enron.com
At 4131 , node:dana.davis@enron.com
At 4167 , node:dana.davis@enron.com
At 4199 , node:dana.davis@enron.com
At 4225 , node:dana.davis@enron.com
At 4241 , node:dana.davis@enron.com
At 4243 , node:dana.davis@enron.com
At 4283 , node:dana.davis@en

At 5358 , node:dana.davis@enron.com
At 5386 , node:dana.davis@enron.com
At 5400 , node:dana.davis@enron.com
At 5404 , node:dana.davis@enron.com
At 5405 , node:dana.davis@enron.com
At 5428 , node:dana.davis@enron.com
At 5467 , node:dana.davis@enron.com
At 5469 , node:dana.davis@enron.com
At 5486 , node:dana.davis@enron.com
At 5514 , node:dana.davis@enron.com
At 5551 , node:dana.davis@enron.com
At 5577 , node:dana.davis@enron.com
At 5595 , node:dana.davis@enron.com
At 5597 , node:dana.davis@enron.com
At 5637 , node:dana.davis@enron.com
At 5662 , node:dana.davis@enron.com
At 5663 , node:dana.davis@enron.com
At 5667 , node:dana.davis@enron.com
At 5673 , node:dana.davis@enron.com
At 5687 , node:dana.davis@enron.com
At 5728 , node:dana.davis@enron.com
At 5730 , node:dana.davis@enron.com
At 5759 , node:dana.davis@enron.com
At 5792 , node:dana.davis@enron.com
At 5793 , node:dana.davis@enron.com
At 5803 , node:dana.davis@enron.com
At 5818 , node:dana.davis@enron.com
At 5822 , node:dana.davis@en

At 9194 , node:dana.davis@enron.com
At 9217 , node:dana.davis@enron.com
At 9247 , node:dana.davis@enron.com
At 9253 , node:dana.davis@enron.com
At 9279 , node:dana.davis@enron.com
At 9294 , node:dana.davis@enron.com
At 9329 , node:dana.davis@enron.com
At 9354 , node:dana.davis@enron.com
At 9372 , node:dana.davis@enron.com
At 9374 , node:dana.davis@enron.com
At 9381 , node:dana.davis@enron.com
At 9435 , node:dana.davis@enron.com
At 9436 , node:dana.davis@enron.com
At 9446 , node:dana.davis@enron.com
At 9450 , node:albert.meyers@enron.com
At 9451 , node:albert.meyers@enron.com
At 9452 , node:albert.meyers@enron.com
At 9453 , node:albert.meyers@enron.com
At 9454 , node:albert.meyers@enron.com
At 9455 , node:albert.meyers@enron.com
At 9456 , node:albert.meyers@enron.com
At 9457 , node:albert.meyers@enron.com
At 9458 , node:albert.meyers@enron.com
At 9459 , node:albert.meyers@enron.com
At 9460 , node:albert.meyers@enron.com
At 9461 , node:albert.meyers@enron.com
At 9462 , node:albert.meyers

At 10779 , node:dana.davis@enron.com
At 10781 , node:dana.davis@enron.com
At 10785 , node:dana.davis@enron.com
At 10786 , node:dana.davis@enron.com
At 10800 , node:albert.meyers@enron.com
At 10801 , node:albert.meyers@enron.com
At 10802 , node:albert.meyers@enron.com
At 10803 , node:albert.meyers@enron.com
At 10804 , node:albert.meyers@enron.com
At 10805 , node:albert.meyers@enron.com
At 10806 , node:albert.meyers@enron.com
At 10807 , node:albert.meyers@enron.com
At 10808 , node:albert.meyers@enron.com
At 10809 , node:albert.meyers@enron.com
At 10810 , node:albert.meyers@enron.com
At 10811 , node:albert.meyers@enron.com
At 10812 , node:albert.meyers@enron.com
At 10813 , node:albert.meyers@enron.com
At 10814 , node:albert.meyers@enron.com
At 10815 , node:albert.meyers@enron.com
At 10816 , node:albert.meyers@enron.com
At 10817 , node:albert.meyers@enron.com
At 10818 , node:albert.meyers@enron.com
At 10819 , node:albert.meyers@enron.com
At 10820 , node:albert.meyers@enron.com
At 10821 , n

At 12212 , node:dana.davis@enron.com
At 12214 , node:dana.davis@enron.com
At 12243 , node:dana.davis@enron.com
At 12253 , node:dana.davis@enron.com
At 12278 , node:dana.davis@enron.com
At 12279 , node:dana.davis@enron.com
At 12283 , node:dana.davis@enron.com
At 12307 , node:dana.davis@enron.com
At 12328 , node:dana.davis@enron.com
At 12344 , node:dana.davis@enron.com
At 12346 , node:dana.davis@enron.com
At 12352 , node:dana.davis@enron.com
At 12403 , node:dana.davis@enron.com
At 12413 , node:dana.davis@enron.com
At 12414 , node:dana.davis@enron.com
At 12441 , node:dana.davis@enron.com
At 12455 , node:dana.davis@enron.com
At 12466 , node:dana.davis@enron.com
At 12477 , node:dana.davis@enron.com
At 12484 , node:dana.davis@enron.com
At 12529 , node:dana.davis@enron.com
At 12545 , node:dana.davis@enron.com
At 12549 , node:dana.davis@enron.com
At 12555 , node:dana.davis@enron.com
At 12556 , node:dana.davis@enron.com
At 12570 , node:dana.davis@enron.com
At 12627 , node:dana.davis@enron.com
A

At 14329 , node:dana.davis@enron.com
At 14358 , node:dana.davis@enron.com
At 14360 , node:dana.davis@enron.com
At 14369 , node:dana.davis@enron.com
At 14375 , node:dana.davis@enron.com
At 14414 , node:dana.davis@enron.com
At 14439 , node:dana.davis@enron.com
At 14443 , node:dana.davis@enron.com
At 14445 , node:albert.meyers@enron.com
At 14446 , node:albert.meyers@enron.com
At 14447 , node:albert.meyers@enron.com
At 14448 , node:albert.meyers@enron.com
At 14449 , node:albert.meyers@enron.com
At 14450 , node:albert.meyers@enron.com
At 14451 , node:albert.meyers@enron.com
At 14452 , node:albert.meyers@enron.com
At 14453 , node:albert.meyers@enron.com
At 14454 , node:albert.meyers@enron.com
At 14455 , node:albert.meyers@enron.com
At 14456 , node:albert.meyers@enron.com
At 14457 , node:albert.meyers@enron.com
At 14458 , node:albert.meyers@enron.com
At 14459 , node:albert.meyers@enron.com
At 14460 , node:albert.meyers@enron.com
At 14461 , node:albert.meyers@enron.com
At 14462 , node:albert.m

At 15782 , node:dana.davis@enron.com
At 15783 , node:dana.davis@enron.com
At 15793 , node:dana.davis@enron.com
At 15813 , node:dana.davis@enron.com
At 15838 , node:dana.davis@enron.com
At 15856 , node:dana.davis@enron.com
At 15858 , node:dana.davis@enron.com
At 15898 , node:dana.davis@enron.com
At 15916 , node:dana.davis@enron.com
At 15920 , node:dana.davis@enron.com
At 15921 , node:dana.davis@enron.com
At 15937 , node:dana.davis@enron.com
At 15938 , node:dana.davis@enron.com
At 15942 , node:dana.davis@enron.com
At 15959 , node:dana.davis@enron.com
At 15999 , node:dana.davis@enron.com
At 16001 , node:dana.davis@enron.com
At 16019 , node:dana.davis@enron.com
At 16045 , node:dana.davis@enron.com
At 16094 , node:dana.davis@enron.com
At 16110 , node:dana.davis@enron.com
At 16126 , node:dana.davis@enron.com
At 16128 , node:dana.davis@enron.com
At 16168 , node:dana.davis@enron.com
At 16191 , node:dana.davis@enron.com
At 16195 , node:dana.davis@enron.com
At 16198 , node:dana.davis@enron.com
A

At 17198 , node:dana.davis@enron.com
At 17238 , node:dana.davis@enron.com
At 17250 , node:dana.davis@enron.com
At 17271 , node:dana.davis@enron.com
At 17276 , node:dana.davis@enron.com
At 17279 , node:dana.davis@enron.com
At 17280 , node:dana.davis@enron.com
At 17284 , node:dana.davis@enron.com
At 17292 , node:dana.davis@enron.com
At 17309 , node:dana.davis@enron.com
At 17349 , node:dana.davis@enron.com
At 17351 , node:dana.davis@enron.com
At 17377 , node:dana.davis@enron.com
At 17396 , node:dana.davis@enron.com
At 17445 , node:dana.davis@enron.com
At 17468 , node:dana.davis@enron.com
At 17470 , node:dana.davis@enron.com
At 17509 , node:dana.davis@enron.com
At 17521 , node:dana.davis@enron.com
At 17544 , node:dana.davis@enron.com
At 17548 , node:dana.davis@enron.com
At 17549 , node:dana.davis@enron.com
At 17568 , node:dana.davis@enron.com
At 17598 , node:dana.davis@enron.com
At 17613 , node:dana.davis@enron.com
At 17642 , node:dana.davis@enron.com
At 17655 , node:dana.davis@enron.com
A

In [110]:
pw_paths_df['user_i'] = user_i_list 
pw_paths_df['user_j'] = user_j_list 
pw_paths_df['bow_count_i'] = bow_count_i_list 
pw_paths_df['bow_count_j'] = bow_count_j_list 
pw_paths_df['count_overlap'] = count_overlap_list 
pw_paths_df['count_union'] = count_union_list 
pw_paths_df['count_intersect'] = count_intersect_list 
pw_paths_df['non_overlap'] = non_overlap_list 

albert.meyers@enron.com


'meyers-a'

In [93]:
bow_i.isin(bow_j)
bow_i

Unnamed: 0,word
0,nicholas
1,dip
2,final
3,resolve
4,miller
5,hammering
6,trick
7,pack
8,imminent
9,go
