# Bag of words 


Data pre-processing for text analysis and pairwise word overlap

* 1) Load libraries and define functions
* 2) Import data: parsed email data and path length data
* 3) Process and store each user's vocabulary 
* 4) Calculate pairwise % overlaps 
* 5) Create pairwise lists of non-overlapping words 

## 1. Loading libraries and defining functions

In [188]:
# NLP
from subprocess import check_output
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer 
from nltk.util import ngrams 
from nltk.probability import FreqDist
import string
import os
import pandas as pd
import re
import nltk

In [189]:
def email_clean(text):
    """Strip signatures, forwarding headers and similar boilerplate from an email body.

    The substitutions below are applied one after another, each on the output
    of the previous one, and every match is replaced by the empty string.

    Args:
        text: Raw email body as a string.

    Returns:
        The email body with the boilerplate spans removed.
    """
    boilerplate_patterns = (
        # A line starting with "--" (up to and including the next newline).
        r'\n--.*?\n',
        # The literal domain name; the dot is escaped so that it does not
        # match an arbitrary character.
        r'enron\.com',
        # Forwarding / reply header blocks, from the marker up to "Subject:".
        r'Forwarded by.*?Subject:',
        r'Fwd:.*?Subject:',
        r'Fw:.*?Subject:',
        r'FW:.*?Subject:',
        r'Forwarded:.*?Subject:',
        r'From:.*?Subject:',
        # Upper-case PM / AM markers.  The lookarounds require that no letter
        # sits directly before or after, so "10:30 PM" and "10:30AM" are
        # stripped while words such as "SPAM" or "AMERICA" stay intact.
        r'(?<![A-Za-z])PM(?![A-Za-z])',
        r'(?<![A-Za-z])AM(?![A-Za-z])',
    )

    for pattern in boilerplate_patterns:
        # DOTALL lets ".*?" run across line breaks inside multi-line headers.
        text = re.sub(pattern, '', text, flags=re.DOTALL)

    return text

def clean(text):
    """Turn email text into a space-separated string of lemmatized tokens.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, stop words are dropped, and each
    remaining token is passed through the WordNet lemmatizer.

    The stop list is NLTK's English stop words plus a few corpus-specific
    entries and the values of the module-level globals ``firstname`` and
    ``lastname``; those two names must be assigned before this function is
    called.

    Args:
        text: Email text as a string.

    Returns:
        The surviving tokens, lemmatized and joined by single spaces.
    """
    ignored_words = set(stopwords.words('english'))
    # NOTE(review): "enron america corp" contains spaces, so it can never
    # equal a single whitespace-split token.
    ignored_words.update(("to","cc","subject","http","from","sent",
                 "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 
                 "enron america corp", "enron", "etc", "na", firstname, lastname))
    punctuation_chars = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()
    # Created but not used below; stemming is currently switched off.
    porter = PorterStemmer()

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.rstrip())

    kept_tokens = []
    for token in letters_only.lower().split():
        if token in ignored_words or token.isdigit():
            continue
        kept_tokens.append(token)

    without_punctuation = ''.join(
        ch for ch in " ".join(kept_tokens) if ch not in punctuation_chars
    )

    return " ".join(
        lemmatizer.lemmatize(token) for token in without_punctuation.split()
    )

## 2. Importing data 
 
I will use parsed email data; path length data; user-address key data

In [190]:
# Switch the working directory to the data folder; every relative path used
# from here on (the CSV inputs as well as the CSV/pickle outputs written
# later) is resolved against it.
path_to_email_data = 'C:/Users/Margeum/Dropbox/DS projects/05. Email data/emails_in_csv'
os.chdir(path_to_email_data)

# Read the three input tables in a fixed order: parsed emails, pairwise path
# lengths, and the address/user key table.
emails_df, pw_path_length_df, address_user_df = [
    pd.read_csv(csv_name)
    for csv_name in ('./emails_parsed.csv',
                     './pw_path_length_df.csv',
                     './address_user_df.csv')
]

In [191]:
# Keep only the pairs whose s_path_length is strictly positive.
has_positive_path = pw_path_length_df['s_path_length'] > 0
pw_paths_df = pw_path_length_df[has_positive_path]
print("Total number of pairs:" + str(pw_paths_df.shape[0]))

Total number of pairs:18090


### Cleaning text for each user

Clean each user's text and store the resulting term frequencies in a list, one frequency distribution per user.
This list will be used for TF-IDF as well.

In [192]:
# On a CPU, it takes about 30 minutes

# One frequency distribution per row of address_user_df, in row order.
# The keys are 1-tuples because the tokens go through ngrams(..., 1).
tf_list = []

for index, row in address_user_df.iterrows():

    user_i = row['user']
    print(user_i)  # progress indicator

    # clean() reads `firstname` and `lastname` as module-level globals and
    # adds them to its stop list, so they must be reassigned for every user
    # before clean() is called below.
    lastname = row['address'].split('@')[0].split('.')[-1]
    firstname = str(row['address'].split('.')[0])

    # All email bodies that belong to this user.
    user_contents = emails_df.loc[emails_df["user"] == user_i, "content"]

    # Clean every email and flatten the tokens into one list for this user.
    unlisted_text_cleaned_i = [
        token
        for text in user_contents
        for token in clean(email_clean(text)).split()
    ]
    freqdist_user_i = nltk.FreqDist(ngrams(unlisted_text_cleaned_i, 1))

    tf_list.append(freqdist_user_i)



allen-p
arnold-j
arora-h
badeer-r
bailey-s
bass-e
baughman-d
beck-s
blair-l
brawner-s
buy-r
campbell-l
carson-m
cash-m
causholli-m
corman-s
cuilla-m
dasovich-j
davis-d
dean-c
delainey-d
derrick-j
dickson-s
donoho-l
donohoe-t
dorland-c
ermis-f
farmer-d
fischer-m
forney-j
fossum-d
gang-l
gay-r
geaccone-t
germany-c
giron-d
griffith-j
grigsby-m
guzman-m
haedicke-m
hain-m
harris-s
hayslett-r
heard-m
hendrickson-s
hernandez-j
hodge-j
holst-k
horton-s
hyatt-k
hyvl-d
jones-t
kaminski-v
kean-s
keavey-p
keiser-k
king-j
kitchen-l
kuykendall-t
lavorato-j
lay-k
lenhart-m
lewis-a
lokay-m
lokey-t
love-p
lucci-p
maggi-m
mann-k
martin-t
may-l
mccarty-d
mcconnell-m
mckay-b
mckay-j
mclaughlin-e
meyers-a
motley-m
neal-s
nemec-g
panus-s
parks-j
pereira-s
perlingiere-d
pimenov-v
platter-p
presto-k
quenet-j
quigley-d
rapp-b
reitmeyer-j
richey-c
ring-a
ring-r
rogers-b
ruscitti-k
sager-e
saibi-e
salisbury-h
sanchez-m
sanders-r
scholtes-d
schoolcraft-d
schwieger-j
scott-s
semperger-c
shackleton-s
shankman-j
sha

In [193]:
# One bag of words (list of distinct words) per row of address_user_df,
# in the same order as tf_list.
bow_list = []

# tf_list was filled in iterrows() order, so it is read back by position
# rather than by index label.
for position, (index, row) in enumerate(address_user_df.iterrows()):
    user_i = row['user']
    print(user_i)  # progress indicator

    freqdist_user_i = tf_list[position]

    # Convert frequency into indicator (from frequency dict to bag of words).
    # Each key is a 1-tuple, so element 0 is the word itself.  Iterating the
    # distribution once replaces rebuilding list(freqdist_user_i) for every
    # single word.
    bow_i = [str(gram[0]) for gram in freqdist_user_i]
    bow_list.append(bow_i)
            

allen-p
arnold-j
arora-h
badeer-r
bailey-s
bass-e
baughman-d
beck-s
blair-l
brawner-s
buy-r
campbell-l
carson-m
cash-m
causholli-m
corman-s
cuilla-m
dasovich-j
davis-d
dean-c
delainey-d
derrick-j
dickson-s
donoho-l
donohoe-t
dorland-c
ermis-f
farmer-d
fischer-m
forney-j
fossum-d
gang-l
gay-r
geaccone-t
germany-c
giron-d
griffith-j
grigsby-m
guzman-m
haedicke-m
hain-m
harris-s
hayslett-r
heard-m
hendrickson-s
hernandez-j
hodge-j
holst-k
horton-s
hyatt-k
hyvl-d
jones-t
kaminski-v
kean-s
keavey-p
keiser-k
king-j
kitchen-l
kuykendall-t
lavorato-j
lay-k
lenhart-m
lewis-a
lokay-m
lokey-t
love-p
lucci-p
maggi-m
mann-k
martin-t
may-l
mccarty-d
mcconnell-m
mckay-b
mckay-j
mclaughlin-e
meyers-a
motley-m
neal-s
nemec-g
panus-s
parks-j
pereira-s
perlingiere-d
pimenov-v
platter-p
presto-k
quenet-j
quigley-d
rapp-b
reitmeyer-j
richey-c
ring-a
ring-r
rogers-b
ruscitti-k
sager-e
saibi-e
salisbury-h
sanchez-m
sanders-r
scholtes-d
schoolcraft-d
schwieger-j
scott-s
semperger-c
shackleton-s
shankman-j
sha

Let's create overlap scores and lists of non-overlapping words (i.e., pair-wise jargon lists)

In [194]:
# For every pair in pw_paths_df, compute
#   1) the sizes of the union and intersection of the two bags of words and
#      their ratio (intersection / union), and
#   2) the words of user i that do not occur in user j's bag (i minus j).
# Per the original note, pw_paths_df contains both directions of each pair,
# so only the i-minus-j direction is handled for a given row.

node_i_list = []
node_j_list = []
pw_pl_list = []
user_i_list = []
user_j_list = []
bow_count_i_list = []
bow_count_j_list = []
count_overlap_list = []
count_union_list = []
count_intersect_list = []
non_overlap_list = []
unused_ij = []      # addresses of skipped pairs (see the checks in the loop)

# address -> list of (index label, user) entries of address_user_df.  Built
# once so each pair costs two dictionary lookups instead of several scans of
# the whole table.
address_lookup = {}
for label, address, user in zip(address_user_df.index,
                                address_user_df["address"],
                                address_user_df["user"]):
    address_lookup.setdefault(address, []).append((label, user))

# Set form of every bag of words, for fast membership tests.
bow_sets = [set(bow) for bow in bow_list]

for index, row in pw_paths_df.iterrows():

    # A pair is skipped when one of its addresses does not map to exactly one
    # row of address_user_df; the offending address is recorded in unused_ij.
    # Doing this check before anything is appended keeps all result lists the
    # same length.
    matches_i = address_lookup.get(row["node_i"], [])
    if len(matches_i) != 1:
        unused_ij.append(row["node_i"])
        continue

    matches_j = address_lookup.get(row["node_j"], [])
    if len(matches_j) != 1:
        unused_ij.append(row["node_j"])
        continue

    index_i, pair_user_i = matches_i[0]
    index_j, pair_user_j = matches_j[0]

    # NOTE: bow_list is indexed with address_user_df index labels, which
    # assumes the default 0..n-1 index.
    bow_i = bow_list[index_i]   # Bag of words for user i
    bow_j = bow_list[index_j]   # Bag of words for user j
    words_i = bow_sets[index_i]
    words_j = bow_sets[index_j]

    union_size = float(len(words_i | words_j))
    intersect_size = float(len(words_i & words_j))
    # The ratio is undefined when both bags are empty; store NaN instead of
    # raising ZeroDivisionError.
    if union_size:
        overlap = intersect_size / union_size
    else:
        overlap = float('nan')

    # Word lists are only built for pairs with a path length above 2;
    # shorter paths get a placeholder string.
    if row['s_path_length'] > 2:
        # Words of user i, in their original order, that user j never uses.
        non_overlap = [word for word in bow_i if word not in words_j]
    else:
        non_overlap = 'Short path'

    # Append to every list together so that row k of each list describes the
    # same pair.
    user_i_list.append(pair_user_i)
    user_j_list.append(pair_user_j)
    node_i_list.append(row["node_i"])
    node_j_list.append(row["node_j"])
    pw_pl_list.append(row["s_path_length"])
    bow_count_i_list.append(float(len(bow_i)))
    bow_count_j_list.append(float(len(bow_j)))
    count_overlap_list.append(overlap)
    count_union_list.append(union_size)
    count_intersect_list.append(intersect_size)
    non_overlap_list.append(non_overlap)
        


In [195]:
# Assemble the per-pair result lists into one DataFrame.  Columns are added
# one by one in a fixed order; every list must have the same length as
# node_i_list.
pw_pl_df = pd.DataFrame(node_i_list, columns =['node_i'])

remaining_columns = (
    ('node_j', node_j_list),
    ('stranger_score', pw_pl_list),
    ('user_i', user_i_list),
    ('user_j', user_j_list),
    ('bow_count_i', bow_count_i_list),
    ('bow_count_j', bow_count_j_list),
    ('count_overlap', count_overlap_list),
    ('count_union', count_union_list),
    ('count_intersect', count_intersect_list),
    ('non_overlap', non_overlap_list),
)
for column_name, column_values in remaining_columns:
    pw_pl_df[column_name] = column_values


In [266]:
# Save the pairwise table, including its row index, as a CSV file in the
# current working directory.
pairwise_csv_name = 'pairwise_nw_bow_df.csv'
pw_pl_df.to_csv(pairwise_csv_name, index=True)

In [265]:
# For every user, write a pickle named "<user>_nonoverlap.pkl" holding the
# node_j and non_overlap columns of that user's pairs (rows where the user's
# address is node_i) whose stranger_score is greater than 3.
for _, user_row in address_user_df.iterrows():
    is_from_user = pw_pl_df['node_i'] == user_row["address"]
    is_distant = pw_pl_df['stranger_score'] > 3
    user_bow_df = pw_pl_df.loc[is_from_user & is_distant, ['node_j', 'non_overlap']]
    user_file_name = str(user_row["user"]) + '_nonoverlap.pkl'
    user_bow_df.to_pickle(user_file_name)