In [1]:
import json
import pandas as pd

In [2]:
path = 'Sender-receiver-weight.csv'
sendReceiverdf = pd.read_csv(path)

In [3]:
sendReceiverdf

Unnamed: 0,Sender,Receiver,Weight
0,Richard Shapiro,Leslie Lawner,2
1,Richard Shapiro,Sandra McCubbin,2
2,Richard Shapiro,Jeff Dasovich,97
3,Richard Shapiro,Linda Robertson,7
4,Richard Shapiro,Paul Kaufman,7
...,...,...,...
4503,Benjamin Rogers,Randall Martin,4
4504,Benjamin Rogers,Michelle Kelso,2
4505,Kerri Thompson,Kate Symes,684
4506,Kerri Thompson,Carla Hoffman,15


In [4]:
import networkx as nx

G_comm = nx.DiGraph()


for idx, row in sendReceiverdf.iterrows():
    G_comm.add_edge(row['Sender'], row['Receiver'], weight=row['Weight'])

In [5]:
G_comm

<networkx.classes.digraph.DiGraph at 0x120b12d10>

### Create the network of communications

In [6]:
path = 'sender-text.csv'
senderTextdf = pd.read_csv(path)

In [7]:
senderTextdf

Unnamed: 0.1,Unnamed: 0,Sender,All texts
0,0,Kerri Thompson,think price deal ; missing deal jeff richter s...
1,1,Chris Germany,nan; De Man change spelling Druckett Dracut ow...
2,2,Evelyn Metoyer,Hi Kate Mike Swerzbin ref Prebon counterparty ...
3,3,Gerald Nemec,Attached form; Attached form; Article relevant...
4,4,Tana Jones,connection execution new ISDA Master Agreement...
5,5,Mark E Haedicke,efficiently EWS Mark Forwarded Mark E Haedick...
6,6,James D Steffes,Jeff helpful feedback prior starting deal fun...
7,7,John J Lavorato,Original Message From Guerriero Michael mailto...
8,8,Richard B Sanders,kidding Columbus day found PUC Jeff DasovichEN...
9,9,Richard Shapiro,Leslieafter seeing point writing extremely r...


In [8]:
senderTextdf['All texts'] = senderTextdf['All texts'].astype(str)

In [9]:
# text in sender
text_list = senderTextdf['All texts'].tolist()

In [10]:
len(text_list)

28

In [11]:
# Calculate the similarity between the abstracts using n-grams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3))
X = vectorizer.fit_transform(text_list)
similarity_matrix = cosine_similarity(X, X)

In [12]:
# Create a graph from the similarity matrix
# Draw an edge between papers with similarity in the fourth quartile

# Calculate the 75th percentile of the similarity matrix
import numpy as np
percentile_75 = np.percentile(similarity_matrix, 75)
print(f"75th percentile = {percentile_75}")

G_sim = nx.Graph()
for i in range(similarity_matrix.shape[0]):
    for j in range(i+1, similarity_matrix.shape[1]):
        if similarity_matrix[i, j] > percentile_75:
            G_sim.add_edge(i, j)

75th percentile = 0.44058040025146533


In [13]:
# Save the graph to a CSV file
nx.write_edgelist(G_sim, 'similarity_graph_emailDataset.csv', delimiter=',')

In [14]:
unique_elements = list(set(sendReceiverdf['Sender'].tolist() + sendReceiverdf['Receiver'].tolist()))

In [16]:
node_list = list(unique_elements)

In [17]:
node_list

['Christopher Smith',
 'Shawna Johnson',
 'W David Duran',
 'Michael M Driscoll',
 'Fred Cohagan',
 'Kirsty Hogarth',
 'Ann Ballard',
 'Jeff G Slaughter',
 'Mary Nell Browning',
 'Neil Hong',
 'Janet Edwards',
 'Mary Jo Johnson',
 'Ruth Concannon',
 'Jean Mrha',
 'Derryl Cleaveland',
 'Bob Butts',
 'Grant Oh',
 'Bradley Samuelson',
 'Nony Flores',
 'Tanya Tamarchenko',
 'Keegan Farrell',
 'Andrew Kelemen',
 'Eric Letke',
 'Scott Tackett',
 'Claire Broido',
 'Sandi M Braband',
 'Joe Steele',
 'Emily Butler',
 'Sheila Tweed',
 'Kathy Franz',
 'Jackie Gentle',
 'Deirdre McCaffrey',
 'Aga Rehman',
 'Jarek Dybowski',
 'Hillary Mack',
 'Mary Perkins',
 'Mark E Lindsey',
 'Susan D Trevino',
 'Leaf Harasin',
 'JoAnn Holloway',
 'Dennis Benevides',
 'Peter Nassab',
 'Jeffrey T Hodge',
 'Vance Meyer',
 'Wendy Conwell',
 'Jay Ferry',
 'Chris Gaskill',
 'Pushkar Shahi',
 'Brad Nebergall',
 'Simon Shih',
 'Clint Walden',
 'Sandra F Brawner',
 'Vikas Dwivedi',
 'Richard B Sanders',
 'Robert Superty'

In [18]:
# Iterate over all nodes and find the neighbors of each node in G_comm and G_sim
intersection_dict = {}
for node in node_list:
    print(f"Processing node {node}")
    # Check if node is in both graphs
    # TODO: Figure out why some nodes are not in both graphs
    if node not in G_comm.nodes or node not in G_sim.nodes:
        print(f"Node {node} is not in both graphs")
        continue
    else:
        print(f"Node {node} is in both graphs")
    network_comm_neighbors = set(G_comm.neighbors(node))
    network_sim_neighbors = set(G_sim.neighbors(node))
    intersection = network_comm_neighbors.intersection(network_sim_neighbors)
    union = network_comm_neighbors.union(network_sim_neighbors)
    jaccard_similarity = len(intersection) / len(union)
    intersection_dict[node] = {
        'intersection': intersection,
        'union': union,
        'jaccard_similarity': jaccard_similarity
    }

# # Save the intersection dictionary to a JSON file
# with open('./data/intersection_dict.json', 'w') as f:
#     json.dump(intersection_dict, f, indent=4)

Processing node Christopher Smith
Node Christopher Smith is not in both graphs
Processing node Shawna Johnson
Node Shawna Johnson is not in both graphs
Processing node W David Duran
Node W David Duran is not in both graphs
Processing node Michael M Driscoll
Node Michael M Driscoll is not in both graphs
Processing node Fred Cohagan
Node Fred Cohagan is not in both graphs
Processing node Kirsty Hogarth
Node Kirsty Hogarth is not in both graphs
Processing node Ann Ballard
Node Ann Ballard is not in both graphs
Processing node Jeff G Slaughter
Node Jeff G Slaughter is not in both graphs
Processing node Mary Nell Browning
Node Mary Nell Browning is not in both graphs
Processing node Neil Hong
Node Neil Hong is not in both graphs
Processing node Janet Edwards
Node Janet Edwards is not in both graphs
Processing node Mary Jo Johnson
Node Mary Jo Johnson is not in both graphs
Processing node Ruth Concannon
Node Ruth Concannon is not in both graphs
Processing node Jean Mrha
Node Jean Mrha is not