In [2]:
# System tools
import os

# Data analysis
import pandas as pd
from collections import Counter
from itertools import combinations 
from tqdm import tqdm

# NLP
import spacy
nlp = spacy.load("en_core_web_sm")

# drawing
import networkx as nx
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,20)

In [3]:
input_file = os.path.join("data","fake_or_real_news.csv")
data = pd.read_csv(input_file)

In [4]:
# Selecting only the REAL labelled data 
real_df = data[data["label"]=="REAL"]["text"]

In [5]:
# Extracting entities

text_entities = []

for text in tqdm(real_df):
    # create temporary list 
    tmp_entities = []
    # create doc object
    doc = nlp(text)
    # for every named entity
    for entity in doc.ents:
        # if that entity is a person
        if entity.label_ == "PERSON":
            # append to temp list
            tmp_entities.append(entity.text)
    # append temp list to main list
    text_entities.append(tmp_entities)

100%|██████████| 3171/3171 [08:21<00:00,  6.33it/s]


In [6]:
type(text_entities)

list

In [7]:
edgelist = []
# iterate over every document
for text in text_entities:
    # use itertools.combinations() to create edgelist
    edges = list(combinations(text, 2))
    # for each combination - i.e. each pair of 'nodes'
    for edge in edges:
        # append this to final edgelist
        edgelist.append(tuple(sorted(edge)))

In [8]:
edgelist

[('John F. Kerry', 'Laurent Fabius'),
 ('Francois Hollande', 'John F. Kerry'),
 ('John F. Kerry', 'Kerry'),
 ('John F. Kerry', 'Obama'),
 ('John F. Kerry', 'Kerry'),
 ('John F. Kerry', 'Kerry'),
 ('Benjamin Netanyahu', 'John F. Kerry'),
 ('Jane Hartley', 'John F. Kerry'),
 ('John F. Kerry', 'Victoria Nuland'),
 ('Eric H. Holder Jr.', 'John F. Kerry'),
 ('John F. Kerry', 'Narendra Modi'),
 ('John F. Kerry', 'Kerry'),
 ('Francois Hollande', 'Laurent Fabius'),
 ('Kerry', 'Laurent Fabius'),
 ('Laurent Fabius', 'Obama'),
 ('Kerry', 'Laurent Fabius'),
 ('Kerry', 'Laurent Fabius'),
 ('Benjamin Netanyahu', 'Laurent Fabius'),
 ('Jane Hartley', 'Laurent Fabius'),
 ('Laurent Fabius', 'Victoria Nuland'),
 ('Eric H. Holder Jr.', 'Laurent Fabius'),
 ('Laurent Fabius', 'Narendra Modi'),
 ('Kerry', 'Laurent Fabius'),
 ('Francois Hollande', 'Kerry'),
 ('Francois Hollande', 'Obama'),
 ('Francois Hollande', 'Kerry'),
 ('Francois Hollande', 'Kerry'),
 ('Benjamin Netanyahu', 'Francois Hollande'),
 ('Franco

In [9]:
counted_edges = []
for key, value in Counter(edgelist).items():
    source = key[0]
    target = key[1]
    weight = value
    counted_edges.append((source, target, weight))

In [11]:
counted_edges = pd.DataFrame(counted_edges, columns=["nodeA", "nodeB", "weights"])

In [12]:
outpath = os.path.join("data", "edgelist.csv")

In [14]:
counted_edges.to_csv(outpath,index=False)