# Import libraries

In [8]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [9]:
# Load the spaCy pipeline "en_core_web_sm"; NER is later called on the raw article text
NER = spacy.load("en_core_web_sm")

# Creating an NER Object

In [13]:
# Load the book

# Read the whole article as a single string, replacing newlines with spaces.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default; errors='ignore' still silently drops bytes that fail to decode.
with open('alice_article_wiki.txt', 'r', encoding='utf-8', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

# Run the spaCy pipeline over the full text; later cells read sentences and
# entities from this object.
book = NER(data)

In [15]:
# Visualize identified entities
# Only the slice book[273:20000] is rendered here, not the whole document.

displacy.render(book[273:20000], style = "ent", jupyter = True)

In [18]:
# Import the characters data set (the file read here is characters_df.csv, although the variable is named countries_df); its first column is used as the index
countries_df=pd.read_csv(r'/Users/yasersouri/Desktop/data analysis/specialization 1/Alice_Network_Analysis/characters_df.csv',index_col=0)

In [21]:
# Render the whole document, passing an 'ents' option that lists only the 'GPE' entity label
displacy.render(book, options = {'ents': ['GPE']}, style = 'ent', jupyter = True)

## Splitting Sentence Entities

In [10]:
# Build one record per sentence of the parsed text: the sentence itself plus
# the text of every entity found inside it.
sentence_records = [
    {"sentence": sentence, "entities": [entity.text for entity in sentence.ents]}
    for sentence in book.sents
]

df_sentences = pd.DataFrame(sentence_records)

In [11]:
# Preview the first ten sentences together with their extracted entities
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Alice, 's, Adventures, in, Wonderland, ...",[Wonderland - Wikipedia ...
1,"(Edit, links, , ArticleTalk, ,...","[English, Tools Tools]"
2,"(informationCite, this, pageGet, shortened, UR...","[URLDownload, Download, PDFPrintable]"
3,"(Wikimedia, CommonsWikiquoteWikisourceWikidata...","[Wikimedia, Wikipedia, 1865, Lewis Carroll ""Al..."
4,"(For, other, uses, ,, see, Alice, in, Wonderla...","[Alice, Wonderland]"
5,"(Alice, 's, Adventures, in, Wonderland, First,...","[Alice's Adventures, Wonderland, 1865)AuthorLe..."
6,"(It, details, the, story, of, a, girl, named, ...",[Alice]
7,"(It, is, seen, as, an, example, of, the, liter...",[]
8,"(The, artist, Sir, John, Tenniel, provided, 42...","[John Tenniel, 42]"
9,"(It, received, positive, reviews, upon, releas...",[Victorian]


## Filtering Data Using the Main Characters

In [13]:
# Import the characters data set as a pandas DataFrame; the file's first column is used as the index
character_df=pd.read_csv('/Users/yasersouri/Desktop/data analysis/specialization 1/Alice_Network_Analysis/characters_df.csv',index_col=0)

In [14]:
# Alias = the last space-separated word of the character's name
# (a single-word name is its own alias).
character_df['character_alias'] = character_df['character'].map(
    lambda full_name: full_name.rsplit(' ', 1)[-1]
)

In [15]:
# Function to filter out entities not of interest
def filter_entity(ent_list, character_df):
    """Keep only the entities that match a known character alias.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts found in one sentence.
    character_df : pandas.DataFrame
        Frame with a 'character_alias' column listing the aliases of interest.

    Returns
    -------
    list
        The entries of ``ent_list`` that appear in
        ``character_df['character_alias']``, in their original order and with
        repeats preserved.
    """
    # Build the alias lookup once per call rather than once per entity.
    aliases = set(character_df['character_alias'])
    return [ent for ent in ent_list if ent in aliases]


In [16]:
# For every sentence, keep only the entities that are known character aliases
# by running its entity list through filter_entity with the characters dataframe.
df_sentences['character_entities'] = df_sentences['entities'].apply(
    filter_entity, args=(character_df,)
)

In [17]:
# Keep only the sentences that mention at least one character

has_character = df_sentences['character_entities'].map(len).gt(0)
df_sentences_filtered = df_sentences.loc[has_character]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
791,"(Alice, and, the, Land, that, Wonders, (, 2020...","[Alice, 2020, Alice, 2021, Wonderland, 1962]","[Alice, Alice]"
792,"(Alice, in, Wonderland, or, What, 's, a, Nice,...","[Alice, Wonderland, 1966]",[Alice]
793,"(Alice, in, Wonderland, (, 1966, ), Alice, Thr...","[Alice, Wonderland, 1966, Alice Through, 1983]",[Alice]
796,"(Alice, through, the, Looking, Glass, (, 1998, ))","[Alice, the Looking Glass, 1998]",[Alice]
797,"(Alice, in, Wonderland, (, 1999, ), Alice, (, ...","[Alice, Wonderland, 1999, Alice, 2009]","[Alice, Alice]"
798,"(Once, Upon, a, Time, in, Wonderland, (, 2013,...","[Wonderland, 2013, Alice, 2022, Artwork Alice,...",[Alice]
800,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]",[Alice]
801,"(Almost, Alice, (, 2010, ), "", Alice, "", "", Fo...","[Alice, 2010, Alice, Follow Me Down, Tea Party...","[Alice, Alice]"
802,"("", Just, Like, Fire, "", "", Alice, "", (, 2020,...","[Just Like Fire, Alice, 2020, the Looking Glas...","[Alice, Alice]"
804,"(Alice, no, Paint, Adventure, (, 1995, ), Alic...","[Alice no Paint Adventure, 1995, Alice, Wonder...","[Alice, Alice, Alice, Alice]"


In [18]:
# Keep only the sentences that mention at least one character
# (this cell repeats the filter from the previous cell)

has_character = df_sentences['character_entities'].map(len).gt(0)
df_sentences_filtered = df_sentences.loc[has_character]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
791,"(Alice, and, the, Land, that, Wonders, (, 2020...","[Alice, 2020, Alice, 2021, Wonderland, 1962]","[Alice, Alice]"
792,"(Alice, in, Wonderland, or, What, 's, a, Nice,...","[Alice, Wonderland, 1966]",[Alice]
793,"(Alice, in, Wonderland, (, 1966, ), Alice, Thr...","[Alice, Wonderland, 1966, Alice Through, 1983]",[Alice]
796,"(Alice, through, the, Looking, Glass, (, 1998, ))","[Alice, the Looking Glass, 1998]",[Alice]
797,"(Alice, in, Wonderland, (, 1999, ), Alice, (, ...","[Alice, Wonderland, 1999, Alice, 2009]","[Alice, Alice]"
798,"(Once, Upon, a, Time, in, Wonderland, (, 2013,...","[Wonderland, 2013, Alice, 2022, Artwork Alice,...",[Alice]
800,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]",[Alice]
801,"(Almost, Alice, (, 2010, ), "", Alice, "", "", Fo...","[Alice, 2010, Alice, Follow Me Down, Tea Party...","[Alice, Alice]"
802,"("", Just, Like, Fire, "", "", Alice, "", (, 2020,...","[Just Like Fire, Alice, 2020, the Looking Glas...","[Alice, Alice]"
804,"(Alice, no, Paint, Adventure, (, 1995, ), Alic...","[Alice no Paint Adventure, 1995, Alice, Wonder...","[Alice, Alice, Alice, Alice]"


## Creating Relationships

In [20]:
# Defining relationships

# A window slides over the original sentence labels; characters whose mentions
# follow one another inside the same window are recorded as a relationship.
# NOTE: .loc label slicing includes both endpoints, so an offset of 5 makes each
# window cover up to six consecutive sentence labels (i .. i + 5).
WINDOW_OFFSET = 5
relationships = []  # one {"source", "target"} dict per adjacent character pair

# An empty frame has no last label; fall back to 0 so the loop simply does not
# run instead of failing with an IndexError.
last_label = df_sentences_filtered.index[-1] if len(df_sentences_filtered) else 0

for i in range(last_label):
    end_i = min(i + WINDOW_OFFSET, last_label)
    window_entities = df_sentences_filtered.loc[i:end_i, 'character_entities']

    # Flatten the per-sentence entity lists in a single pass
    characters_list = [name for names in window_entities for name in names]

    # Remove duplicated characters that are next to each other
    char_unique = [
        name for j, name in enumerate(characters_list)
        if j == 0 or name != characters_list[j - 1]
    ]

    # Pair every character with its successor; fewer than two names yields no pairs
    for a, b in zip(char_unique, char_unique[1:]):
        relationships.append({"source": a, "target": b})

In [21]:
# Turn the collected {"source", "target"} records into a dataframe, one row per pair
relationships_df = pd.DataFrame(relationships)

relationships_df

Unnamed: 0,source,target
0,Alice,Caterpillar
1,Alice,Caterpillar
2,Caterpillar,Alice
3,Caterpillar,Alice
4,Alice,Caterpillar
...,...,...
142,Alice,Hatter
143,Hatter,Alice
144,Alice,Hatter
145,Hatter,Alice


In [22]:
# Relationships are treated as undirected: sort the two names within each row
# so that a->b and b->a become the same (source, target) combination before
# the pairs are grouped and counted.
pair_columns = relationships_df.columns
sorted_pairs = np.sort(relationships_df.values, axis=1)
relationships_df = pd.DataFrame(sorted_pairs, columns=pair_columns)
relationships_df.head(5)

Unnamed: 0,source,target
0,Alice,Caterpillar
1,Alice,Caterpillar
2,Alice,Caterpillar
3,Alice,Caterpillar
4,Alice,Caterpillar


In [23]:
# Give every pair occurrence a weight of 1, then sum the weights per
# (source, target) pair, keeping the order in which pairs first appear.
relationships_df = (
    relationships_df
    .assign(value=1)
    .groupby(["source", "target"], sort=False, as_index=False)
    .sum()
)

relationships_df.head(10)

Unnamed: 0,source,target,value
0,Alice,Caterpillar,18
1,Alice,Duchess,24
2,Alice,Hatter,26
3,Dormouse,Hatter,17
4,Duchess,Gryphon,5
5,Alice,Gryphon,17
6,Duck,Lory,6
7,Alice,Lory,4
8,Alice,Mouse,7
9,Alice,Dodo,12


In [37]:
# Save the aggregated relationships to CSV (note: absolute, machine-specific output path)
relationships_df.to_csv(r'/Users/yasersouri/Desktop/data analysis/specialization 1/Alice_Network_Analysis//relationships_df.csv')