# Import libraries

In [1]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [5]:
NER = spacy.load("en_core_web_sm")

# Creating an NER Object

In [7]:
# Load the book

with open('alice_article_wiki.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

book = NER(data)

In [8]:
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

## Splitting Sentence Entities

In [10]:
df_sentences = []

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [11]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Alice, in, Wonderland, (, franchise, ),...","[Wonderland, Search Search ..."
1,"(1, Films, , Toggle, Films, subsection, ...","[1, 1.1.1, Alice in Wonderland , 1.2..."
2,"(Once, Upon, a, Time, in, Wonderland, )",[Wonderland]
3,"(2.3, Alice, 's, Wonderland, Bakery, ...","[2.3, Alice, 3, Toggle Video, 3.1, Wonderland,..."
4,"(informationCite, this, pageGet, shortened, UR...","[URLDownload, Download, PDFPrintable, Wikidata..."
5,"(A, logo, used, to, represent, the, 1951, anim...","[1951, 2010, DisneyOriginal, Wonderland, 1951)..."
6,"(Television, series, Adventures, in, Wonderlan...","[Wonderland, 1992–1995]"
7,"(Once, Upon, a, Time, in, Wonderland, (, 2013–...","[Wonderland, Alice, 2022–2024, Wonderland, Ali..."
8,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]"
9,"(Disney, Infinity, 3.0, (, 2015, ), *, AudioSo...","[Disney Infinity, 2015, Alice, Wonderland, 195..."


## Filtering Data Using the Main Characters

In [28]:
# import the characters data set as a pandas dataframe
character_df=pd.read_csv('/Users/yasersouri/Desktop/data analysis/specialization 1/Alice_Network_Analysis/character_df.csv',index_col=0)

In [29]:
# keep only the last part of the strings in chars using an rsplit() function:
character_df['character_alias'] = character_df['character'].apply(lambda x: x.rsplit(' ',1)[-1])
character_df.head(5)

Unnamed: 0,character,character_alias
0,Alice,Alice
1,The White Rabbit,Rabbit
2,The Mouse,Mouse
3,The Dodo,Dodo
4,The Lory,Lory


In [30]:
# Function to filter out entities not of interest
def filter_entity(ent_list, character_df):
    return [ent for ent in ent_list if ent in list(character_df['character_alias'])]


In [31]:
# pass the sentence entities and the characters dataframe into a filter, which will return only the entities of interest:
df_sentences['character_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, character_df))

In [32]:
# Filter out sentences that don’t have any character entities

df_sentences_filtered = df_sentences[df_sentences['character_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
119,"(Alice, in, Wonderland, (, 1966, ), Alice, Thr...","[Alice, Wonderland, 1966, Alice Through, 1983]",[Alice]
122,"(Alice, through, the, Looking, Glass, (, 1998, ))","[Alice, the Looking Glass, 1998]",[Alice]
123,"(Alice, in, Wonderland, (, 1999, ), Alice, (, ...","[Alice, Wonderland, 1999, Alice, 2009]","[Alice, Alice]"
124,"(Once, Upon, a, Time, in, Wonderland, (, 2013,...","[Wonderland, 2013, Alice, 2022, Artwork Alice,...",[Alice]
126,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]",[Alice]
127,"(Almost, Alice, (, 2010, ), "", Alice, "", "", Fo...","[Alice, 2010, Alice, Follow Me Down, Tea Party...","[Alice, Alice]"
128,"("", Just, Like, Fire, "", "", Alice, "", (, 2020,...","[Just Like Fire, Alice, 2020, the Looking Glas...","[Alice, Alice]"
130,"(Alice, no, Paint, Adventure, (, 1995, ), Alic...","[Alice no Paint Adventure, 1995, Alice, Wonder...","[Alice, Alice, Alice, Alice]"
156,"(The, Adventures, of, Ichabod, and, Mr., Toad,...","[The Adventures of Ichabod, Toad, 1949, Cinder...",[Alice]
192,"(Walt, &, El, Grupo, (, 2009, ), Miscellaneous...","[Walt & El Grupo, 2009, Walt Disney Animation ...",[Alice]


In [35]:
# Take only the first name of the characters

df_sentences_filtered['character_entities'] = df_sentences_filtered['character_entities'].apply(lambda x: [item.split()[0]
                                                                                                    for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentences_filtered['character_entities'] = df_sentences_filtered['character_entities'].apply(lambda x: [item.split()[0]


In [42]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
119,"(Alice, in, Wonderland, (, 1966, ), Alice, Thr...","[Alice, Wonderland, 1966, Alice Through, 1983]",[Alice]
122,"(Alice, through, the, Looking, Glass, (, 1998, ))","[Alice, the Looking Glass, 1998]",[Alice]
123,"(Alice, in, Wonderland, (, 1999, ), Alice, (, ...","[Alice, Wonderland, 1999, Alice, 2009]","[Alice, Alice]"
124,"(Once, Upon, a, Time, in, Wonderland, (, 2013,...","[Wonderland, 2013, Alice, 2022, Artwork Alice,...",[Alice]
126,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]",[Alice]
127,"(Almost, Alice, (, 2010, ), "", Alice, "", "", Fo...","[Alice, 2010, Alice, Follow Me Down, Tea Party...","[Alice, Alice]"
128,"("", Just, Like, Fire, "", "", Alice, "", (, 2020,...","[Just Like Fire, Alice, 2020, the Looking Glas...","[Alice, Alice]"
130,"(Alice, no, Paint, Adventure, (, 1995, ), Alic...","[Alice no Paint Adventure, 1995, Alice, Wonder...","[Alice, Alice, Alice, Alice]"
156,"(The, Adventures, of, Ichabod, and, Mr., Toad,...","[The Adventures of Ichabod, Toad, 1949, Cinder...",[Alice]
192,"(Walt, &, El, Grupo, (, 2009, ), Miscellaneous...","[Walt & El Grupo, 2009, Walt Disney Animation ...",[Alice]
