In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy # For visualizing the entities

import networkx as nx
import matplotlib.pyplot as plt

In [6]:
# Install the small English spaCy pipeline only if it is not already available.
# Going through spaCy's Python API (rather than a bare `!python` shell call)
# keeps the install tied to the interpreter this kernel is running, and the
# guard makes the cell cheap and quiet on a Restart & Run All.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 960.0 kB/s eta 0:00:14
     --------------------------------------- 0.1/12.8 MB 907.3 kB/s eta 0:00:15
     ---------------------------------------- 0.2/12.8 MB 1.0 MB/s eta 0:00:13
      --------------------------------------- 0.2/12.8 MB 1.0 MB/s eta 0:00:13
      --------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
      --------------------------------------- 0.3/12.8 MB 1.0 MB/s eta 0:00:13
     - -------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.4/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.4/12.8 MB 1.1 MB/s eta 0:00:12
     - --------------------------------

In [2]:
# Build the small English pipeline once; `NER` is the object called on raw book text later on
NER = spacy.load(name="en_core_web_sm")

# Strategy

## Named Entity recognition
**Named Entity Recognition** (NER) is a natural language processing technique that identifies and classifies named entities in text into predefined categories, such as people, organizations, and locations.
* NER using [spaCy](https://spacy.io/api/entityrecognizer)
* spaCy [English language model](https://spacy.io/models/en)
* Web-scraping from [Witcher WIKI](https://witcher.fandom.com/wiki/Witcher_Wiki)

## Preprocessing
1. Tokenize each book into a list of sentences and label each sentence with the names of the characters appearing in it.
2. Define a window size (the maximum number of sentences separating two sentences) and assume that two characters are related if they are mentioned in sentences that fall within this window of each other.

## Resources
1. [Books](https://github.com/dworschak/Witcher/tree/master)

# Load Books

In [3]:
import os

# os.scandir yields entries in arbitrary order and keeps a directory handle
# open until the iterator is exhausted or closed. Use it as a context manager
# so the handle is released, and sort by file name so that positional access
# (e.g. all_books[0]) picks the same book on every platform and run.
with os.scandir('data') as data_dir:
    all_books = sorted(data_dir, key=lambda entry: entry.name)
all_books

[<DirEntry 'B - The Sword of Destiny.txt'>,
 <DirEntry 'C - The Last Wish.txt'>,
 <DirEntry 'E - something ends something begins.txt'>,
 <DirEntry 'I - Blood of Elves.txt'>,
 <DirEntry 'II - Times of Contempt.txt'>,
 <DirEntry 'III - Baptism of Fire.txt'>,
 <DirEntry 'IV - The Tower of the Swallow.txt'>,
 <DirEntry 'V - The Lady of the Lake.txt'>]

In [4]:
# Run the NER pipeline over the first book in the listing.
book = all_books[0]
# Read inside a context manager so the file handle is closed as soon as the
# text is in memory instead of being left for the garbage collector.
# NOTE(review): no encoding is passed, so the platform default applies —
# confirm it matches how the book files are encoded.
with open(book) as book_file:
    book_text = book_file.read()
book_doc = NER(book_text)

In [5]:
# Show the recognized entities inline, limited to the leading 2000 tokens of the parsed book
displacy.render(
    book_doc[:2000],
    style="ent",
    jupyter=True,
)