## Data Preparation - Game of Thrones - Extracting Relationships

**Quick Notes**

This Notebook loops through all scripts to extract relationships between the specified characters.

The extracted relationships form a 'source' - 'target' edge list and are saved as a '.csv' file for visualization.

**Sources**

- [Pandas Documentation](https://pandas.pydata.org/docs/)

- [Numpy Documentation](https://numpy.org/doc/)

- [Python os](https://docs.python.org/3/library/os.html)

- [Python re](https://docs.python.org/3/library/re.html)

### Import the necessary tools

In [22]:
import pandas as pd
import numpy as np

## 1 - Data

In [23]:
import os

### 1.1 - Scripts 

In [24]:
# Every entry under 'Data' whose name contains 'Season',
# ordered alphabetically by entry name
all_seasons = sorted(
    (entry for entry in os.scandir('Data') if 'Season' in entry.name),
    key=lambda entry: entry.name,
)

In [25]:
# One inner list per season (same order as all_seasons), holding the
# entries of that season whose name contains '.txt'
season_episodes = [
    [entry for entry in os.scandir(season_dir) if '.txt' in entry.name]
    for season_dir in all_seasons
]

In [26]:
# Example: a season is one inner list of season_episodes
test_season = season_episodes[0]
# Example: an episode is one element of a season's list
test_episode = test_season[0]
# NOTE: episodes within a season are not ordered
# (only the ordering of the seasons matters here)

### 1.2 - Characters

In [27]:
# Load the character table: first row is the header, first column the index
character_df = pd.read_csv("characters.csv", index_col=0, header=0)
# Name columns used later on: identities, first names and nicknames
identities, firstnames, nicknames = (
    character_df[column]
    for column in ('Character', 'Character_firstname', 'Character_nickname')
)


In [28]:
# Names to scan the scripts for (first names & nicknames).
# They are mapped back to identities later on to avoid duplication.
characters = []


def get_filter_chars(characters, names):
    """Append the string form of each usable entry of `names` to `characters`.

    Every element of `names` is converted with `str`; elements whose string
    form is 'nan' are skipped. `characters` is extended in place and nothing
    is returned.
    """
    characters.extend(
        text for text in map(str, names) if text != 'nan'
    )


In [29]:
# Fill the scan list in place: nicknames are added first, then first names
get_filter_chars(characters, nicknames)
get_filter_chars(characters, firstnames)

In [30]:
# Total number of names to scan for (162 chars + 4 nicknames)
print(len(characters))

166


## 2 - Process scripts

In [31]:
from collections import defaultdict
import re

### 2.1 - Processing Functions

In [32]:
def load_episode(episode):
    """Read an episode script into a one-column dataframe, one row per line.

    Parameters
    ----------
    episode : str or path-like
        Anything accepted by the built-in ``open`` (for example an
        ``os.DirEntry`` as produced by ``os.scandir``).

    Returns
    -------
    pandas.DataFrame
        The lines as returned by ``readlines`` (line endings included),
        one line per row.
    """
    # The context manager closes the file handle as soon as reading is done
    # instead of leaving it open until garbage collection.
    with open(episode) as script_file:
        script = script_file.readlines()
    return pd.DataFrame(script)

In [33]:
# Collect the appearances of all given characters in one episode
def appearances(episode_df, characters):
    """Return a defaultdict(list) filled by `get_lines` for every character.

    `get_lines` is called once per entry of `characters`, in order, with the
    same dictionary, so all hits accumulate in a single mapping.
    """
    found = defaultdict(list)
    for character in characters:
        get_lines(episode_df, character, found)
    return found

# Helper function to record the lines on which a name appears
def get_lines(episode_df, name, dict):
    """Record every script line that matches `name`.

    Parameters
    ----------
    episode_df : pandas.DataFrame
        Script lines, one per row, in the first column.
    name : str
        Passed to ``re`` as a regular-expression pattern and searched
        anywhere in the line (it is not escaped and not anchored).
    dict : mapping of int -> list
        Updated in place: for each matching line the matched text is
        appended under the 1-based line number. Must create missing keys
        on access (e.g. ``defaultdict(list)``).

    Returns
    -------
    None
    """
    # An empty frame has no column to read from
    if episode_df.empty:
        return

    # Compile once instead of resolving the pattern on every line
    pattern = re.compile(name)

    # Iterate over every row so the last line of the script is scanned too;
    # enumerate from 1 because the keys are 1-based line numbers.
    for line_number, line in enumerate(episode_df.iloc[:, 0], start=1):
        match = pattern.search(line)
        if match is not None:
            dict[line_number].append(str(match.group(0)))

In [34]:
# Turns an appearances dict into a dataframe for further use
def load_dict(dict):
    """Return `dict` as a dataframe with one row per key, sorted by key.

    The keys become the index (they represent script lines) and the list
    stored under each key is spread over the columns of that row.
    """
    return pd.DataFrame.from_dict(dict, orient='index').sort_index()

In [35]:
# How to process scripts (example on a single episode)
# 1) read the script lines into a dataframe
test_df = load_episode(test_episode)
# 2) record which of the scanned names appear on which line
test_dict = appearances(test_df, characters)
# 3) turn the appearances dict into a dataframe sorted by line
t_dict_df = load_dict(test_dict)

## 3 - Extract Relationships

### 3.1 - Extraction Functions

In [36]:
# Helper function to help identify characters
def find_indentity(name):
    """Map a nickname or first name to the matching entry of `character_df`.

    Nicknames are checked before first names. On a hit, the value in the
    first column of the first matching row is returned; if neither column
    contains `name`, `name` itself is returned unchanged.
    """
    for column in ('Character_nickname', 'Character_firstname'):
        matches = character_df[character_df[column] == name].values
        if matches.size > 0:
            return matches[0][0]
    return name

In [37]:
def window_crawler(dict_df, window=3):
    """Build a 'source'/'target' edge list from an appearances dataframe.

    A window of `window` consecutive rows is slid over `dict_df` one row at
    a time. Inside each window the names are read row by row, mapped with
    `find_indentity`, directly repeated identities are collapsed, and every
    remaining neighbouring pair becomes one edge. Windows overlap, so the
    same pair can be emitted more than once.

    Parameters
    ----------
    dict_df : pandas.DataFrame
        One row per script line with appearances; cells hold names or a
        missing value (None/NaN) as padding.
    window : int, optional
        Number of consecutive rows per window (default 3).

    Returns
    -------
    pandas.DataFrame
        Columns 'source' and 'target'; the two names of each row are sorted
        so that a pair always appears in the same order. The columns are
        present even when no edge was found.
    """
    columns = ["source", "target"]
    relationships = []

    # Windows are positional slices, so iterate over row positions rather
    # than over the index labels (which are script line numbers).
    for start in range(len(dict_df)):
        cur_window = dict_df.iloc[start:start + window].values.tolist()

        # Identities of the current window, in reading order
        char_list = []
        for row in cur_window:
            for name in row:
                # Skip padding cells (None/NaN) and empty strings
                if pd.notna(name) and name:
                    char_list.append(find_indentity(name))

        # Remove duplicated chars next to each other
        char_unique = [
            char for pos, char in enumerate(char_list)
            if pos == 0 or char != char_list[pos - 1]
        ]

        # Each pair of neighbours is one relationship
        for a, b in zip(char_unique, char_unique[1:]):
            relationships.append({"source": a, "target": b})

    # Keep the expected columns even when nothing was found, so that a later
    # groupby on 'source'/'target' still works.
    if not relationships:
        return pd.DataFrame(columns=columns)

    relationships_df = pd.DataFrame(relationships, columns=columns)
    # Sort each row so (A, B) and (B, A) end up as the same edge
    return pd.DataFrame(
        np.sort(relationships_df.values, axis=1),
        columns=relationships_df.columns,
    )

In [38]:
# How to extract relations (example: edge list for the test episode's appearances)
t_relations_df = window_crawler(t_dict_df)

### 3.2 - Create Weighted Edges

In [39]:
def weighted_edge(relationships_df):
    """Collapse an edge list into weighted edges.

    Parameters
    ----------
    relationships_df : pandas.DataFrame
        Edge list with 'source' and 'target' columns, one row per observed
        relationship. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ('source', 'target') pair, in order of first
        appearance, with a 'value' column holding the number of occurrences.
    """
    # assign() returns a new frame, so the caller's dataframe does not
    # silently gain a 'value' column.
    weighted = relationships_df.assign(value=1)
    # Aggregate values: summing the 1s counts the occurrences of each pair
    return weighted.groupby(["source", "target"], sort=False, as_index=False).sum()

In [40]:
# How to create weighted edges (example: count each source/target pair of the test episode)
weighted_edges = weighted_edge(t_relations_df)

### 3.3 - Join Multiple Weighted Edge Dataframes

In [41]:
# Function to concat all weighted edges from a collection of dataframes
def concat_dataframes(dataframes):
    """Merge several weighted edge lists into one.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Frames with 'source' and 'target' columns plus the columns to sum.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ('source', 'target') pair, in order of first
        appearance, with the remaining columns summed over all inputs.
        An empty input yields an empty frame with the columns
        'source', 'target' and 'value'.
    """
    dataframes = list(dataframes)
    # Nothing to merge: return an empty edge list instead of failing
    if not dataframes:
        return pd.DataFrame(columns=["source", "target", "value"])

    # A single concat avoids re-copying the accumulated frame per input
    season_relations = pd.concat(dataframes)
    return season_relations.groupby(["source", "target"], sort=False, as_index=False).sum()

## 4 - Application & Export

### 4.1 - Loop through all seasons

In [42]:
# Seasons are numbered from 1, in the order they appear in season_episodes
for counter, season in enumerate(season_episodes, start=1):
    # Weighted edge lists of this season, one per episode
    dataframes = []
    for episode in season:
        # 2 - Process the episode and return appearances dataframe
        script = load_episode(episode)
        ap_dict = appearances(script, characters)
        ap_dict_df = load_dict(ap_dict)
        # 3 - Extract relationships and count each source/target pair
        rel_df = window_crawler(ap_dict_df)
        source_target_weight = weighted_edge(rel_df)
        # Append frames
        dataframes.append(source_target_weight)
    # Merge all episodes of the season into one weighted edge list
    cur_season = concat_dataframes(dataframes)
    # Name and save csv (written to the current working directory)
    name = f'Season_{counter}_rel_df.csv'
    cur_season.to_csv(name)