# Initialization

## Install Dependencies

In [1]:
# %%bash
# # keep only wikigraphs folder from deepmind-research repo
# if [ ! -d "wikigraphs" ]; then
#     git clone https://github.com/google-deepmind/deepmind-research.git
#     mv deepmind-research/wikigraphs/wikigraphs .
#     mv deepmind-research/wikigraphs/scripts wikigraphs  # move scripts folder inside wikigraphs module
# fi
# rm -rf deepmind-research
# exit

In [2]:
# %%bash
# PACKAGES="chromadb==0.6.3 jraph==0.0.6.dev0 tqdm==4.67.1 sentence-transformers==4.1.0 accelerate==1.7.0 huggingface_hub==0.33.0 spacy==3.8.6"
# if [[ -f "$(which mamba)" ]]; then
#     mamba install $PACKAGES -y
# elif [[ -f "$(which conda)" ]]; then
#     conda install -c conda-forge $PACKAGES -y
# else
#     pip install $PACKAGES
# fi

## Imports

In [3]:
import os
import json
import shutil
import requests
import tarfile
import zipfile
import gzip
import lzma

import chromadb
import ollama

import pandas as pd

from pathlib import Path
from itertools import chain
from tqdm.auto import tqdm
from IPython.display import display

from langchain.text_splitter import RecursiveCharacterTextSplitter

from wikigraphs.scripts.freebase_preprocess import pair_graphs_with_wikitext
from wikigraphs.data import io_tools, paired_dataset

## Configs

In [4]:
pd.set_option('display.max_colwidth', None)

## Global Variables

In [5]:
DATA_DIR = "data/"
FREEBASE_URLS = [
    ("https://docs.google.com/uc?export=download&id=1uuSS2o72dUCJrcLff6NBiLJuTgSU-uRo", "max256.tar"),
    ("https://docs.google.com/uc?export=download&id=1nOfUq3RUoPEWNZa2QHXl2q-1gA5F6kYh", "max512.tar"),
    ("https://docs.google.com/uc?export=download&id=1uuJwkocJXG1UcQ-RCH3JU96VsDvi7UD2", "max1024.tar")
]
FREEBASE_DIR = "data/freebase/"

WIKITEXT_URLS = [
    ("https://wikitext.smerity.com/wikitext-103-v1.zip", "wikitext-103-v1.zip"),
    ("https://wikitext.smerity.com/wikitext-103-raw-v1.zip", "wikitext-103-raw-v1.zip")
]

WIKITEXT_DIR = "data/wikitext-103"
WIKITEXT_RAW_DIR = WIKITEXT_DIR + "-raw"

WIKIGRAPHS_DATA_DIR = "data/wikigraphs/"

GENERATED_GRAPHS_SAMPLE_SIZE = 10

LLM_MODEL = "llama3.1:8b"

## Utilities

In [6]:
def chmod_recursive(path, mode):
    os.chmod(path, mode)
    for root, dirs, files in os.walk(path):
        for item in chain(dirs, files):
            os.chmod(os.path.join(root, item), mode)

def download_file(url, filepath):
    if os.path.exists(filepath):
        print(f"File {filepath} already exists. Skipping download.")
        return

    response = requests.get(url, stream=True)
    response.raise_for_status()

    with open(filepath, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    print(f"File {filepath} downloaded successfully.")

def create_tar(source_item, save_path=".", compression=""):
    """Creates a tar archive of the specified item (file or folder)."""

    if compression not in ("gz", "bz2", "xz", ""):
        raise ValueError(f"Invalid compression method: {compression}")

    source_name = Path(source_item).name
    archive_name = f"{source_name}.tar{'.' + compression if compression else ''}"
    archive_path = f"{save_path}/{archive_name}"

    if os.path.exists(archive_path):
        print(f"Archive '{archive_name}' already exists. Skipping creation.")
        return

    try:
        mode = f"w:{compression}" if compression else "w"
        with tarfile.open(archive_path, mode) as tar:
            tar.add(source_item, arcname=source_name)
        print(f"Archive '{archive_name}' created successfully.")
    except FileNotFoundError as e:
        print(f"Error: Folder '{source_item}' not found.")
        raise e
    except Exception as e:
        print(f"An error occurred: {e}")
        raise e

def extract_tar(filename, save_path=".", under_new_folder=False):
    filepath = Path(save_path) / Path(filename)
    if not filepath.exists():
        raise ValueError(f"File `{filename}` does not exist under path `{save_path}`.")

    split = filename.rsplit(".", maxsplit=1)[-1]
    compression = split if split in ("gz", "bz2", "xz") else ""

    if split == "tar":
        compression = ""
    elif split in ("gz", "bz2", "xz"):
        compression = split
    else:
        raise ValueError(f"Invalid file extension: {split}")

    try:
        mode = f"r:{compression}" if compression else "r"
        with tarfile.open(filepath, mode) as tar:
            # properly remove .tar.XXX or .tar ext
            extract_folder_name = filename.rsplit(".", maxsplit=2 if compression else 1)[0]
            extract_folder = ( save_path if not under_new_folder else
                os.path.join(save_path, extract_folder_name) )
            os.makedirs(extract_folder, exist_ok=True)
            for member in tar:
                target_path = os.path.join(extract_folder, member.name)
                if os.path.exists(target_path):
                    # BUG: Displays Skipping even if they do not already exist
                    #      Must be regarding the way it loops through the members
                    # UPDATE: For some weird reason, this has stopped. Maybe
                    #         google colab local storage had an issue
                    print(f"Skipping {member.name}, already exists")
                else:
                    tar.extract(member, extract_folder)
            print(f"File {filename} extracted successfully.")
    except tarfile.ReadError as e:
        print(f"Error: Could not extract {filename}. It may not be a valid tar file.")
        raise e

def extract_zip(filename, save_path=".", under_new_folder=False):
    filepath = Path(save_path) / Path(filename)
    if not filepath.exists():
        raise ValueError(f"File `{filename}` does not exist under path `{save_path}`.")

    splits = filename.rsplit(".", maxsplit=1)
    if splits[-1] != "zip":
        raise ValueError(f"Invalid file extension: {splits[-1]}. Expected 'zip'")

    try:
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            extract_folder_name = splits[0]
            extract_folder = (save_path if not under_new_folder else
                              os.path.join(save_path, extract_folder_name))
            os.makedirs(extract_folder, exist_ok=True)

            for member_name in zip_ref.namelist():
                target_path = os.path.join(extract_folder, member_name)
                if os.path.exists(target_path):
                    print(f"Skipping {member_name}, already exists")
                else:
                    zip_ref.extract(member_name, extract_folder)
            print(f"File {filename} extracted successfully.")
    except zipfile.BadZipFile as e:
        print(f"Error: Could not extract {filename}. It may not be a valid zip file.")
        raise e

# Loading

In [7]:
def concatenate(input_files, output_file, compress=True):
    if os.path.exists(output_file):
        print(f"File {output_file} already exists. Skipping concatenation.")
    else:
        with open(output_file, 'w') as outfile:
            for input_file in input_files:
                with open(input_file, 'r') as infile:
                    outfile.write(infile.read())
        print(f"Concatenated {input_files} files into {output_file}")

    if compress:
        with open(output_file, 'rb') as f_in:
            compressed_file = output_file + '.gz'
            if os.path.exists(compressed_file):
                print(f"File {compressed_file} already exists. Skipping compression.")
                return
            with gzip.open(compressed_file, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
                # f_out.writelines(f_in)  # alternative
                print(f"Compressed {output_file} into {compressed_file}")

def extract_dataset(filename, save_path=".", merge_dataset_types=True):
    extract_tar(filename, save_path=save_path, under_new_folder=True)
    extract_folder = filename.rsplit(save_path, maxsplit=1)[0]

    # Extract gz files within the extracted folder
    for root, _, files in os.walk(extract_folder):
        extracted_file_paths = []
        for file in files:
            if file.endswith(".gz"):
                gz_file_path = os.path.join(root, file)
                try:
                    with gzip.open(gz_file_path, 'rb') as gz_file:
                        extracted_file_path = gz_file_path[:-3] # remove .gz extension
                        if os.path.exists(extracted_file_path):
                            print(f"File {extracted_file_path} already exists. Skipping extraction.")
                            continue
                        with open(extracted_file_path, 'wb') as extracted_file:
                            extracted_file.write(gz_file.read())
                    extracted_file_paths.append(extracted_file_path)
                    print(f"Extracted: {gz_file_path} to {extracted_file_path}")
                except gzip.BadGzipFile as e:
                    print(f"Error: Could not extract {gz_file_path}. It may not be a valid gzip file.")
                    raise e
        if merge_dataset_types:
            concatenate(extracted_file_paths, os.path.join(extract_folder, "whole"))
        break  # in case subfolders exist inside max1024 it won't go deeper to search them (i.e forces only one iteration)

## Freebase Data

In [8]:
os.makedirs(FREEBASE_DIR, exist_ok=True)

for url, filename in FREEBASE_URLS:
    download_file(url, FREEBASE_DIR + filename)
    extract_dataset(filename, save_path=FREEBASE_DIR)

File data/freebase/max256.tar already exists. Skipping download.
Skipping test.gz, already exists
Skipping train.gz, already exists
Skipping valid.gz, already exists
File max256.tar extracted successfully.
File data/freebase/max512.tar already exists. Skipping download.
Skipping test.gz, already exists
Skipping train.gz, already exists
Skipping valid.gz, already exists
File max512.tar extracted successfully.
File data/freebase/max1024.tar already exists. Skipping download.
Skipping test.gz, already exists
Skipping train.gz, already exists
Skipping valid.gz, already exists
File max1024.tar extracted successfully.


## WikiText-103 Data

In [9]:
for url, filename in WIKITEXT_URLS:
    download_file(url, DATA_DIR + filename)
    extract_zip(filename, save_path=DATA_DIR)

File data/wikitext-103-v1.zip already exists. Skipping download.
Skipping wikitext-103/, already exists
Skipping wikitext-103/wiki.test.tokens, already exists
Skipping wikitext-103/wiki.valid.tokens, already exists
Skipping wikitext-103/wiki.train.tokens, already exists
File wikitext-103-v1.zip extracted successfully.
File data/wikitext-103-raw-v1.zip already exists. Skipping download.
Skipping wikitext-103-raw/, already exists
Skipping wikitext-103-raw/wiki.test.raw, already exists
Skipping wikitext-103-raw/wiki.valid.raw, already exists
Skipping wikitext-103-raw/wiki.train.raw, already exists
File wikitext-103-raw-v1.zip extracted successfully.


## Creation of WikiGraphs Dataset
which is a Text-to-Graph dataset, pairing the Freebase subgraphs with their corresponding WikiText-103

In [10]:
for subset in ['train', 'valid', 'test']:
    pair_graphs_with_wikitext(subset,
                              WIKITEXT_DIR,
                              FREEBASE_DIR + "max256/",
                              WIKIGRAPHS_DATA_DIR + "max256/")

## WikiGraphs Dataset (a.k.a Paired Dataset)

In [11]:
paired_dataset.DATA_ROOT = WIKIGRAPHS_DATA_DIR
dataset = paired_dataset.ParsedDataset(
               subset = 'train',
               shuffle_data = False,
               data_dir = None,
               version = 'max256')

In [12]:
parsed_pairs = list(dataset)

In [13]:
len(parsed_pairs)

23431

In [14]:
pair = parsed_pairs[0]

### Parsed Pair Example

In [15]:
example_pair = parsed_pairs[0]

In [16]:
example_pair.center_node

'ns/m.0ddd390'

In [17]:
example_pair.title

'Valkyria_Chronicles_III'

In [18]:
print(example_pair.text[:1000])


 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series n

In [19]:
example_pair.graph.edges()[:10]

[(0, 1, 'key/wikipedia.en'),
 (0, 2, 'ns/type.object.name'),
 (0, 3, 'ns/organization.organization.date_founded'),
 (0, 4, 'ns/cvg.cvg_developer.games_developed'),
 (0, 5, 'ns/common.topic.description'),
 (6, 7, 'ns/type.object.name'),
 (6, 8, 'key/wikipedia.en'),
 (6, 4, 'ns/cvg.cvg_genre.games'),
 (6, 9, 'ns/common.topic.description'),
 (10, 11, 'key/wikipedia.en')]

__Note__: One needs to use `graph.nodes()` to get the node freebase id from the node index id by indexing on the returned list

# Preprocessing

## Ground Truth Graphs

In [20]:
truth_graphs_dfs = []  # ground truth graph dataframes
truth_graphs_dfs_dict = {}  # center_node - ground truth graph dataframe pairs

for pair in parsed_pairs:
    g = pair.graph
    df = pd.DataFrame(g.edges(), columns=["src", "tgt", "edge"])
    # NOTE: "src" corresponds to the ID of the "subject" node and
    #       "tgt" corresponds to the ID of the "object" node.
    
    # Append the values of the nodes as new columns
    df["subject"] = df["src"].apply(lambda node_id: g.nodes()[node_id])
    df["object"] = df["tgt"].apply(lambda node_id: g.nodes()[node_id])

    # Creating Subject-Predicate-Object only columns dataframe
    df = df[["subject", "edge", "object"]]
    df.rename(columns={"edge":"predicate"}, inplace=True)

    truth_graphs_dfs.append(df)
    truth_graphs_dfs_dict[pair.center_node] = df

# Concatenate the dfs using the center node of each df as index value for all its rows
truth_df = pd.concat(truth_graphs_dfs_dict, names=["center_node"])

### Examples

In [21]:
truth_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,subject,predicate,object
center_node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ns/m.0ddd390,0,ns/m.0f9q9z,key/wikipedia.en,"""Sega_AM1"""
ns/m.0ddd390,1,ns/m.0f9q9z,ns/type.object.name,"""Sega Wow"""
ns/m.0ddd390,2,ns/m.0f9q9z,ns/organization.organization.date_founded,"""2000"""
ns/m.0ddd390,3,ns/m.0f9q9z,ns/cvg.cvg_developer.games_developed,ns/m.0ddd390
ns/m.0ddd390,4,ns/m.0f9q9z,ns/common.topic.description,"""Sega Wow was a division of Japanese video game developer Sega."""


In [22]:
truth_df.loc["ns/m.0ddd390"].head()

Unnamed: 0,subject,predicate,object
0,ns/m.0f9q9z,key/wikipedia.en,"""Sega_AM1"""
1,ns/m.0f9q9z,ns/type.object.name,"""Sega Wow"""
2,ns/m.0f9q9z,ns/organization.organization.date_founded,"""2000"""
3,ns/m.0f9q9z,ns/cvg.cvg_developer.games_developed,ns/m.0ddd390
4,ns/m.0f9q9z,ns/common.topic.description,"""Sega Wow was a division of Japanese video game developer Sega."""


### Freebase Sub-Graphs Dataframe Creation

In [23]:
def create_dataframe(graphs):
    df_list = []

    for g in graphs:

        df = pd.DataFrame(g.edges(), columns=["src", "tgt", "edge"])
        # NOTE: "src" corresponds to the ID of the "subject" node and
        #       "tgt" corresponds to the ID of the "object" node.
        # Append the values of the nodes as new columns

        df["subject"] = df["src"].apply(lambda node_id: g.nodes()[node_id])
        df["object"] = df["tgt"].apply(lambda node_id: g.nodes()[node_id])

        spo_df = df[["subject", "edge", "object"]].copy()
        spo_df.rename(columns={"edge":"predicate"}, inplace=True)

        df_list.append(spo_df)

    return pd.concat(df_list, ignore_index=True)

In [24]:
freebase_graphs_generator = io_tools.graphs_from_file("data/freebase/max1024/whole.gz")
freebase_graphs = list(freebase_graphs_generator)
len(freebase_graphs)

23851

In [25]:
# Convert from io_tools.Graph to paired_dataset.Graph
freebase_graphs = [paired_dataset.Graph.from_edges(g.edges) for g in freebase_graphs]

In [26]:
freebase_df = create_dataframe(freebase_graphs)

#### Show Duplicates

In [27]:
freebase_df_dups = freebase_df[freebase_df.duplicated(keep=False)].sort_values(["subject", "predicate", "object"])
display(freebase_df_dups)

Unnamed: 0,subject,predicate,object
351415,ns/g.113qbncfz,ns/common.topic.description,"""Rock & Alternative French Radio | www.lagrosseradio.com"""
834396,ns/g.113qbncfz,ns/common.topic.description,"""Rock & Alternative French Radio | www.lagrosseradio.com"""
351416,ns/g.113qbncfz,ns/type.object.name,"""LA GROSSE RADIO"""
834398,ns/g.113qbncfz,ns/type.object.name,"""LA GROSSE RADIO"""
494921,ns/g.11b6mp5yt4,ns/common.topic.description,"""Katie Bell is a fictional character from the Harry Potter book series written by J. K. Rowling."""
...,...,...,...
155920,ns/m.0zz4ncc,ns/type.object.name,"""Battle"""
156465,ns/m.0zz4ncc,ns/type.object.name,"""Battle"""
156891,ns/m.0zz4ncc,ns/type.object.name,"""Battle"""
157409,ns/m.0zz4ncc,ns/type.object.name,"""Battle"""


#### Remove Duplicates

For some reason, the max1024 version of freebase has duplicates inside. These are removed below.

TODO: Find the reason why duplicates exist

In [28]:
# Before duplicates removal
freebase_df_len_before_dups_removal = len(freebase_df)
print(freebase_df_len_before_dups_removal)

1312745


In [29]:
freebase_df.drop_duplicates(inplace=True)

In [30]:
# After duplicates removal
freebase_df_len_after_dups_removal = len(freebase_df)
print(freebase_df_len_after_dups_removal)

743965


In [31]:
# Duplicates removed
freebase_df_dups_removed = freebase_df_len_before_dups_removal - freebase_df_len_after_dups_removal
print(freebase_df_dups_removed)

568780


## Unique Entity Names

In [32]:
object_name_df = freebase_df[freebase_df["predicate"] == "ns/type.object.name"].drop_duplicates()

# List of unique enitity names, defined as the value (object) of the
# "ns/type.object.name" (predicate) of the corresponding entity's ID (subject)
unique_enitity_names = object_name_df["object"].drop_duplicates().tolist()

In [33]:
len(unique_enitity_names)

81296

### Unique Entity ID to Name and vice-versa Dictionaries

In [34]:
id_to_name = object_name_df.set_index("subject")["object"].to_dict()

In [35]:
print("Number of unique IDs:")
print(len(id_to_name))

Number of unique IDs:
92128


In [36]:
name_to_id = {}
for _id, name in id_to_name.items():
    name_to_id.setdefault(name, []).append(_id)

In [37]:
print("Number of unique names:")
print(len(name_to_id))

Number of unique names:
81296


In [38]:
name_to_id_1to1 = {k: v[0] for k, v in name_to_id.items() if len(v) == 1}

In [39]:
print("Number of unique names that correspond to single entity ID:")
print(len(name_to_id_1to1))

Number of unique names that correspond to single entity ID:
76885


In [40]:
name_to_id_1toN = {k: v for k, v in name_to_id.items() if len(v) > 1}

In [41]:
print("Number of unique names that correspond to multiple entity IDs:")
print(len(name_to_id_1toN))

Number of unique names that correspond to multiple entity IDs:
4411


In [42]:
name_to_id_1toN

{'"Millennium"': ['ns/m.036qs6', 'ns/m.03ydg8b'],
 '"Closure"': ['ns/m.02nw7wx', 'ns/m.04f_98r', 'ns/m.047jr2'],
 '"Oregon"': ['ns/m.05kj_', 'ns/m.0k2lfmq', 'ns/m.0sf4d'],
 '"Portland"': ['ns/m.02frhbc', 'ns/m.0c4kv', 'ns/m.0smnk'],
 '"Manchester"': ['ns/m.052bw', 'ns/m.0xhj2'],
 '"Charleston"': ['ns/m.0gkgp', 'ns/m.0fw2f'],
 '"Japan"': ['ns/m.03_3d', 'ns/m.01c7s7f'],
 '"Athlete"': ['ns/m.02h7c7b', 'ns/m.01445t', 'ns/m.02gr2z'],
 '"Profession"': ['ns/m.02h67g7', 'ns/m.02h65n3'],
 '"Celebrity"': ['ns/m.02_7frv', 'ns/m.06z14x_'],
 '"Height"': ['ns/m.025d7wc', 'ns/m.0wjcd9q'],
 '"Bourbon County"': ['ns/m.0nq49', 'ns/m.0nnv5'],
 '"Species"': ['ns/m.06zf0', 'ns/m.032_76'],
 '"Initial release date"': ['ns/m.0jsg52', 'ns/m.0kpv77', 'ns/m.021y6pg'],
 '"Comedy"': ['ns/m.05p553', 'ns/m.01m7g01'],
 '"Politician"': ['ns/m.0fj9f', 'ns/m.02xlh55'],
 '"Lake County"': ['ns/m.0p028', 'ns/m.0nvd8'],
 '"Papa Stour"': ['ns/m.02ftdw', 'ns/m.029wlqr'],
 '"SpongeBob SquarePants"': ['ns/m.07vqnc', 'ns/m.01b9t

#### Examples of Unique Names to Multiple Entity IDs

**Athlete Example**

In [43]:
example_name_1toN = "Athlete"
example_ids_1toN = name_to_id_1toN[f'"{example_name_1toN}"']
print(example_ids_1toN)

['ns/m.02h7c7b', 'ns/m.01445t', 'ns/m.02gr2z']


In [44]:
# Data Exploration
for example_id in example_ids_1toN:
    display(freebase_df.query(f'subject=="{example_id}"'))

Unnamed: 0,subject,predicate,object
475,ns/m.02h7c7b,ns/type.object.name,"""Athlete"""
476,ns/m.02h7c7b,ns/common.topic.description,"""An athlete is a person who participates in a sport for personal fitness or as part of an amateur or professional competition or contest. \nFor more information, please see the Freebase wiki page on Athlete."""
477,ns/m.02h7c7b,ns/freebase.type_profile.instance_count,"""271978"""
478,ns/m.02h7c7b,ns/type.object.id,"""/sports/pro_athlete"""
479,ns/m.02h7c7b,ns/freebase.object_hints.best_hrid,"""/sports/pro_athlete"""


Unnamed: 0,subject,predicate,object
33537,ns/m.01445t,ns/common.topic.alias,"""Sportsman"""
33538,ns/m.01445t,ns/common.topic.description,"""An athlete or sportsperson, sportsman or sportswoman is a person who competes in one or more sports that involve physical strength, speed and/or endurance. Athletes may be professionals or amateurs.\nMost professional athletes have particularly well-developed physiques obtained by extensive physical training and strict exercise accompanied by a strict dietary regimen.\nThe word \""athlete\"" is a romanization of the Greek: άθλητὴς, athlētēs, one who participates in a contest; from ἄθλος, áthlos, or ἄθλον, áthlon, a contest or feat. It is the general term for all participants in any physical sport; its application to those who participate in other activities, such as horse riding or driving, is somewhat controversial."""
33539,ns/m.01445t,key/wikipedia.en,"""Athlete"""
33540,ns/m.01445t,ns/type.object.name,"""Athlete"""
33541,ns/m.01445t,ns/people.profession.people_with_this_profession,ns/m.02r6pqd
...,...,...,...
1298324,ns/m.01445t,ns/people.profession.people_with_this_profession,ns/m.01jzhl
1305767,ns/m.01445t,ns/people.profession.people_with_this_profession,ns/m.03gzkgc
1305924,ns/m.01445t,ns/people.profession.people_with_this_profession,ns/m.08887m
1309808,ns/m.01445t,ns/people.profession.people_with_this_profession,ns/m.0j43qj4


Unnamed: 0,subject,predicate,object
316187,ns/m.02gr2z,key/wikipedia.en,"""Athlete_$0028group$0029"""
316188,ns/m.02gr2z,ns/music.artist.origin,ns/m.02z6bk
316189,ns/m.02gr2z,ns/common.topic.alias,"""Athelete"""
316190,ns/m.02gr2z,ns/music.artist.active_start,"""2002-03"""
316191,ns/m.02gr2z,ns/common.topic.description,"""Athlete are an English indie rock band formed in Deptford, London, comprising Joel Pott, Carey Willetts, Stephen Roberts and Tim Wanstall.\nThe band had a brief period of high-profile domestic success in which their debut album Vehicles & Animals was a platinum seller in 2005 and Mercury Music Prize nomination. It was followed up by Tourist which reached No. 1 and sold over 500,000 copies allowing this album to also go platinum. Since then the band has continued to release records on regular basis. Their subsequent two albums Beyond the Neighbourhood and Black Swan sees the band exploring different styles, influenced by the works of artists such as Nick Cave & the Bad Seeds."""
316192,ns/m.02gr2z,ns/type.object.name,"""Athlete"""


**Observation**: As one can see from the above example for the name "Athlete", the first and second entities are of the same meaning while the third one has a different meaning (i.e rock band instead of the sportsperson)

**SpongeBob SquarePants Example**

In [45]:
example_name_1toN = "SpongeBob SquarePants"
example_ids_1toN = name_to_id_1toN[f'"{example_name_1toN}"']
print(example_ids_1toN)

['ns/m.07vqnc', 'ns/m.01b9tt']


In [46]:
# Data Exploration
for example_id in example_ids_1toN:
    display(freebase_df.query(f'subject=="{example_id}"'))

Unnamed: 0,subject,predicate,object
1373,ns/m.07vqnc,key/wikipedia.en,"""SpongeBob"""
1374,ns/m.07vqnc,ns/tv.tv_program.number_of_episodes,"""355"""
1375,ns/m.07vqnc,ns/common.topic.description,"""SpongeBob SquarePants is an American animated television series created by marine biologist and animator Stephen Hillenburg for Nickelodeon. The series chronicles the adventures and endeavors of the title character and his various friends in the fictional underwater city of Bikini Bottom. The series' popularity has made it a media franchise, as well as the highest rated series to ever air on Nickelodeon, and the most distributed property of MTV Networks. The media franchise has generated $8 billion in merchandising revenue for Nickelodeon.\nMany of the ideas for the series originated in an unpublished, educational comic book titled The Intertidal Zone, which Hillenburg created in the mid-1980s. He began developing SpongeBob SquarePants into a television series in 1996 upon the cancellation of Rocko's Modern Life, and turned to Tom Kenny, who had worked with him on that series, to voice the titular character. SpongeBob was originally going to be named SpongeBoy, and the series was to be called SpongeBoy Ahoy!, but these were both changed, as the name was already trademarked."""
1376,ns/m.07vqnc,ns/tv.tv_program.air_date_of_first_episode,"""1999-05-01"""
1377,ns/m.07vqnc,ns/tv.tv_program.seasons,ns/m.043rdbm
1378,ns/m.07vqnc,ns/tv.tv_program.episodes,ns/m.07749b
1379,ns/m.07vqnc,ns/common.topic.alias,"""SpongeBob"""
1380,ns/m.07vqnc,ns/tv.tv_program.episode_running_time,"""30"""
1381,ns/m.07vqnc,ns/tv.tv_program.currently_in_production,"""true"""
1382,ns/m.07vqnc,ns/tv.tv_program.number_of_seasons,"""9"""


Unnamed: 0,subject,predicate,object
1044310,ns/m.01b9tt,key/wikipedia.en,"""SpongeBob_SquarePants_$0028character$0029"""
1044311,ns/m.01b9tt,ns/common.topic.notable_types,ns/m.02nsjl9
1044312,ns/m.01b9tt,ns/common.topic.description,"""SpongeBob SquarePants is the titular character and protagonist of the American animated television series of the same name. He is voiced by actor and comedian Tom Kenny, and first appeared on television in the series' pilot episode \""Help Wanted\"" on May 1, 1999.\nSpongeBob SquarePants was created and designed by cartoonist and marine biologist Stephen Hillenburg shortly after the cancellation of Rocko's Modern Life in 1996. Hillenburg intended to create a series about an over-optimistic sponge that annoys other characters. Hillenburg compared the concept to Laurel and Hardy and Pee-wee Herman. As he drew the character, he decided that a \""squeaky-clean square\"" fit the concept. His name is derived from \""Bob the Sponge\"", the host of Hillenburg's comic strip The Intertidal Zone that he originally drew in the 1980s while teaching marine biology to visitors of the Ocean Institute. SpongeBob is a naïve and goofy sea sponge who works as a fry cook in the fictional underwater town of Bikini Bottom.\nThe character has received positive critical response from media critics and achieved popularity with both children and adults, though he has been involved in public controversy."""
1044313,ns/m.01b9tt,ns/fictional_universe.fictional_character.gender,ns/m.05zppz
1044314,ns/m.01b9tt,ns/type.object.name,"""SpongeBob SquarePants"""


**Observation**: As one can see from the above example for the name "SpongeBob SquarePants", the first and second entities are different. One being the show and the other being the name of the character which the show got its name from. In this example, we do not have different entities with the same meaning.

**Pilot Example**

In [47]:
example_name_1toN = "Pilot"
example_ids_1toN = name_to_id_1toN[f'"{example_name_1toN}"']
print(example_ids_1toN)

['ns/m.04m_mcw', 'ns/m.05dpjrg', 'ns/m.0h7mfg6', 'ns/m.0kffn39', 'ns/m.0bwlvv', 'ns/m.0kbdl4b', 'ns/m.0j3fv7g', 'ns/m.06yznhd', 'ns/m.0dkt96', 'ns/m.05syyz4', 'ns/m.0kxflf7', 'ns/m.07mmh1y', 'ns/m.03d6l9d', 'ns/m.0gzb7m', 'ns/m.0fqp_h', 'ns/m.0c0293', 'ns/m.02qnz0j', 'ns/m.011nsb7_', 'ns/m.0c3vwgr', 'ns/m.02nw2nc', 'ns/m.0dpg5_', 'ns/m.0krtwc0', 'ns/m.0795xq1', 'ns/m.0gj8cyj', 'ns/m.05dqwkp', 'ns/m.05dpzp8', 'ns/m.05dtp0f', 'ns/m.0mwpj6b', 'ns/m.0260p1z', 'ns/m.0x0n5mq', 'ns/m.0808gzp', 'ns/m.090ldx', 'ns/m.0w5zm5g', 'ns/m.07kf5_9', 'ns/m.0ddh9q1', 'ns/m.0269nfn', 'ns/m.0bzd9w', 'ns/m.07k8c34', 'ns/m.0hgpv3v', 'ns/m.05q88m9', 'ns/m.0gjdv5', 'ns/m.0kb0_g3', 'ns/m.01xwlgy', 'ns/m.025z3j0', 'ns/m.0bjtfn', 'ns/m.0hnb4n5', 'ns/m.06gt51c', 'ns/m.0fcygc', 'ns/m.05dlk4d', 'ns/m.06yp_4v', 'ns/m.0jfyh85', 'ns/m.0703lxy', 'ns/m.0kd3pbb']


In [48]:
# Data Exploration
for example_id in example_ids_1toN:
    display(freebase_df.query(f'subject=="{example_id}"'))

Unnamed: 0,subject,predicate,object
1708,ns/m.04m_mcw,ns/tv.tv_series_episode.next_episode,ns/m.0g9x0r3
1709,ns/m.04m_mcw,ns/tv.tv_series_episode.writer,ns/m.09pl3f
1710,ns/m.04m_mcw,ns/common.topic.description,"""When all the passengers on a plane die, FBI agent Olivia Dunham investigates the events and her partner almost dies. A desperate Olivia looks for help from Dr. Walter Bishop who has been institutionalized. Olivia, Dr. Bishop and his son Peter begin to discover what really happened on Flight 627 and begin to uncover a larger truth."""
1711,ns/m.04m_mcw,ns/tv.tv_series_episode.air_date,"""2008-09-09"""
1712,ns/m.04m_mcw,ns/type.object.name,"""Pilot"""
1713,ns/m.04m_mcw,ns/tv.tv_series_episode.season,ns/m.06yzmjr
1714,ns/m.04m_mcw,ns/common.topic.notable_types,ns/m.01xrzlb
1715,ns/m.04m_mcw,ns/tv.tv_series_episode.series,ns/m.03cf9ly
1716,ns/m.04m_mcw,key/wikipedia.en,"""Pilot_$0028Fringe$0029"""
1717,ns/m.04m_mcw,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
51009,ns/m.05dpjrg,ns/tv.tv_series_episode.season_number,"""1"""
51010,ns/m.05dpjrg,ns/tv.tv_series_episode.series,ns/m.02tp4s
51011,ns/m.05dpjrg,ns/tv.tv_series_episode.production_number,"""101"""
51012,ns/m.05dpjrg,ns/common.topic.description,"""Film critic Jay Sherman falls in love with actress Valerie Fox. Jay's family and friends worry that Valerie is using him to get a good movie review. Does she truly care for him?"""
51013,ns/m.05dpjrg,ns/tv.tv_series_episode.air_date,"""1994-01-26"""
51014,ns/m.05dpjrg,ns/tv.tv_series_episode.writer,ns/m.03xpf_7
51015,ns/m.05dpjrg,key/wikipedia.en,"""Pilot_$0028The_Critic$0029"""
51016,ns/m.05dpjrg,ns/tv.tv_series_episode.episode_number,"""1"""
51017,ns/m.05dpjrg,ns/type.object.name,"""Pilot"""


Unnamed: 0,subject,predicate,object
71427,ns/m.0h7mfg6,ns/tv.tv_series_episode.director,ns/m.07g7h2
71428,ns/m.0h7mfg6,ns/tv.tv_series_episode.episode_number,"""1"""
71429,ns/m.0h7mfg6,ns/common.topic.description,"""A therapist and his family move across the country to escape their troubled past but quickly discover that their new home comes with its own horrific baggage."""
71430,ns/m.0h7mfg6,ns/type.object.name,"""Pilot"""
71431,ns/m.0h7mfg6,ns/tv.tv_series_episode.next_episode,ns/m.0h989bt
71432,ns/m.0h7mfg6,ns/tv.tv_series_episode.writer,ns/m.07g7h2
71433,ns/m.0h7mfg6,ns/tv.tv_series_episode.season_number,"""1"""
71434,ns/m.0h7mfg6,ns/tv.tv_series_episode.season,ns/m.0kcvvyh
71435,ns/m.0h7mfg6,ns/tv.tv_series_episode.series,ns/m.0h3mh3q
71436,ns/m.0h7mfg6,ns/tv.tv_series_episode.air_date,"""2011-10-05"""


Unnamed: 0,subject,predicate,object
92533,ns/m.0kffn39,ns/tv.tv_series_episode.episode_number,"""1"""
92534,ns/m.0kffn39,ns/tv.tv_series_episode.season_number,"""1"""
92535,ns/m.0kffn39,ns/tv.tv_series_episode.series,ns/m.0c2c5c
92536,ns/m.0kffn39,ns/common.topic.description,"""Arthur \""AC\"" Curry is a keen environmentalist with secret abilities that are heightened when he is around water. Upon learning of his true identity Arthur begins to investigate why random people who disappeared in the Bermuda Triangle years earlier are suddenly beginning to resurface."""
92537,ns/m.0kffn39,ns/type.object.name,"""Pilot"""
92538,ns/m.0kffn39,ns/tv.tv_series_episode.air_date,"""2006-07-24"""


Unnamed: 0,subject,predicate,object
97088,ns/m.0bwlvv,ns/common.topic.description,"""Meet the Bluths. In the pilot, Michael decides to take a job in Arizona after being passed over by George Sr. for head of the Bluth Company. George Sr. is jailed, and the family realizes they need Michael."""
97089,ns/m.0bwlvv,ns/tv.tv_series_episode.season_number,"""1"""
97090,ns/m.0bwlvv,ns/type.object.name,"""Pilot"""
97091,ns/m.0bwlvv,ns/tv.tv_series_episode.writer,ns/m.06brp0
97092,ns/m.0bwlvv,ns/tv.tv_series_episode.next_episode,ns/m.0bpvy2
97093,ns/m.0bwlvv,key/wikipedia.en,"""Extended_Pilot"""
97094,ns/m.0bwlvv,ns/tv.tv_series_episode.air_date,"""2003-11-02"""
97095,ns/m.0bwlvv,ns/tv.tv_series_episode.series,ns/m.02hct1
97096,ns/m.0bwlvv,ns/tv.tv_series_episode.episode_number,"""1"""
97097,ns/m.0bwlvv,ns/tv.tv_series_episode.season,ns/m.02kzkl_


Unnamed: 0,subject,predicate,object
114975,ns/m.0kbdl4b,ns/tv.tv_series_episode.season_number,"""1"""
114976,ns/m.0kbdl4b,ns/type.object.name,"""Pilot"""
114977,ns/m.0kbdl4b,ns/tv.tv_series_episode.episode_number,"""1"""
114978,ns/m.0kbdl4b,ns/common.topic.description,"""When Detective Michael Britten regains consciousness following his family's car accident, he is told that his wife Hannah perished, but that his teen son, Rex, has survived. As he tries to put the pieces of his life back together he awakens again in a world in which his wife is very much alive, but his son Rex died in the accident. In order to keep both of his loved ones alive he begins living two dueling realities. Trying to regain some normalcy Michael turns to his work solving crimes in both worlds with the help of two different partners, Detective Isaiah \""Bird\"" Freeman and Detective Efrem Vega. He begins to solve impossible cases by using his dueling realities to gain unique perspectives and link clues that cross over from world to world. Helping Michael to navigate his new existence are his bureau assigned therapists Dr. Evans and Dr. Lee.."""
114979,ns/m.0kbdl4b,ns/tv.tv_series_episode.air_date,"""2012-03-01"""
114980,ns/m.0kbdl4b,ns/tv.tv_series_episode.series,ns/m.0gty91r


Unnamed: 0,subject,predicate,object
114981,ns/m.0j3fv7g,ns/tv.tv_series_episode.air_date,"""2012-03-01"""
114982,ns/m.0j3fv7g,ns/tv.tv_series_episode.next_episode,ns/m.0kbdl55
114983,ns/m.0j3fv7g,ns/tv.tv_series_episode.episode_number,"""1"""
114984,ns/m.0j3fv7g,ns/tv.tv_series_episode.writer,ns/m.0gfrk9x
114985,ns/m.0j3fv7g,ns/tv.tv_series_episode.series,ns/m.0gty91r
114986,ns/m.0j3fv7g,ns/type.object.name,"""Pilot"""
114987,ns/m.0j3fv7g,ns/common.topic.description,"""\""Pilot\"" is the pilot episode of the American television police procedural fantasy drama Awake, which originally aired on the National Broadcasting Company on March 1, 2012. Written by series creator Kyle Killen, \""Pilot\"" earned a Nielsen rating of 2.0, being watched by 6.247 million viewers upon its initial broadcast. Directed by David Slade, it became the highest-rated non-sports program in its respective time slot on NBC in over a year. The episode has generally received positive reviews, with many critics commenting on the episode's unique script, and the cast members, particularly Jason Isaacs' performance as Michael Britten, who they felt effectively embodied the characteristics of the lead role. It was one of eight honorees at the Critics' Choice Television Awards.\nThe pilot introduced the main character, Michael Britten, a detective who works for the Los Angeles Police Department. He is involved in a fatal accident with his family. Michael is conflicted with two parallel realities; in one reality, in which he wears a red wristband, his wife Hannah Britten survived the accident, and in another reality, in which he wears a green wristband, his son Rex Britten survived."""
114988,ns/m.0j3fv7g,ns/tv.tv_series_episode.season_number,"""1"""
114989,ns/m.0j3fv7g,key/wikipedia.en,"""Pilot_$0028Awake$0029"""
891412,ns/m.0j3fv7g,ns/common.topic.notable_types,ns/m.01xrzlb


Unnamed: 0,subject,predicate,object
170202,ns/m.06yznhd,ns/tv.tv_series_episode.writer,ns/m.026c5sz
170203,ns/m.06yznhd,ns/common.topic.description,"""Ted Crisp works at Veridian Technologies in the research and development department and loves his job. However, Ted is taken aback when he learns the company has cryogenically frozen one of the scientists in his department."""
170204,ns/m.06yznhd,ns/tv.tv_series_episode.series,ns/m.04jj3hb
170205,ns/m.06yznhd,ns/tv.tv_series_episode.air_date,"""2009-03-18"""
170206,ns/m.06yznhd,ns/type.object.name,"""Pilot"""
170207,ns/m.06yznhd,ns/tv.tv_series_episode.season_number,"""1"""
170208,ns/m.06yznhd,ns/tv.tv_series_episode.episode_number,"""1"""
170209,ns/m.06yznhd,ns/tv.tv_series_episode.next_episode,ns/m.06yznhq
170210,ns/m.06yznhd,key/wikipedia.en,"""Pilot_$0028Better_Off_Ted$0029"""
170211,ns/m.06yznhd,ns/film.film.initial_release_date,"""2009-03-18"""


Unnamed: 0,subject,predicate,object
175464,ns/m.0dkt96,ns/common.topic.alias,"""Pilot (or The Douchebag Aspect)"""
175465,ns/m.0dkt96,ns/tv.tv_series_episode.air_date,"""1999-09-24"""
175466,ns/m.0dkt96,ns/tv.tv_series_episode.episode_number,"""1"""
175467,ns/m.0dkt96,ns/type.object.name,"""Pilot"""
175468,ns/m.0dkt96,ns/tv.tv_series_episode.season_number,"""1"""
175469,ns/m.0dkt96,ns/common.topic.description,"""Andy's parents stick his nerdy, little brother, Kevin, on him -- which poses quite the threat to Andy's slacker lifestyle. Meanwhile, Kevin is torn between his desire to rat on Andy and his desire to cease being a nerd."""
175470,ns/m.0dkt96,ns/tv.tv_series_episode.series,ns/m.02t_kr
175471,ns/m.0dkt96,ns/tv.tv_series_episode.writer,ns/m.058jkc
625837,ns/m.0dkt96,ns/tv.tv_series_episode.writer,ns/m.02t_8z


Unnamed: 0,subject,predicate,object
226924,ns/m.05syyz4,ns/common.topic.description,"""Indiana government worker Leslie Knope is given the assignment to convert an abandoned quarry pit into a community park. A documentary film crew follows Leslie through her mishaps and gaffes as she tries to make her assignment a reality."""
226925,ns/m.05syyz4,ns/tv.tv_series_episode.season,ns/m.07l9ksm
226926,ns/m.05syyz4,ns/tv.tv_series_episode.series,ns/m.0557yqh
226927,ns/m.05syyz4,ns/tv.tv_series_episode.season_number,"""1"""
226928,ns/m.05syyz4,ns/tv.tv_series_episode.episode_number,"""1"""
226929,ns/m.05syyz4,ns/common.topic.alias,"""Make My Pit a Park"""
226930,ns/m.05syyz4,ns/common.topic.notable_types,ns/m.01xrzlb
226931,ns/m.05syyz4,ns/type.object.name,"""Pilot"""
226932,ns/m.05syyz4,key/wikipedia.en,"""Pilot_$0028Parks_and_Recreation$0029"""
226933,ns/m.05syyz4,ns/tv.tv_series_episode.air_date,"""2009-04-09"""


Unnamed: 0,subject,predicate,object
269606,ns/m.0kxflf7,ns/tv.tv_series_episode.series,ns/m.02bqq6
269607,ns/m.0kxflf7,ns/common.topic.description,"""Rachel Bradley is left by her boyfriend Simon, so she turns to Karen who is happy to provide a shoulder to cry. She is happy to spend some time with a friend, her baby and David, her husband.Meanwhile Adam Williams ends his latest relationship, which ends a sharp slap round the face.Pete is in trouble since his wife Jenny wishes herself a baby. His friend Adam can't help him in this situation.Later on Rachel and Adam run into each other at the supermarket."""
269608,ns/m.0kxflf7,ns/tv.tv_series_episode.season_number,"""0"""
269609,ns/m.0kxflf7,ns/tv.tv_series_episode.episode_number,"""1"""
269610,ns/m.0kxflf7,ns/tv.tv_series_episode.air_date,"""1997-03-30"""
269611,ns/m.0kxflf7,ns/type.object.name,"""Pilot"""


Unnamed: 0,subject,predicate,object
269659,ns/m.07mmh1y,ns/tv.tv_series_episode.air_date,"""1997-03-30"""
269660,ns/m.07mmh1y,ns/type.object.name,"""Pilot"""
269661,ns/m.07mmh1y,ns/common.topic.description,"""Rachel Bradley is left by her boyfriend Simon, so she turns to Karen who is happy to provide a shoulder to cry. She is happy to spend some time with a friend, her baby and David, her husband.Meanwhile Adam Williams ends his latest relationship, which ends a sharp slap round the face.Pete is in trouble since his wife Jenny wishes herself a baby. His friend Adam can't help him in this situation.Later on Rachel and Adam run into each other at the supermarket."""
269662,ns/m.07mmh1y,ns/tv.tv_series_episode.season_number,"""1"""
269663,ns/m.07mmh1y,ns/tv.tv_series_episode.series,ns/m.02bqq6
269664,ns/m.07mmh1y,ns/tv.tv_series_episode.next_episode,ns/m.07mmh2l
269665,ns/m.07mmh1y,ns/tv.tv_series_episode.episode_number,"""0"""


Unnamed: 0,subject,predicate,object
269666,ns/m.03d6l9d,ns/dataworld.gardening_hint.split_to,ns/m.02bqq6
269667,ns/m.03d6l9d,ns/tv.tv_series_episode.episode_number,"""1"""
269668,ns/m.03d6l9d,ns/common.topic.description,"""Cold Feet is a British television pilot directed by Declan Lowney. It stars James Nesbitt and Helen Baxendale as Adam and Rachel, a couple who meet and fall in love, only for the relationship to break down when he gets cold feet. John Thomson, Fay Ripley, Hermione Norris and Robert Bathurst appear in supporting roles. The programme was written by Mike Bullen, a BBC radio producer with little screenwriting experience, who was tasked with creating a one-off television production that would appeal to middle-class television audiences, who the executive producer Andy Harries believed were underepresented on British television.\nAfter filming was completed in 1996 the commissioning network ITV shelved it for a year. It was eventually scheduled for broadcast on the evening of 30 March 1997, as part of the network's Comedy Premieres strand, but overrunning sports coverage delayed it for an hour. Ratings were low and critical reviews were minimal, but positive; critics enjoyed the comedy drama format and praised the writing and performances of the leads."""
269669,ns/m.03d6l9d,ns/common.topic.alias,"""Cold Feet"""
269670,ns/m.03d6l9d,ns/type.object.name,"""Pilot"""
269671,ns/m.03d6l9d,key/wikipedia.en,"""Pilot_$0028Cold_Feet$0029"""
269672,ns/m.03d6l9d,ns/tv.tv_series_episode.air_date,"""1997-03-30"""
891432,ns/m.03d6l9d,ns/common.topic.notable_types,ns/m.01xrzlb
891434,ns/m.03d6l9d,ns/tv.tv_series_episode.series,ns/m.04jhctd


Unnamed: 0,subject,predicate,object
302175,ns/m.0gzb7m,ns/tv.tv_series_episode.season,ns/m.04c__pz
302176,ns/m.0gzb7m,ns/common.topic.description,"""As a child, Max was part of a highly classified genetic enhancement project. She and several other children were trained in a deeply secluded compound to use their superhuman powers. At a young age, Max and several others escaped the compound and made their way in the world. Now a beautiful young bike messenger and thief, Max is trying to find her lost \""\""siblings\""\"" from the project, but at the same time, Max is trying to dodge capture from her former handlers led by Lydecker, and is aided in her quest by Logan Cale, an idealistic cyber-journalist."""
302177,ns/m.0gzb7m,ns/tv.tv_series_episode.air_date,"""2000-10-03"""
302178,ns/m.0gzb7m,ns/tv.tv_series_episode.season_number,"""1"""
302179,ns/m.0gzb7m,ns/type.object.name,"""Pilot"""
302180,ns/m.0gzb7m,ns/tv.tv_series_episode.series,ns/m.0cg1c
302181,ns/m.0gzb7m,ns/tv.tv_series_episode.writer,ns/m.03_gd
302182,ns/m.0gzb7m,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
312871,ns/m.0fqp_h,key/wikipedia.en,"""Pilot_XFiles"""
312872,ns/m.0fqp_h,ns/tv.tv_series_episode.season_number,"""1"""
312873,ns/m.0fqp_h,ns/tv.tv_series_episode.air_date,"""1993-09-10"""
312874,ns/m.0fqp_h,ns/tv.tv_series_episode.episode_number,"""1"""
312875,ns/m.0fqp_h,ns/tv.tv_series_episode.series,ns/m.07g9f
312876,ns/m.0fqp_h,ns/tv.tv_series_episode.next_episode,ns/m.0fqq9w
312877,ns/m.0fqp_h,ns/tv.tv_series_episode.production_number,"""01-101"""
312878,ns/m.0fqp_h,ns/tv.tv_series_episode.season,ns/m.059xkks
312879,ns/m.0fqp_h,ns/type.object.name,"""Pilot"""
312880,ns/m.0fqp_h,ns/tv.tv_series_episode.writer,ns/m.01gp_x


Unnamed: 0,subject,predicate,object
338445,ns/m.0c0293,ns/tv.tv_series_episode.season_number,"""1"""
338446,ns/m.0c0293,ns/type.object.name,"""Pilot"""
338447,ns/m.0c0293,ns/tv.tv_series_episode.air_date,"""1995-09-13"""
338448,ns/m.0c0293,ns/common.topic.description,"""\""Pilot\"" is the first episode and the series premiere of the American sitcom The Drew Carey Show. It first aired on September 13, 1995 on the ABC network in the United States. The premise of the show revolves around the life Drew Carey would have lived if he had not become a stand-up comedian. The pilot introduces the main characters of Drew, Kate, Lewis and Oswald, as well as Drew's workplace, the fictional Winfred-Lauder department store, and enemy Mimi Bobeck.\nThe pilot was written by series co-creators Carey and Bruce Helford, while Michael Lessac directed. It was shot in April 1995 at the Warner Bros. Television studios in Burbank, California. The episode ranked joint 29th in television programs with the most viewers for the week of September 11–17, 1995. Critical response was mixed, with many comparing the show to the NBC sitcom Friends. Ray Richmond from the Los Angeles Daily News praised Carey's performance, but thought the episode did not click, while Variety's Tony Scott liked the opening sequence and Lessac's \""inventive\"" direction."""
338449,ns/m.0c0293,ns/tv.tv_series_episode.production_number,"""457095"""
338450,ns/m.0c0293,ns/tv.tv_series_episode.series,ns/m.0h21w6
338451,ns/m.0c0293,key/wikipedia.en,"""Pilot_$0028The_Drew_Carey_Show$0029"""
338452,ns/m.0c0293,ns/tv.tv_series_episode.writer,ns/m.01tsqg2
338453,ns/m.0c0293,ns/tv.tv_series_episode.episode_number,"""1"""
892621,ns/m.0c0293,ns/common.topic.notable_types,ns/m.01xrzlb


Unnamed: 0,subject,predicate,object
372325,ns/m.02qnz0j,key/wikipedia.en,"""Pilot_$0028Twin_Peaks$0029"""
372326,ns/m.02qnz0j,ns/tv.tv_series_episode.season_number,"""1"""
372327,ns/m.02qnz0j,ns/film.film.directed_by,ns/m.026dx
372328,ns/m.02qnz0j,ns/tv.tv_series_episode.episode_number,"""1"""
372329,ns/m.02qnz0j,ns/tv.tv_series_episode.air_date,"""1990-04-08"""
372330,ns/m.02qnz0j,ns/type.object.name,"""Pilot"""
372331,ns/m.02qnz0j,ns/common.topic.alias,"""Twin Peaks: Pilot"""
372332,ns/m.02qnz0j,ns/common.topic.description,"""The pilot episode, also known as Northwest Passage, of the mystery television series Twin Peaks premiered on the ABC Network on Sunday, April 8, 1990. It was written by series creators Mark Frost and David Lynch, and directed by Lynch. The pilot follows the characters of Dale Cooper and Harry S. Truman as they investigate the death of popular high school student Laura Palmer; Cooper believes the murder has connections to a murder case that occurred a year earlier. In addition to setting the tone for the show, the episode sets up several character and story arcs and marked the appearance of several recurring characters. The episode received a strong Nielsen household rating compared to other season one episodes and was well received by fans and critics alike. The original title for the series was Northwest Passage, but this was later changed."""
372333,ns/m.02qnz0j,ns/tv.tv_series_episode.director,ns/m.026dx
372334,ns/m.02qnz0j,ns/tv.tv_series_episode.writer,ns/m.06lv4f


Unnamed: 0,subject,predicate,object
408151,ns/m.011nsb7_,ns/tv.tv_series_episode.series,ns/m.010lwj01
408152,ns/m.011nsb7_,ns/tv.tv_series_episode.episode_number,"""1"""
408153,ns/m.011nsb7_,ns/tv.tv_series_episode.season_number,"""1"""
408154,ns/m.011nsb7_,ns/type.object.name,"""Pilot"""
408155,ns/m.011nsb7_,ns/common.topic.description,"""A deadly subway collision is investigated as a New York City medical examiner attempts to discover the truth behind his inexplicable immortality."""
408156,ns/m.011nsb7_,ns/tv.tv_series_episode.air_date,"""2014-09-22"""


Unnamed: 0,subject,predicate,object
419857,ns/m.0c3vwgr,key/wikipedia.en,"""Pilot_$0028Friday_Night_Lights$0029"""
419858,ns/m.0c3vwgr,ns/tv.tv_series_episode.writer,ns/m.064jjy
419859,ns/m.0c3vwgr,ns/tv.tv_series_episode.series,ns/m.0dsx3f
419860,ns/m.0c3vwgr,ns/type.object.name,"""Pilot"""
419861,ns/m.0c3vwgr,ns/tv.tv_series_episode.director,ns/m.064jjy
419862,ns/m.0c3vwgr,ns/tv.tv_series_episode.season,ns/m.02q4r1j
419863,ns/m.0c3vwgr,ns/tv.tv_series_episode.episode_number,"""1"""
419864,ns/m.0c3vwgr,ns/common.topic.description,"""A newcomer coach Eric Taylor finds himself helming a promising high school football team, the Dillon Panthers, that has a real chance to make it to the top."""
419865,ns/m.0c3vwgr,ns/tv.tv_series_episode.next_episode,ns/m.06ws66m
419866,ns/m.0c3vwgr,ns/tv.tv_series_episode.season_number,"""1"""


Unnamed: 0,subject,predicate,object
432299,ns/m.02nw2nc,ns/common.topic.description,"""Forty years ago, Harriet Vanger disappeared from a family gathering on the island owned and inhabited by the powerful Vanger clan. Her body was never found, yet her beloved uncle is convinced it was murder and that the killer is a member of his own tightly knit but dysfunctional family. He employs disgraced financial journalist Mikael Blomkvist and the tattooed, but troubled computer hacker Lisbeth Salander to investigate."""
432300,ns/m.02nw2nc,key/wikipedia.en,"""Pilot_$0028Millennium$0029"""
432301,ns/m.02nw2nc,ns/tv.tv_series_episode.episode_number,"""1"""
432302,ns/m.02nw2nc,ns/tv.tv_series_episode.next_episode,ns/m.02nw2gv
432303,ns/m.02nw2nc,ns/type.object.name,"""Pilot"""
432304,ns/m.02nw2nc,ns/common.topic.alias,"""The Frenchman"""
432305,ns/m.02nw2nc,ns/tv.tv_series_episode.season,ns/m.02nw1ss
432306,ns/m.02nw2nc,ns/tv.tv_series_episode.air_date,"""1996-10-25"""
432307,ns/m.02nw2nc,ns/tv.tv_series_episode.writer,ns/m.01gp_x
432308,ns/m.02nw2nc,ns/tv.tv_series_episode.series,ns/m.036qs6


Unnamed: 0,subject,predicate,object
458549,ns/m.0dpg5_,key/wikipedia.en,"""Pilot_$0028Cosby_Show$0029"""
458550,ns/m.0dpg5_,ns/common.topic.notable_types,ns/m.01xrzlb
458551,ns/m.0dpg5_,ns/common.topic.description,"""Clair is furious when Theo brings home a report card with 4 D's. He tells Cliff that he has no intention of going to college, as he feels that he is destined to be a \""\""regular person.\""\"" Cliff uses Monopoly money to teach him about the economic realities that many blue-collar workers must face. He assures Theo that he only wants him to try his best. Cliff is alarmed to meet Denise's latest beau, a former merchant marine who once spent time in a Turkish prison."""
458552,ns/m.0dpg5_,ns/type.object.name,"""Pilot"""
458553,ns/m.0dpg5_,ns/tv.tv_series_episode.air_date,"""1984-09-20"""
458554,ns/m.0dpg5_,ns/tv.tv_series_episode.director,ns/m.0d560m
458555,ns/m.0dpg5_,ns/tv.tv_series_episode.next_episode,ns/m.06ypdl6
458556,ns/m.0dpg5_,ns/tv.tv_series_episode.series,ns/m.016tvq
458557,ns/m.0dpg5_,ns/tv.tv_series_episode.season_number,"""1"""
458558,ns/m.0dpg5_,ns/tv.tv_series_episode.season,ns/m.05drtcb


Unnamed: 0,subject,predicate,object
619373,ns/m.0krtwc0,ns/tv.tv_series_episode.air_date,"""1991-12-07"""
619374,ns/m.0krtwc0,ns/tv.tv_series_episode.season_number,"""0"""
619375,ns/m.0krtwc0,ns/type.object.name,"""Pilot"""
619376,ns/m.0krtwc0,ns/tv.tv_series_episode.episode_number,"""1"""
619377,ns/m.0krtwc0,ns/tv.tv_series_episode.season,ns/m.0795xps
619378,ns/m.0krtwc0,ns/tv.tv_series_episode.series,ns/m.04r27q


Unnamed: 0,subject,predicate,object
619384,ns/m.0795xq1,ns/tv.tv_series_episode.air_date,"""1991-07-12"""
619385,ns/m.0795xq1,ns/tv.tv_series_episode.episode_number,"""0"""
619386,ns/m.0795xq1,ns/type.object.name,"""Pilot"""
619387,ns/m.0795xq1,ns/tv.tv_series_episode.season_number,"""0"""
619388,ns/m.0795xq1,ns/tv.tv_series_episode.series,ns/m.04r27q
619389,ns/m.0795xq1,ns/tv.tv_series_episode.season,ns/m.0795xps


Unnamed: 0,subject,predicate,object
684847,ns/m.0gj8cyj,ns/tv.tv_series_episode.next_episode,ns/m.0ghmykz
684848,ns/m.0gj8cyj,key/wikipedia.en,"""Body_of_Proof$002Fpilot"""
684849,ns/m.0gj8cyj,ns/type.object.name,"""Pilot"""
684850,ns/m.0gj8cyj,ns/tv.tv_series_episode.air_date,"""2011-03-29"""
684851,ns/m.0gj8cyj,ns/common.topic.description,"""A female jogger is found floating in the Schuylkill River. Dr. Megan Hunt teams up with her partner, medical investigator and former cop Peter Dunlop, under the watchful eye of their boss, Chief Medical Examiner Dr. Kate Murphey, who is very aware of Megan’s gifted but polarizing work style. Megan’s colleagues, Dr. Ethan Gross and Deputy Chief Medical Examiner Dr. Curtis Brumfield, must find their own way to work alongside the new medical examiner. Megan’s investigation takes her out in the field where she meets old-school cop Detective Bud Morris, who is exasperated by Megan’s tenacity and bull-headedness. But his partner, Detective Samantha Baker, possesses a quiet respect for Megan’s input, even if Morris doesn’t. Meanwhile, as she’s is trying to solve the female jogger’s death, Megan has to solve a personal puzzle of her own -- what is the perfect present for her daughter’s 12th birthday?"""
684852,ns/m.0gj8cyj,ns/tv.tv_series_episode.season_number,"""1"""
684853,ns/m.0gj8cyj,ns/tv.tv_series_episode.season,ns/m.0ggw5qg
684854,ns/m.0gj8cyj,ns/tv.tv_series_episode.series,ns/m.0by0t44
684855,ns/m.0gj8cyj,ns/tv.tv_series_episode.writer,ns/m.0bl4zxp
684856,ns/m.0gj8cyj,ns/common.topic.notable_types,ns/m.01xrzlb


Unnamed: 0,subject,predicate,object
703927,ns/m.05dqwkp,ns/tv.tv_series_episode.series,ns/m.088rf0
703928,ns/m.05dqwkp,ns/tv.tv_series_episode.episode_number,"""1"""
703929,ns/m.05dqwkp,ns/type.object.name,"""Pilot"""
703930,ns/m.05dqwkp,ns/tv.tv_series_episode.air_date,"""2005-09-22"""
703931,ns/m.05dqwkp,ns/common.topic.description,"""Main Plot:Denise aids an old boyfriend in finding a new love and Clea confesses some not so good news about her love life."""
703932,ns/m.05dqwkp,ns/tv.tv_series_episode.season_number,"""1"""
703933,ns/m.05dqwkp,ns/tv.tv_series_episode.next_episode,ns/m.05dqwks


Unnamed: 0,subject,predicate,object
766014,ns/m.05dpzp8,ns/tv.tv_series_episode.writer,ns/m.07fvf1
766015,ns/m.05dpzp8,ns/tv.tv_series_episode.series,ns/m.04sflc
766016,ns/m.05dpzp8,ns/type.object.name,"""Pilot"""
766017,ns/m.05dpzp8,ns/tv.tv_series_episode.air_date,"""2004-09-24"""
766018,ns/m.05dpzp8,ns/tv.tv_series_episode.season_number,"""1"""
766019,ns/m.05dpzp8,ns/tv.tv_series_episode.episode_number,"""1"""
766020,ns/m.05dpzp8,ns/common.topic.description,"""When the 23rd helper quits from the Savages' house, burning their laundry in the process, all the boys want a new one. Nick says no telling the boys they need to be more responsible and clean up after themselves. To sway him, the boys make the house more disorderly than ever thanks to Jack. Meanwhile, Sam is thinking to ask Angela, his neighbour, to go to the school dance with him. The two stories join when Nick has Angela visit the house. Sam quickly rallies everyone to clean up to help him win the girl."""


Unnamed: 0,subject,predicate,object
766041,ns/m.05dtp0f,ns/common.topic.description,"""In the pilot, Bob hires a nanny for the kids who just so happens to be the woman he stood up at the prom. Meanwhile, Faith doesn't have a date for the homecoming dance."""
766042,ns/m.05dtp0f,ns/tv.tv_series_episode.series,ns/m.01qvyy
766043,ns/m.05dtp0f,ns/tv.tv_series_episode.season_number,"""1"""
766044,ns/m.05dtp0f,ns/tv.tv_series_episode.next_episode,ns/m.05dtp0l
766045,ns/m.05dtp0f,ns/tv.tv_series_episode.episode_number,"""1"""
766046,ns/m.05dtp0f,ns/type.object.name,"""Pilot"""
766047,ns/m.05dtp0f,ns/tv.tv_series_episode.production_number,"""1AGV79"""
766048,ns/m.05dtp0f,ns/tv.tv_series_episode.air_date,"""2003-03-30"""
766049,ns/m.05dtp0f,ns/tv.tv_series_episode.writer,ns/m.07fvf1


Unnamed: 0,subject,predicate,object
777465,ns/m.0mwpj6b,ns/tv.tv_series_episode.season_number,"""0"""
777466,ns/m.0mwpj6b,ns/tv.tv_series_episode.air_date,"""2010-08-16"""
777467,ns/m.0mwpj6b,ns/tv.tv_series_episode.episode_number,"""2"""
777468,ns/m.0mwpj6b,ns/type.object.name,"""Pilot"""
777469,ns/m.0mwpj6b,ns/common.topic.description,"""Destiny can't cope with her new puppies, and a newly-single Vince takes Nelson for a night out. Marion is surgically castrated by Chico from the X-Factor."""
777470,ns/m.0mwpj6b,ns/tv.tv_series_episode.series,ns/m.0cc89g1


Unnamed: 0,subject,predicate,object
891225,ns/m.0260p1z,ns/tv.tv_series_episode.series,ns/m.0d68qy
891226,ns/m.0260p1z,ns/common.topic.description,"""Liz Lemon is the head writer on a demanding, live TV program in New York City. However, things begin to get complicated when her new boss insists that a wild and unpredictable movie star joins the cast."""
891227,ns/m.0260p1z,ns/tv.tv_series_episode.writer,ns/m.0pz7h
891228,ns/m.0260p1z,ns/type.object.name,"""Pilot"""
891229,ns/m.0260p1z,ns/tv.tv_series_episode.episode_number,"""1"""
891230,ns/m.0260p1z,ns/tv.tv_series_episode.season_number,"""1"""
891231,ns/m.0260p1z,ns/common.topic.notable_types,ns/m.01xrzlb
891232,ns/m.0260p1z,ns/tv.tv_series_episode.director,ns/m.02771wd
891233,ns/m.0260p1z,ns/tv.tv_series_episode.season,ns/m.03y04y0
891234,ns/m.0260p1z,key/wikipedia.en,"""Pilot_$002830_Rock$0029"""


Unnamed: 0,subject,predicate,object
891269,ns/m.0x0n5mq,ns/tv.tv_series_episode.writer,ns/m.01vz80y
891270,ns/m.0x0n5mq,ns/tv.tv_series_episode.season_number,"""1"""
891271,ns/m.0x0n5mq,ns/tv.tv_series_episode.director,ns/m.01vz80y
891272,ns/m.0x0n5mq,ns/film.film.produced_by,ns/m.0brlct
891273,ns/m.0x0n5mq,ns/tv.tv_series_episode.air_date,"""2013-09-24"""
891274,ns/m.0x0n5mq,key/wikipedia.en,"""Pilot_$0028Agents_of_S$002EH$002EI$002EE$002EL$002ED$002E$0029"""
891275,ns/m.0x0n5mq,ns/common.topic.description,"""It’s just after the battle of New York, and now that the existence of super heroes and aliens has become public knowledge, the world is trying to come to grips with this new reality. Agent Phil Coulson is back in action and has his eye on a mysterious group called the Rising Tide. In order to track this unseen, unknown enemy, he has assembled a small, highly select group of Agents from the worldwide law-enforcement organization known as S.H.I.E.L.D. (Strategic Homeland Intervention Enforcement and Logistics Division). The group’s first assignment together as a team finds them trying to track down an ordinary man who has gained extraordinary powers. Powers that could have devastating consequences."""
891276,ns/m.0x0n5mq,ns/tv.tv_series_episode.next_episode,ns/m.0x0n5qg
891277,ns/m.0x0n5mq,ns/common.topic.notable_types,ns/m.01xrzlb
891278,ns/m.0x0n5mq,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
891474,ns/m.0808gzp,ns/tv.tv_series_episode.next_episode,ns/m.06gsz40
891475,ns/m.0808gzp,ns/tv.tv_series_episode.production_number,"""101"""
891476,ns/m.0808gzp,ns/tv.tv_series_episode.series,ns/m.05pbsry
891477,ns/m.0808gzp,ns/tv.tv_series_episode.season,ns/m.06gsz2m
891478,ns/m.0808gzp,ns/tv.tv_series_episode.director,ns/m.0dky65
891479,ns/m.0808gzp,ns/common.topic.description,"""Jeff Winger, a lawyer whose degree has been revoked, finds himself at Greendale Community College where he forms a study group consisting of Pierce, Britta, a 28-year old drop out; Shirley, a middle-aged and recently divorced woman; Abed, a pop-culture junkie; Annie, a perfectionist; Troy, a former high school football hero; and Senor Chang, the Spanish professor."""
891480,ns/m.0808gzp,ns/tv.tv_series_episode.season_number,"""1"""
891481,ns/m.0808gzp,ns/tv.tv_series_episode.writer,ns/m.0cl9kh
891482,ns/m.0808gzp,ns/common.topic.notable_types,ns/m.01xrzlb
891483,ns/m.0808gzp,ns/type.object.name,"""Pilot"""


Unnamed: 0,subject,predicate,object
891560,ns/m.090ldx,ns/common.topic.description,"""Looking down on her friends and family isn't a way of life for Mary Alice Young. It's a way of death. One day, in her perfect house, in the loveliest of suburbs, Mary Alice ended it all. Now she's taking us into the lives of her family, friends and neighbors, commenting from her elevated POV. Her husband's acting suspicious, the neighbors are talking, and her girlfriends are wondering why one of their own would do something so rash… and so messy."""
891561,ns/m.090ldx,ns/common.topic.notable_types,ns/m.01xrzlb
891562,ns/m.090ldx,key/wikipedia.en,"""Pilot_$0028Desperate_Housewives$0029"""
891563,ns/m.090ldx,ns/tv.tv_series_episode.episode_number,"""1"""
891564,ns/m.090ldx,ns/type.object.name,"""Pilot"""
891565,ns/m.090ldx,ns/tv.tv_series_episode.next_episode,ns/m.093ch5
891566,ns/m.090ldx,ns/tv.tv_series_episode.season_number,"""1"""
891567,ns/m.090ldx,ns/tv.tv_series_episode.series,ns/m.03ln8b
891568,ns/m.090ldx,ns/tv.tv_series_episode.writer,ns/m.04pg29
891569,ns/m.090ldx,ns/tv.tv_series_episode.air_date,"""2004-10-03"""


Unnamed: 0,subject,predicate,object
891573,ns/m.0w5zm5g,ns/tv.tv_series_episode.season_number,"""1"""
891574,ns/m.0w5zm5g,ns/tv.tv_series_episode.director,ns/m.083_jq
891575,ns/m.0w5zm5g,ns/tv.tv_series_episode.series,ns/m.0j42tf5
891576,ns/m.0w5zm5g,ns/tv.tv_series_episode.air_date,"""2013-06-23"""
891577,ns/m.0w5zm5g,ns/type.object.name,"""Pilot"""
891578,ns/m.0w5zm5g,ns/common.topic.description,"""\""Pilot\"" is the series premiere to the Lifetime series Devious Maids. The pilot had been ordered by ABC on January 31, 2012 and cast during the following two months. Filming began in March. ABC declined to pick up the pilot on May 14, but Lifetime did so on June 22, ordering 13 episodes. Although most of the cast had been selected by this time three additional regular supporting characters were added in November 2012 for inclusion in the pilot and the continuing series. The series was once proposed to be a spinoff of Desperate Housewives, but is not one.\nThe episode revolves around the murder of a Latina maid in Beverly Hills and the introduction of her cadre of associates who are also Latina maids. The maids are shown in their employment surroundings with their upper class employers who play supporting roles. The main character is not actually a maid but rather the mother of the primary murder subject who poses as a maid to gain entrance into the world where she might find clues to prove her son's innocence.\nThe pilot episode was released online in both Spanish and English on June 9, 2013, before its television debut on June 23."""
891579,ns/m.0w5zm5g,ns/tv.tv_series_episode.writer,ns/m.04pg29
891580,ns/m.0w5zm5g,key/wikipedia.en,"""Pilot_$0028Devious_Maids$0029"""
891581,ns/m.0w5zm5g,ns/common.topic.notable_types,ns/m.01xrzlb


Unnamed: 0,subject,predicate,object
891775,ns/m.07kf5_9,ns/tv.tv_series_episode.air_date,"""2009-05-19"""
891776,ns/m.07kf5_9,ns/common.topic.description,"""\""Pilot\"" is the pilot episode of the American television series Glee, which premiered on the Fox network on May 19, 2009. An extended director's cut version aired on September 2, 2009. The show focuses on a high school show choir, also known as a glee club, set within the fictional William McKinley High School in Lima, Ohio. The pilot episode covers the formation of the club and introduces the main characters. The episode was directed by series creator Ryan Murphy, and written by Murphy, Brad Falchuk and Ian Brennan. Murphy selected the music featured in the episode, with the intention of maintaining a balance between showtunes and chart hits.\nThe episode achieved 9.619 million viewers on first broadcast, and 4.2 million when the director's cut version aired. Critical response was mixed, with The New York Times‍‍ '​‍s Alessandra Stanley highlighting the episode's unoriginality and stereotyped characters, but praising the showmanship and talent of the cast."""
891777,ns/m.07kf5_9,ns/tv.tv_series_episode.director,ns/m.07g7h2
891778,ns/m.07kf5_9,ns/tv.tv_series_episode.season_number,"""1"""
891779,ns/m.07kf5_9,ns/tv.tv_series_episode.writer,ns/m.07g7h2
891780,ns/m.07kf5_9,ns/tv.tv_series_episode.series,ns/m.05f4vxd
891781,ns/m.07kf5_9,ns/tv.tv_series_episode.production_number,"""1ARC79"""
891782,ns/m.07kf5_9,ns/tv.tv_series_episode.next_episode,ns/m.06gsykt
891783,ns/m.07kf5_9,key/wikipedia.en,"""Pilot_$0028Glee$0029"""
891784,ns/m.07kf5_9,ns/common.topic.notable_types,ns/m.01xrzlb


Unnamed: 0,subject,predicate,object
891834,ns/m.0ddh9q1,ns/tv.tv_series_episode.writer,ns/m.09pl3f
891835,ns/m.0ddh9q1,ns/tv.tv_series_episode.season_number,"""1"""
891836,ns/m.0ddh9q1,ns/common.topic.description,"""Steve McGarrett is summoned home to the island of Honolulu to bury his father after he is murdered by a madman. Governor Jameson wants Steve to head up a new elite police unit and she will give him full authority to get the job done. Initially he turns down the job, but when he meets the man in charge of the unit, he changes his mind."""
891837,ns/m.0ddh9q1,ns/tv.tv_series_episode.season,ns/m.0ggp6kc
891838,ns/m.0ddh9q1,ns/type.object.name,"""Pilot"""
891839,ns/m.0ddh9q1,key/wikipedia.en,"""Pilot_$0028Hawaii_Five-0$0029"""
891840,ns/m.0ddh9q1,ns/tv.tv_series_episode.air_date,"""2010-09-20"""
891841,ns/m.0ddh9q1,ns/tv.tv_series_episode.series,ns/m.0c00rg9
891842,ns/m.0ddh9q1,ns/tv.tv_series_episode.next_episode,ns/m.0gh057s
891843,ns/m.0ddh9q1,ns/tv.tv_series_episode.director,ns/m.03m816


Unnamed: 0,subject,predicate,object
891919,ns/m.0269nfn,ns/common.topic.alias,"""House M.D. Pilot"""
891920,ns/m.0269nfn,ns/tv.tv_series_episode.writer,ns/m.08xwck
891921,ns/m.0269nfn,ns/tv.tv_series_episode.next_episode,ns/m.026d14h
891922,ns/m.0269nfn,ns/tv.tv_series_episode.episode_number,"""1"""
891923,ns/m.0269nfn,ns/common.topic.notable_types,ns/m.01xrzlb
891924,ns/m.0269nfn,ns/tv.tv_series_episode.season_number,"""1"""
891925,ns/m.0269nfn,ns/tv.tv_series_episode.season,ns/m.063znr6
891926,ns/m.0269nfn,ns/common.topic.description,"""Young kindergarten teacher Rebecca Adler collapses in her classroom after uncontrolled gibberish slips out of her mouth while she is about to teach students."""
891927,ns/m.0269nfn,ns/tv.tv_series_episode.director,ns/m.07nznf
891928,ns/m.0269nfn,ns/tv.tv_series_episode.air_date,"""2004-11-16"""


Unnamed: 0,subject,predicate,object
891978,ns/m.0bzd9w,ns/type.object.name,"""Pilot"""
891979,ns/m.0bzd9w,ns/tv.tv_series_episode.writer,ns/m.0697kh
891980,ns/m.0bzd9w,ns/dataworld.gardening_hint.split_to,ns/m.06ypcbn
891981,ns/m.0bzd9w,ns/common.topic.notable_types,ns/m.01xrzlb
891982,ns/m.0bzd9w,ns/tv.tv_series_episode.production_number,"""100"""
891983,ns/m.0bzd9w,ns/common.topic.description,"""Stripped of everything, the 48 survivors scavenge what they can from the plane for their survival. Some panic. Some pin their hopes on rescue. A few find inner strength they never knew they had-like Kate who, with no medical training, suddenly finds herself suturing the doctor's wounds. The band of friends, family, enemies and strangers must work together against the cruel weather and harsh terrain. But the intense howls of mysterious creatures stalking the jungle fill them all with fear. Fortunately, thanks to the calm leadership of quick-thinking Jack and level-headed Kate, they have hope. But even heroes have secrets, as the survivors will come to learn."""
891984,ns/m.0bzd9w,ns/common.topic.alias,"""Pilot (1)"""
891985,ns/m.0bzd9w,ns/tv.tv_series_episode.series,ns/m.0828jw
891986,ns/m.0bzd9w,ns/tv.tv_series_episode.episode_number,"""1"""
891987,ns/m.0bzd9w,key/wikipedia.en,"""Pilot_$0028Lost$0029"""


Unnamed: 0,subject,predicate,object
892101,ns/m.07k8c34,ns/common.topic.description,"""To help capture a serial rapist-turned-killer, FBI Special Agent Don Eppes recruits his genius brother Charlie, who uses a mathematical equation to identify the killer's point of origin by working back from the crime scene locations."""
892102,ns/m.07k8c34,ns/tv.tv_series_episode.series,ns/m.04vrrd
892103,ns/m.07k8c34,ns/tv.tv_series_episode.air_date,"""2005-01-23"""
892104,ns/m.07k8c34,ns/common.topic.notable_types,ns/m.01xrzlb
892105,ns/m.07k8c34,ns/tv.tv_series_episode.season_number,"""1"""
892106,ns/m.07k8c34,ns/tv.tv_series_episode.director,ns/m.02plb56
892107,ns/m.07k8c34,key/wikipedia.en,"""Pilot_$0028Numbers$0029"""
892108,ns/m.07k8c34,ns/type.object.name,"""Pilot"""
892109,ns/m.07k8c34,ns/tv.tv_series_episode.next_episode,ns/m.06xm0br
892110,ns/m.07k8c34,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
892134,ns/m.0hgpv3v,ns/tv.tv_series_episode.writer,ns/m.0d7hg4
892135,ns/m.0hgpv3v,ns/common.topic.description,"""Emma does not believe in Henry's stories, and she brings him back to Storybrooke, where she is captivated by an unusual boy. Concerned for the boy, she decides to stay for a while, but soon discovers that Storybrooke is more than a simple town. It's a place where magic has been forgotten, but it's still there, where fairytale characters come alive, even though if they do not remember who they once were, and the Evil Queen, is now Henry's foster mother. An epic battle is beginning, and for the good side to win, Emma will have to accept her destiny and fight for it."""
892136,ns/m.0hgpv3v,ns/tv.tv_series_episode.next_episode,ns/m.0hgp3n8
892137,ns/m.0hgpv3v,ns/common.topic.notable_types,ns/m.01xrzlb
892138,ns/m.0hgpv3v,ns/tv.tv_series_episode.air_date,"""2011-10-23"""
892139,ns/m.0hgpv3v,ns/tv.tv_series_episode.series,ns/m.0cj6jv
892140,ns/m.0hgpv3v,ns/tv.tv_series_episode.season,ns/m.0k74svp
892141,ns/m.0hgpv3v,ns/tv.tv_series_episode.season_number,"""1"""
892142,ns/m.0hgpv3v,ns/type.object.name,"""Pilot"""
892143,ns/m.0hgpv3v,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
892250,ns/m.05q88m9,ns/tv.tv_series_episode.air_date,"""2009-04-19"""
892251,ns/m.05q88m9,ns/tv.tv_series_episode.season_number,"""1"""
892252,ns/m.05q88m9,key/wikipedia.en,"""Pilot_$0028Sit_Down$002C_Shut_Up$0029"""
892253,ns/m.05q88m9,ns/type.object.name,"""Pilot"""
892254,ns/m.05q88m9,ns/tv.tv_series_episode.writer,ns/m.06brp0
892255,ns/m.05q88m9,ns/tv.tv_series_episode.season,ns/m.07mq_vs
892256,ns/m.05q88m9,ns/common.topic.description,"""Knob Haven High School is in financial trouble, and Acting Principal Sue Sezno must win the upcoming football game to gain alumni donations or face firing a faculty member. Meanwhile, a scandal erupts when pills confiscated from a student’s locker becomes a part of Vice Principal Stuart Proszakian’s daily diet."""
892257,ns/m.05q88m9,ns/tv.tv_series_episode.series,ns/m.02pjh3f
892258,ns/m.05q88m9,ns/tv.tv_series_episode.next_episode,ns/m.05q6kqj
892259,ns/m.05q88m9,ns/tv.tv_series_episode.director,ns/m.08tx_3


Unnamed: 0,subject,predicate,object
892360,ns/m.0gjdv5,key/wikipedia.en,"""Pilot_$0028Smallville$0029"""
892361,ns/m.0gjdv5,ns/tv.tv_series_episode.season,ns/m.087sz_
892362,ns/m.0gjdv5,ns/tv.tv_series_episode.series,ns/m.03g9xj
892363,ns/m.0gjdv5,ns/common.topic.notable_types,ns/m.01xrzlb
892364,ns/m.0gjdv5,ns/tv.tv_series_episode.season_number,"""1"""
892365,ns/m.0gjdv5,ns/type.object.name,"""Pilot"""
892366,ns/m.0gjdv5,ns/tv.tv_series_episode.air_date,"""2001-10-16"""
892367,ns/m.0gjdv5,ns/common.topic.description,"""The first episode tells the story of the meteor shower that hit Smallville and changed life in the Kansas town forever. Clark Kent meets Lex Luthor for the first time and encounters the first in a long line of humans mutated by the strange green meteor rocks that accompanied him on his journey to Earth."""
892368,ns/m.0gjdv5,ns/tv.tv_series_episode.director,ns/m.025y9fn
892369,ns/m.0gjdv5,ns/tv.tv_series_episode.writer,ns/m.02x_ck


Unnamed: 0,subject,predicate,object
892383,ns/m.0kb0_g3,ns/tv.tv_series_episode.episode_number,"""1"""
892384,ns/m.0kb0_g3,ns/tv.tv_series_episode.air_date,"""2012-02-06"""
892385,ns/m.0kb0_g3,ns/tv.tv_series_episode.next_episode,ns/m.0kb0_h2
892386,ns/m.0kb0_g3,ns/type.object.name,"""Pilot"""
892387,ns/m.0kb0_g3,ns/tv.tv_series_episode.director,ns/m.02vmrmt
892388,ns/m.0kb0_g3,ns/tv.tv_series_episode.season_number,"""1"""
892389,ns/m.0kb0_g3,ns/common.topic.description,"""In the pilot of Smash we are introduced to Tom Levitt and Julia Houstona song-writing duo who had planned on taking a break from the business of Broadway but, find themselves drawn back in when they get the desire to produce a musical based around the life of Marilyn Monroe. When a demo of one of the tracks goes viral the project attracts the eye and ear of tenacious producer Eileen Rand and, although causing a conflict that of director Derek Wills. But the biggest and most important task is to find their Marilyn after, auditioning a range of girls they whittle it down to two but, who do they go for. Ivy Lynn a veteran of the stage who wants to break away from the chorus line to get her break as a lead or Karen Cartwright a fresh talent who is trying to follow her dream of becoming a star."""
892390,ns/m.0kb0_g3,ns/tv.tv_series_episode.series,ns/m.0gfh6fs
892391,ns/m.0kb0_g3,ns/tv.tv_series_episode.season,ns/m.0kb0_dn
892392,ns/m.0kb0_g3,ns/common.topic.notable_types,ns/m.01xrzlb


Unnamed: 0,subject,predicate,object
892467,ns/m.01xwlgy,ns/tv.tv_series_episode.air_date,"""1998-09-22"""
892468,ns/m.01xwlgy,ns/tv.tv_series_episode.season,ns/m.01xwlsk
892469,ns/m.01xwlgy,ns/type.object.name,"""Pilot"""
892470,ns/m.01xwlgy,ns/tv.tv_series_episode.director,ns/m.03c6vl
892471,ns/m.01xwlgy,ns/tv.tv_series_episode.episode_number,"""1"""
892472,ns/m.01xwlgy,ns/common.topic.description,"""Casey begins to slow down at the job, due to his impending divorce. The staff tries to get him out of his haze and J.J. and the network brass threaten to fire him. At the same time Casey himself thinks about leaving until he witnesses that he'd only do this for the wrong reasons.\nDana hires a new associate producer, neurotic, but brilliant Jeremy Goodwin. Dan has a New York Renaissance...."""
892473,ns/m.01xwlgy,ns/tv.tv_series_episode.season_number,"""1"""
892474,ns/m.01xwlgy,ns/tv.tv_series_episode.series,ns/m.01j46y
892475,ns/m.01xwlgy,ns/tv.tv_series_episode.next_episode,ns/m.01xwlh4
892476,ns/m.01xwlgy,ns/tv.tv_series_episode.writer,ns/m.01d8yn


Unnamed: 0,subject,predicate,object
892502,ns/m.025z3j0,ns/tv.tv_series_episode.writer,ns/m.0ksc72
892503,ns/m.025z3j0,ns/tv.tv_series_episode.season_number,"""1"""
892504,ns/m.025z3j0,ns/common.topic.description,"""Sam is about to graduate from college and has an interview set up to join one of the most prestigious law schools in the country. His brother Dean, whom he has not seen since he went to college, shows up in the middle of the night and tells him their father is missing while on a hunting trip. Leaving his girlfriend behind to find their dad, Sam joins Dean in an effort to find their father in a little town called Jericho, where unmarried men disappear without a trace."""
892505,ns/m.025z3j0,ns/tv.tv_series_episode.episode_number,"""1"""
892506,ns/m.025z3j0,ns/tv.tv_series_episode.director,ns/m.025y9fn
892507,ns/m.025z3j0,ns/common.topic.notable_types,ns/m.01xrzlb
892508,ns/m.025z3j0,ns/tv.tv_series_episode.production_number,"""475285"""
892509,ns/m.025z3j0,key/wikipedia.en,"""Pilot_$0028Supernatural$0029"""
892510,ns/m.025z3j0,ns/common.topic.alias,"""Unaired Pilot"""
892511,ns/m.025z3j0,ns/tv.tv_series_episode.next_episode,ns/m.02621rq


Unnamed: 0,subject,predicate,object
892674,ns/m.0bjtfn,ns/tv.tv_series_episode.air_date,"""2005-03-24"""
892675,ns/m.0bjtfn,ns/common.topic.description,"""A documentary crew arrives at the offices of Dunder Mifflin to observe the employees and learn about modern management. Manager Michael Scott tries to paint a happy picture, while sales rep Jim fights with his nemesis Dwight and flirts with receptionist Pam."""
892676,ns/m.0bjtfn,ns/tv.tv_series_episode.writer,ns/m.048wrb
892677,ns/m.0bjtfn,ns/common.topic.notable_types,ns/m.01xrzlb
892678,ns/m.0bjtfn,ns/tv.tv_series_episode.director,ns/m.05fmg7
892679,ns/m.0bjtfn,ns/type.object.name,"""Pilot"""
892680,ns/m.0bjtfn,ns/tv.tv_series_episode.episode_number,"""1"""
892681,ns/m.0bjtfn,key/wikipedia.en,"""Pilot_$0028The_Office$0029"""
892682,ns/m.0bjtfn,ns/tv.tv_series_episode.season,ns/m.03wcyg0
892683,ns/m.0bjtfn,ns/tv.tv_series_episode.next_episode,ns/m.0bkv0w


Unnamed: 0,subject,predicate,object
892745,ns/m.0hnb4n5,ns/tv.tv_series_episode.writer,ns/m.0bxzlqx
892746,ns/m.0hnb4n5,ns/tv.tv_series_episode.series,ns/m.0gtw31d
892747,ns/m.0hnb4n5,ns/tv.tv_series_episode.season_number,"""1"""
892748,ns/m.0hnb4n5,ns/common.topic.description,"""The pilot episode of the American historical fiction television series The Playboy Club premiered on September 19, 2011 in the United States on NBC. It was directed by Alan Taylor and written by Chad Hodge and Becky Mode. In this episode, Maureen, a newly hired Playboy bunny, gets involved in the murder of mob boss Bruno Bianchi. Nick Dalton, one of Chicago's top attorneys and Club key-holder, comes to her aid; his girlfriend Carol-Lynne makes an ambitious move and becomes the first Bunny Mother. Meanwhile, Bunnies Janie, Alice and Brenda each deal with their own personal issues and secrets while the club's general manager Billy Rosen tries his best to keep the club running without interference from the mob.\nDevelopment for a pilot episode began in 2010, when 20th Century Fox Television and Imagine TV attempted to produce the concept in time for the 2010–11 television season; however, it never materialized. Its scripts were picked up by NBC in January 2011 and two months afterwards, principal photography for the episode commenced in Chicago, Illinois, where it occurred over a period of nine days."""
892749,ns/m.0hnb4n5,ns/tv.tv_series_episode.air_date,"""2011-09-19"""
892750,ns/m.0hnb4n5,key/wikipedia.en,"""Pilot_$0028The_Playboy_Club$0029"""
892751,ns/m.0hnb4n5,ns/type.object.name,"""Pilot"""
892752,ns/m.0hnb4n5,ns/tv.tv_series_episode.director,ns/m.08l5l5


Unnamed: 0,subject,predicate,object
943279,ns/m.06gt51c,ns/tv.tv_series_episode.episode_number,"""1"""
943280,ns/m.06gt51c,ns/common.topic.description,"""Cleveland Brown moves with Cleveland, Jr. from Quahog, Rhode Island to his hometown of Stoolbend, Virginia to reunite with his high school girlfriend. This version has different dialogue from the version which finally aired on 27th September 2009."""
943281,ns/m.06gt51c,key/wikipedia.en,"""Pilot_cleveland_show"""
943282,ns/m.06gt51c,ns/tv.tv_series_episode.air_date,"""2009-09-27"""
943283,ns/m.06gt51c,ns/tv.tv_series_episode.series,ns/m.03y3bp7
943284,ns/m.06gt51c,ns/tv.tv_series_episode.production_number,"""1APS01"""
943285,ns/m.06gt51c,ns/type.object.name,"""Pilot"""
943286,ns/m.06gt51c,ns/tv.tv_series_episode.writer,ns/m.09xwyv
943287,ns/m.06gt51c,ns/tv.tv_series_episode.season_number,"""1"""
943288,ns/m.06gt51c,ns/common.topic.alias,"""Pilot (Original)"""


Unnamed: 0,subject,predicate,object
1041923,ns/m.0fcygc,ns/tv.tv_series_episode.season,ns/m.05dt_g2
1041924,ns/m.0fcygc,ns/tv.tv_series_episode.air_date,"""2006-07-07"""
1041925,ns/m.0fcygc,ns/tv.tv_series_episode.next_episode,ns/m.0fd5j3
1041926,ns/m.0fcygc,ns/tv.tv_series_episode.episode_number,"""1"""
1041927,ns/m.0fcygc,ns/common.topic.description,"""\""Pilot\"" is the first episode of the TV series, Psych. It originally aired on USA Network on July 7, 2006."""
1041928,ns/m.0fcygc,key/wikipedia.en,"""Pilot_$0028Psych$0029"""
1041929,ns/m.0fcygc,ns/type.object.name,"""Pilot"""
1041930,ns/m.0fcygc,ns/common.topic.notable_types,ns/m.01xrzlb
1041931,ns/m.0fcygc,ns/tv.tv_series_episode.production_number,"""1001"""
1041932,ns/m.0fcygc,ns/tv.tv_series_episode.writer,ns/m.0gqmn2


Unnamed: 0,subject,predicate,object
1061250,ns/m.05dlk4d,ns/tv.tv_series_episode.season_number,"""1"""
1061251,ns/m.05dlk4d,ns/type.object.name,"""Pilot"""
1061252,ns/m.05dlk4d,ns/common.topic.description,"""Sam, Billie, Adam and Dan are four New Yorkers who are dealing with their individual eating disorders together. All four of them lean on one another for support and are good friends. They attend Belttighteners, which is a recovery program that approaches their issues extremely ruthless ways. Sam is trying to shape his date into a woman from a hot TV commercial, which horrifies Billie. Dan struggles with his choice for a gastric bypass surgery and Adam chases down a deliveryman for food."""
1061253,ns/m.05dlk4d,ns/tv.tv_series_episode.director,ns/m.0fh314
1061254,ns/m.05dlk4d,ns/tv.tv_series_episode.air_date,"""2005-08-04"""
1061255,ns/m.05dlk4d,ns/tv.tv_series_episode.writer,ns/m.0fh314
1061256,ns/m.05dlk4d,ns/tv.tv_series_episode.production_number,"""101"""
1061257,ns/m.05dlk4d,ns/tv.tv_series_episode.series,ns/m.07czvc
1061258,ns/m.05dlk4d,ns/tv.tv_series_episode.next_episode,ns/m.05dlk49
1061259,ns/m.05dlk4d,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
1101622,ns/m.06yp_4v,ns/tv.tv_series_episode.writer,ns/m.09hd16
1101623,ns/m.06yp_4v,ns/tv.tv_series_episode.next_episode,ns/m.06yp_54
1101624,ns/m.06yp_4v,ns/tv.tv_series_episode.series,ns/m.01ch04
1101625,ns/m.06yp_4v,ns/tv.tv_series_episode.episode_number,"""1"""
1101626,ns/m.06yp_4v,ns/tv.tv_series_episode.season_number,"""1"""
1101627,ns/m.06yp_4v,ns/tv.tv_series_episode.air_date,"""1993-08-27"""
1101628,ns/m.06yp_4v,ns/common.topic.description,"""When U.S. Marshal Brisco County, Sr. is murdered by John Bly and his gang of outlaws, the robber barons of San Francisco's Westerfield Club hire Brisco County, Jr. as a bounty hunter to round them up. Brisco meets a lot of interesting people along the way, including rival bounty hunter Lord Bowler, and the Westerfield Club's stuffy lawyer, Socrates Poole. He also learns of a mysterious object which John Bly would do anything to possess, for the supernatural power it can give him."""
1101629,ns/m.06yp_4v,ns/type.object.name,"""Pilot"""


Unnamed: 0,subject,predicate,object
1107779,ns/m.0jfyh85,ns/common.topic.description,"""With his father away at the Crusades, Edmund comes up with a plan to prove his brother is illegitimate, thus making him Prince Regent.\n\nThe Blackadder pilot was shot but never aired on terrestrial TV in the UK (although some scenes were shown in the 25th anniversary special Blackadder Rides Again). One notable difference in the pilot, as in many pilots, is the casting. Baldrick is played not by Tony Robinson, but by Philip Fox. Another significant difference is that the character of Prince Edmund presented in the pilot is much closer to the intelligent, conniving Blackadder of the later series than the sniveling, weak Edmund of the original series. Set in the year 1582, the script of the pilot is roughly the same as the episode \""Born to be King\"", albeit with some different jokes, with some lines appearing in other episodes of the series."""
1107780,ns/m.0jfyh85,ns/tv.tv_series_episode.series,ns/m.01d25
1107781,ns/m.0jfyh85,ns/common.topic.alias,"""Original Pilot"""
1107782,ns/m.0jfyh85,ns/type.object.name,"""Pilot"""
1107783,ns/m.0jfyh85,ns/tv.tv_series_episode.season_number,"""0"""
1107784,ns/m.0jfyh85,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
1119544,ns/m.0703lxy,ns/tv.tv_series_episode.series,ns/m.0642f_4
1119545,ns/m.0703lxy,ns/common.topic.description,"""Dylan is given an impossible pro bono custody case, Beth keeps mum when a 90-year-old man signs an agreement thinking that she's his daughter, Liam must get a client to sign with the firm under false pretenses, Addy finally gets some attention from her boss when she speaks her mind, and Malcolm gets off on the wrong foot when he's hired outside of the firm's traditional process."""
1119546,ns/m.0703lxy,ns/tv.tv_series_episode.air_date,"""2010-01-21"""
1119547,ns/m.0703lxy,ns/type.object.name,"""Pilot"""
1119548,ns/m.0703lxy,ns/tv.tv_series_episode.season_number,"""1"""
1119549,ns/m.0703lxy,ns/tv.tv_series_episode.next_episode,ns/m.09rmkb1
1119550,ns/m.0703lxy,ns/tv.tv_series_episode.episode_number,"""1"""


Unnamed: 0,subject,predicate,object
1119630,ns/m.0kd3pbb,ns/type.object.name,"""Pilot"""
1119631,ns/m.0kd3pbb,ns/tv.tv_series_episode.episode_number,"""1"""
1119632,ns/m.0kd3pbb,ns/tv.tv_series_episode.series,ns/m.0642f_4
1119633,ns/m.0kd3pbb,ns/common.topic.description,"""Dylan is given an impossible pro bono custody case, Beth keeps mum when a 90-year-old man signs an agreement thinking that she's his daughter, Liam must get a client to sign with the firm under false pretenses, Addy finally gets some attention from her boss when she speaks her mind, and Malcolm gets off on the wrong foot when he's hired outside of the firm's traditional process."""
1119634,ns/m.0kd3pbb,ns/tv.tv_series_episode.air_date,"""2010-01-21"""
1119635,ns/m.0kd3pbb,ns/tv.tv_series_episode.season_number,"""1"""


**Observation**: As one can see from the above example for the name "Pilot", it is full of Pilot episodes from various TV Shows.

## Unique Predicate Names

In [49]:
unique_predicates = freebase_df['predicate'].drop_duplicates().tolist()

In [50]:
len(unique_predicates)

533

## Unique Descriptions

In [51]:
description_df = freebase_df[freebase_df["predicate"] == "ns/common.topic.description"].drop_duplicates()

# List of unique descriptions, defined as the value (object) of the
# "ns/common.topic.description" (predicate) of the corresponding entity's ID (subject)
unique_descriptions = description_df["object"].drop_duplicates().tolist()

In [52]:
len(description_df)  # unique entity ID - description pairs

75844

In [53]:
len(unique_descriptions)

75655

In [54]:
len(description_df) - len(unique_descriptions) # non-unique descriptions

189

### Unique Entity ID to Description and vice-versa Dictionaries

In [55]:
id_to_description = description_df.set_index("subject")["object"].to_dict()

In [56]:
print("Number of unique IDs:")
print(len(id_to_name))

Number of unique IDs:
92128


In [57]:
description_to_id = {}
for _id, description in id_to_description.items():
    description_to_id.setdefault(description, []).append(_id)

In [58]:
print("Number of unique descriptions:")
print(len(description_to_id))

Number of unique descriptions:
75655


In [59]:
description_to_id_1to1 = {k: v[0] for k, v in description_to_id.items() if len(v) == 1}

In [60]:
print("Number of unique descriptions that correspond to single entity ID:")
print(len(description_to_id_1to1))

Number of unique descriptions that correspond to single entity ID:
75472


In [61]:
description_to_id_1toN = {k: v for k, v in description_to_id.items() if len(v) > 1}

In [62]:
print("Number of unique descriptions that correspond to multiple entity IDs:")
print(len(description_to_id_1toN))

Number of unique descriptions that correspond to multiple entity IDs:
183


In [63]:
description_to_id_1toN

{'"Patrick\'s parents are coming over and Patrick doesn\'t want to look stupid, so he tells SpongeBob to be stupider than him."': ['ns/m.07749b',
  'ns/m.0jb6zfd'],
 '"TBA"': ['ns/m.05dtyb4', 'ns/m.05dty9w'],
 '"Told by Robert Powell"': ['ns/m.0k1pdw2',
  'ns/m.0k1pdv6',
  'ns/m.0kng1fl',
  'ns/m.0k1pdtd',
  'ns/m.0k1pdvs'],
 '"New York 2012 and New York 1912, a hundred year gap, with something scarily in common, something unnatural, and whatever you do - do not blink."': ['ns/m.0lbsc1p',
  'ns/m.0lbsc48'],
 '"Finn and Jake go on a quest for a magical book that would prove them worthy of being righteous heroes."': ['ns/m.0gfk03c',
  'ns/m.0gfkc0h'],
 '"Finn and Princess Bubblegum must protect the Candy Kingdom from a horde of candy zombies they accidentally created."': ['ns/m.0gfk6h0',
  'ns/m.0gfx0m2'],
 '"Jake\'s plan to spend time with both Lady Rainicorn and Finn backfires."': ['ns/m.0gfvp0h',
  'ns/m.0kcpdm4'],
 '"Big money problems send Finn and Jake to Wildberry Kingdom."': ['ns

**"Told by Robert Powell" Example**

In [64]:
example_description_1toN = "Told by Robert Powell"
example_ids_1toN = description_to_id_1toN[f'"{example_description_1toN}"']
print(example_ids_1toN)

['ns/m.0k1pdw2', 'ns/m.0k1pdv6', 'ns/m.0kng1fl', 'ns/m.0k1pdtd', 'ns/m.0k1pdvs']


In [65]:
# Data Exploration
for example_id in example_ids_1toN:
    display(freebase_df.query(f'subject=="{example_id}"'))

Unnamed: 0,subject,predicate,object
26463,ns/m.0k1pdw2,ns/common.topic.description,"""Told by Robert Powell"""
26464,ns/m.0k1pdw2,ns/tv.tv_series_episode.air_date,"""1986-12-30"""
26465,ns/m.0k1pdw2,ns/common.topic.alias,"""The Turn of the Screw"""
26466,ns/m.0k1pdw2,ns/tv.tv_series_episode.season_number,"""0"""
26467,ns/m.0k1pdw2,ns/type.object.name,"""The Rose Garden"""
26468,ns/m.0k1pdw2,ns/tv.tv_series_episode.series,ns/m.05f4vlb
26469,ns/m.0k1pdw2,ns/tv.tv_series_episode.episode_number,"""8"""


Unnamed: 0,subject,predicate,object
26565,ns/m.0k1pdv6,ns/tv.tv_series_episode.air_date,"""1986-12-28"""
26566,ns/m.0k1pdv6,ns/type.object.name,"""Wailing Well"""
26567,ns/m.0k1pdv6,ns/common.topic.description,"""Told by Robert Powell"""
26568,ns/m.0k1pdv6,ns/tv.tv_series_episode.episode_number,"""6"""
26569,ns/m.0k1pdv6,ns/common.topic.alias,"""Number 13"""
26570,ns/m.0k1pdv6,ns/tv.tv_series_episode.season_number,"""0"""
26571,ns/m.0k1pdv6,ns/tv.tv_series_episode.series,ns/m.05f4vlb


Unnamed: 0,subject,predicate,object
26581,ns/m.0kng1fl,ns/tv.tv_series_episode.season_number,"""0"""
26582,ns/m.0kng1fl,ns/tv.tv_series_episode.series,ns/m.05f4vlb
26583,ns/m.0kng1fl,ns/type.object.name,"""The Ash Tree"""
26584,ns/m.0kng1fl,ns/common.topic.description,"""Told by Robert Powell"""
26585,ns/m.0kng1fl,ns/tv.tv_series_episode.episode_number,"""5"""
26586,ns/m.0kng1fl,ns/tv.tv_series_episode.air_date,"""1986-12-26"""


Unnamed: 0,subject,predicate,object
26617,ns/m.0k1pdtd,ns/common.topic.alias,"""The Stalls of Barchester"""
26618,ns/m.0k1pdtd,ns/tv.tv_series_episode.air_date,"""1986-12-25"""
26619,ns/m.0k1pdtd,ns/tv.tv_series_episode.episode_number,"""4"""
26620,ns/m.0k1pdtd,ns/type.object.name,"""The Mezzotinit"""
26621,ns/m.0k1pdtd,ns/tv.tv_series_episode.season_number,"""0"""
26622,ns/m.0k1pdtd,ns/tv.tv_series_episode.series,ns/m.05f4vlb
26623,ns/m.0k1pdtd,ns/common.topic.description,"""Told by Robert Powell"""


Unnamed: 0,subject,predicate,object
26624,ns/m.0k1pdvs,ns/tv.tv_series_episode.episode_number,"""7"""
26625,ns/m.0k1pdvs,ns/common.topic.alias,"""A Warning to the Curious"""
26626,ns/m.0k1pdvs,ns/common.topic.description,"""Told by Robert Powell"""
26627,ns/m.0k1pdvs,ns/tv.tv_series_episode.season_number,"""0"""
26628,ns/m.0k1pdvs,ns/tv.tv_series_episode.air_date,"""1986-12-29"""
26629,ns/m.0k1pdvs,ns/type.object.name,"""Oh, Whistle and I'll Come To You, My Lad"""
26630,ns/m.0k1pdvs,ns/tv.tv_series_episode.series,ns/m.05f4vlb


**Observation**: The description here is correct, but not distinguishing. These entities are episodes of Classic Ghost Stories told by Robert Powell. However, only the "Told by Robert Powell" is written in the description, giving no meaningful information about the entity.

**Episodes from "The Office" TV Show Example**

In [66]:
example_description_1toN = "Michael decides to open a cafe-disco in his old office. Pam and Jim are planning a secret trip."
example_ids_1toN = description_to_id_1toN[f'"{example_description_1toN}"']
print(example_ids_1toN)

['ns/m.05s_rz5', 'ns/m.05zt0y_']


In [67]:
# Data Exploration
for example_id in example_ids_1toN:
    display(freebase_df.query(f'subject=="{example_id}"'))

Unnamed: 0,subject,predicate,object
209890,ns/m.05s_rz5,ns/tv.tv_series_episode.director,ns/m.04t2l2
209891,ns/m.05s_rz5,ns/tv.tv_series_episode.writer,ns/m.0g9yd6_
209892,ns/m.05s_rz5,ns/tv.tv_series_episode.episode_number,"""23"""
209893,ns/m.05s_rz5,ns/common.topic.description,"""Michael decides to open a cafe-disco in his old office. Pam and Jim are planning a secret trip."""
209894,ns/m.05s_rz5,ns/tv.tv_series_episode.season_number,"""5"""
209895,ns/m.05s_rz5,ns/tv.tv_series_episode.air_date,"""2009-04-23"""
209896,ns/m.05s_rz5,ns/tv.tv_series_episode.next_episode,ns/m.05sxf44
209897,ns/m.05s_rz5,ns/tv.tv_series_episode.series,ns/m.08jgk1
209898,ns/m.05s_rz5,ns/type.object.name,"""Broke"""
209899,ns/m.05s_rz5,ns/freebase.valuenotation.is_reviewed,ns/m.01xrztx


Unnamed: 0,subject,predicate,object
221146,ns/m.05zt0y_,ns/type.object.name,"""Company Picnic"""
221147,ns/m.05zt0y_,ns/tv.tv_series_episode.season_number,"""5"""
221148,ns/m.05zt0y_,ns/tv.tv_series_episode.previous_episode,ns/m.05zlyff
221149,ns/m.05zt0y_,ns/tv.tv_series_episode.series,ns/m.08jgk1
221150,ns/m.05zt0y_,ns/tv.tv_series_episode.episode_number,"""26"""
221151,ns/m.05zt0y_,ns/common.topic.notable_types,ns/m.01xrzlb
221152,ns/m.05zt0y_,ns/tv.tv_series_episode.air_date,"""2009-05-14"""
221153,ns/m.05zt0y_,ns/common.topic.description,"""Michael decides to open a cafe-disco in his old office. Pam and Jim are planning a secret trip."""
221154,ns/m.05zt0y_,ns/tv.tv_series_episode.season,ns/m.0462vdy
221155,ns/m.05zt0y_,key/wikipedia.en,"""Company_Picnic"""


**Observation**: The description here is wrong and possibly there might be more such cases in Freebase. The description is accurate in the fact that it is about an episode from "The Office", however it is not either of the episode names shown above. This description suits better to episode [Cafe Disco](https://en.wikipedia.org/wiki/Cafe_Disco). Additionally, the episode numbers are incorrect. This means that Freebase or the max1024 subset as provided by the WikiGraphs paper may have inconsistent information.

# Vector Database

In [68]:
class VectorDatabase:
    def __init__(self, texts, vecdb_path, similarity="cosine"):
        """
        Initializes a VectorDatabase object for managing a vector database
        containing a list of given texts.

        This class stores and manages the given texts as word
        embeddings within a ChromaDB collection using a persistent storage.
        It provides functionality for querying and retrieving relevant
        texts based on similarity to a given query.

        Args:
            texts (list): A list of entity texts to be stored in the vector database.
            vecdb_path (str): The path to the directory where the ChromaDB collection
                will be stored.
            similarity (str, optional): The similarity metric to use for nearest neighbor search.
                Defaults to "cosine". Options include "cosine", "l2", "ip", or "dot".
        """
        self.texts = texts
        self.vecdb_path = vecdb_path
        self.similarity = similarity

        self.client = chromadb.PersistentClient(path=self.vecdb_path)
        self.collection_name = Path(self.vecdb_path).stem
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": self.similarity}
        )
        self.ids = [f"id{i}" for i in range(1, len(texts) + 1)]

    def load_db(self, batch_size=None, reset=False):
        if reset:
            self.client.delete_collection(name=self.collection_name)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"hnsw:space": self.similarity}
            )

        if self.collection.count() == 0:  # if collection is empty recreate it
            if type(batch_size) == int and batch_size > 0:
                if batch_size > 41666:
                    raise ValueError("batch_size must be less than 41666")
                for i in tqdm(range(0, len(self.texts), batch_size), desc="Creating VectorDB"):
                    batch_texts = self.texts[i : i + batch_size]
                    batch_ids = self.ids[i : i + batch_size]
                    self.collection.add(documents=batch_texts, ids=batch_ids)
            else:
                if len(self.texts) > 41666:
                    raise ValueError("Items to be added to collection must be "
                        "less than 41666. Please consider running in batches.")
                self.collection.add(documents=self.texts, ids=self.ids)

            print("VectorDB created at: " + self.vecdb_path)
        else:
            print("Loaded VectorDB from peristent storage")

    def query(self, query_text, n_results):
        results = self.collection.query(
            query_texts=[query_text],
            n_results=n_results
        )
        return results['documents']

## Names Vector Database

In [69]:
# Download and extract already created embeddings
download_file("https://drive.usercontent.google.com/download?id=1GmZrrVyUYamfv66NxTjVp7cvFlcb2U-v&export=download&confirm=t", "name_embeddings.tar.xz")
extract_tar("name_embeddings.tar.xz", save_path=".")

File name_embeddings.tar.xz already exists. Skipping download.
Skipping name_embeddings, already exists
Skipping name_embeddings/2d5ad6ea-6570-43c9-b284-1e733fd6d2e7, already exists
Skipping name_embeddings/2d5ad6ea-6570-43c9-b284-1e733fd6d2e7/data_level0.bin, already exists
Skipping name_embeddings/2d5ad6ea-6570-43c9-b284-1e733fd6d2e7/header.bin, already exists
Skipping name_embeddings/2d5ad6ea-6570-43c9-b284-1e733fd6d2e7/index_metadata.pickle, already exists
Skipping name_embeddings/2d5ad6ea-6570-43c9-b284-1e733fd6d2e7/length.bin, already exists
Skipping name_embeddings/2d5ad6ea-6570-43c9-b284-1e733fd6d2e7/link_lists.bin, already exists
Skipping name_embeddings/chroma.sqlite3, already exists
File name_embeddings.tar.xz extracted successfully.


In [70]:
name_embeddings = VectorDatabase(
    texts=unique_enitity_names,
    vecdb_path="name_embeddings"
)

In [71]:
name_embeddings.load_db(batch_size=1000)

Loaded VectorDB from peristent storage


### Example

In [72]:
name_embeddings.query("George", 10)  # top 10 example

[['"George"',
  '"George"',
  '"John George"',
  '"George Michael"',
  '"George VI"',
  '"George V"',
  '"George Martin"',
  '"George Best"',
  '"Harold L. George"',
  '"Georgecalvert"']]

In [73]:
name_embeddings.query("dog", 1)  # top 1 example

[['"Dog"']]

### Example based on a Parsed Pair

In [74]:
node_id = example_pair.center_node

In [75]:
id_to_name[node_id]

'"Valkyria Chronicles 3: Unrecorded Chronicles"'

In [76]:
# Using title to get the ID with the help of the vector database
title = parsed_pairs[0].title
name = name_embeddings.query(title, n_results=1)[0][0]
print(f"Query: {title}\nResult: {name}")

Query: Valkyria_Chronicles_III
Result: "Valkyria Chronicles 3: Unrecorded Chronicles"


In [77]:
name_to_id[name][0]

'ns/m.0ddd390'

## Descriptions Vector Database

In [78]:
# Download and extract already created embeddings
#download_file("https://drive.usercontent.google.com/download?id=1GmZrrVyUYamfv66NxTjVp7cvFlcb2U-v&export=download&confirm=t", "name_embeddings.tar.xz")
#extract_tar("name_embeddings.tar.xz", save_path=".")

In [79]:
description_embeddings = VectorDatabase(
    texts=unique_descriptions,
    vecdb_path="description_embeddings"
)

In [80]:
description_embeddings.load_db(batch_size=1000)

Loaded VectorDB from peristent storage


### Example

In [81]:
description_embeddings.query("George", 10)  # top 10 example

[['"George blows an invitation upstairs with his latest girlfriend and then when he tries to make restitution he leaves progressively nastier messages on her answering machine. He gets the chance to prevent her from hearing her messages by having Jerry switch the tape out of her machine, while he distracts her. Jerry and his girlfriend have a disagreement about a TV commercial for Dockers, and his telling his friends about their conflict."',
  '"George Sr. suggests a last-ditch effort to save the family by holding a fund raiser to raise money to pay their legal expenses. Michael suggests to Lindsay that she take over the house work, and he sends George-Michael to an expensive boarding school. Meanwhile, Maeby has trouble with a snowboarding film and Tobias tries to be a \\"discipline daddy\\"."',
  '"George IV was King of the United Kingdom of Great Britain and Ireland and of Hanover following the death of his father, George III, on 29 January 1820, until his own death ten years later.

In [82]:
description_embeddings.query("dog", 1)  # top 1 example

[['"The domestic dog is a domesticated canid which has been selectively bred for millennia for various behaviors, sensory capabilities, and physical attributes.\\nAlthough initially thought to have originated as a manmade variant of an extant canid species, extensive genetic studies undertaken during the 2010s indicate that dogs diverged from other wolf-like canids in Eurasia 40,000 years ago. Being the oldest domesticated animals, their long association with people has allowed dogs to be uniquely attuned to human behavior, as well as thrive on a starch-rich diet which would be inadequate for other canid species.\\nDogs perform many roles for people, such as hunting, herding, pulling loads, protection, assisting police and military, companionship, and, more recently, aiding handicapped individuals. This impact on human society has given them the nickname \\"man\'s best friend\\" in the Western world. In some cultures, however, dogs are also a source of meat."']]

## Predicate Vector Database

In [83]:
predicate_embeddings = VectorDatabase(
    texts=unique_predicates,
    vecdb_path="predicate_embeddings"
)

In [84]:
predicate_embeddings.load_db(batch_size=1000)

Loaded VectorDB from peristent storage


### Example

In [85]:
predicate_embeddings.query("parent of Google", 10)  # top 10 example

[['ns/organization.organization.parent',
  'ns/business.business_location.parent_company',
  'ns/organization.organization_relationship.parent',
  'ns/people.person.parents',
  'ns/organization.organization.child',
  'ns/people.family_member.family',
  'ns/organization.organization_relationship.child',
  'ns/base.wikipedia_infobox.video_game.developer',
  'ns/organization.organization_founder.organizations_founded',
  'ns/organization.organization.founders']]

In [86]:
predicate_embeddings.query("parent of John", 10)  # top 10 example

[['ns/people.person.parents',
  'ns/organization.organization.parent',
  'ns/people.family_member.family',
  'ns/organization.organization_relationship.parent',
  'ns/people.family.members',
  'ns/organization.organization_relationship.child',
  'ns/people.person.children',
  'ns/organization.organization.child',
  'ns/film.actor.film',
  'ns/people.person.place_of_birth']]

__Observation__: As one can notice from the above two examples, using the vector database, a descriptive text of a relation can be translated to the appropriate predicate. Here the Predicate Vector Database could understand the meaning of _Google_ and _John_ by finding as first result the `...organization.parent` and `...person.parents` predicates respectively, which are the correct ones.

## Backup Vector Databases

In [87]:
# create_tar("name_embeddings", ".", "xz")
# create_tar("predicate_embeddings", ".", "xz")
# create_tar("description_embeddings", ".", "xz")

## Restore Vector Databases

In [88]:
# extract_tar("name_embeddings.tar.xz", save_path=".")
# extract_tar("predicate_embeddings.tar.xz", save_path=".")
# extract_tar("description_embeddings.tar.xz", save_path=".")

# Generated Graph

The Generted Graph is created by prompting the LLM.

In [89]:
def graph_prompt(input_text: str, metadata={}, model="llama3:latest", silent=True, json_parse=True):
    SYS_PROMPT = (
        "You are a knowledge graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include object, entity, location, organization, person, \n"
            "\tcondition, acronym, documents, service, concept, etc.\n"
            "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
            "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        "Format your output as a list of json. Each element of the list contains a pair of terms"
        "and the relation between them, like the follwing: \n"
        "[\n"
        "   {\n"
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
        "   }, {...}\n"
        "]\n"
        #"When naming the nodes, use the same name for nodes that correspond to the same concept. Try to use the most descriptive and concise name.\n"
        "Do not add any other comment before or after the json. Respond ONLY with a well formed json that can be directly read by a program."
    )
    USER_PROMPT = f"context: ```{input_text}``` \n\n output: "
    response_dict = ollama.generate(model=model, system=SYS_PROMPT, prompt=USER_PROMPT)
    response = response_dict["response"]
    if not json_parse:
        return response
    try:
        result = json.loads(response)
        result = [dict(item, **metadata) for item in result]
    except json.decoder.JSONDecodeError as e:
        if not silent:
            print("\n\n JSON Parse ERROR ### Input : ", input_text[:50], "...\n\n")
        result = None
    except Exception as e:
        print("Unexpected Exception:\n", e)
    return result

## Split Text to Chunks
Splitting the document to smaller chunks, because when giving a big chunk to the LLM, it may produce
a summary of the given text rather than the requested graph as a JSON formatted text.

In [90]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

In [91]:
texts_pages = []  # list of page lists extracted from each text
for pair in parsed_pairs[:GENERATED_GRAPHS_SAMPLE_SIZE]:
    pages = splitter.split_text(pair.text)
    texts_pages.append(pages)

In [92]:
for i, pages in enumerate(texts_pages):
    print(f"###### ###### ######")
    print("Text title:", parsed_pairs[i].title)
    print("Number of Pages:", len(pages))
    print(f"###### ###### ######\n")
    for j, page in enumerate(pages, start=1):
        print(f"****** Page {j} ******:\n{page}\n****** ------ ******\n\n")

###### ###### ######
Text title: Valkyria_Chronicles_III
Number of Pages: 21
###### ###### ######

****** Page 1 ******:
= Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standar

### Drop Garbage Pages
Drop pages that are 50 characters in size or less to prevent creating false triplets, since these mostly include non meaningful text that could confuse the LLM

In [93]:
for i, pages in enumerate(texts_pages):
    print("Text title:", parsed_pairs[i].title)
    print("Number of Pages:", len(pages))
    dropped_pages = [page for page in pages if len(page) <= 50]
    texts_pages[i] = [page for page in pages if len(page) > 50]
    print("Garbage pages dropped:", len(dropped_pages), "\n")

Text title: Valkyria_Chronicles_III
Number of Pages: 21
Garbage pages dropped: 0 

Text title: Tower_Building_of_the_Little_Rock_Arsenal
Number of Pages: 21
Garbage pages dropped: 0 

Text title: Cicely_Mary_Barker
Number of Pages: 14
Garbage pages dropped: 0 

Text title: Plain_maskray
Number of Pages: 6
Garbage pages dropped: 0 

Text title: 2011$201312_Columbus_Blue_Jackets_season
Number of Pages: 19
Garbage pages dropped: 0 

Text title: Gregorian_Tower
Number of Pages: 8
Garbage pages dropped: 0 

Text title: There$0027s_Got_to_Be_a_Way
Number of Pages: 5
Garbage pages dropped: 0 

Text title: Nebraska_Highway_88
Number of Pages: 4
Garbage pages dropped: 0 

Text title: USS_Atlanta_$00281861$0029
Number of Pages: 16
Garbage pages dropped: 1 

Text title: Jacqueline_Fernandez
Number of Pages: 14
Garbage pages dropped: 0 



## Generating Subgraphs from Pages
After the generation procedure, the texts' subgraphs is saved as JSON in order to avoid repeating the LLM heavy processing on a second run of the notebook using the same LLM model.

It was chosen on purpose to save the texts' subgraphs before they are being parsed in case an error happens during parsing which would discard the whole progress. Also the benefit of saving the file after the subgraphs have been parsed is negligible, since the parsing process executes instantly.

In [94]:
texts_subgraphs_json_path = f"tmp/texts_subgraphs_{LLM_MODEL.replace(':', '-')}.json"
if not os.path.exists(texts_subgraphs_json_path):
    texts_subgraphs = []  # list of subgraph lists generated from each text
    for pages in tqdm(texts_pages):
        subgraphs = []
        for page in tqdm(pages, leave=False):
            subgraph = graph_prompt(page, model=LLM_MODEL, silent=False, json_parse=False)
            subgraphs.append(subgraph)
        texts_subgraphs.append(subgraphs)
    with open(texts_subgraphs_json_path, "w") as f:
        json.dump(texts_subgraphs, texts_subgraphs_json_path)
else:
    with open(texts_subgraphs_json_path) as f:
        texts_subgraphs = json.load(f)

## Cleaning Text

In [95]:
def fix_prompt_output(text):
    """Fixes the prompt output by removing text that would produce a malfromed JSON."""
    # any line not starting with these characters is an LLM comment and not a JSON text line
    starting_characters = ('"', '{', '}', '[', ']')
    lines = text.splitlines()
    filtered_lines = [line for line in lines if any(line.lstrip().startswith(char) for char in starting_characters)]
    return "\n".join(filtered_lines)

__NOTE__:

1. Using the first prompt (based on [knowledge graph][1]) without the added line for producing only well formed json, for this text input sample, it outputs 5 well formed JSON out of the 20 text inputs (i.e missing 15).
    1. Using the fix_prompt_output() function to filter the garbase text from the output text, by helping it conform to the JSON format, it outputs 20 out of 20 (i.e none missing).
3. Using the second prompt (based on both [knowledge graph][1] and [knowledge graph maker][2]), it still outputs 20 out of 20, but it also outputs more triplets. In particular 181 compared to 160 using the first prompt with fix_prompt_output(). Using the fix_prompt_output() function to the second prompt does no nothing, since the prompt produces correct JSON output without any filtering (at least for these 20 text inputs). In order to built a more robust solution however, the text inputs will be fed to fix_prompt_output() before being JSON parsed just in case a malformed output is produced.

[1]: https://github.com/rahulnyk/knowledge_graph/blob/main/helpers/prompts.py
[2]: https://github.com/rahulnyk/knowledge_graph_maker/blob/main/knowledge_graph_maker/graph_maker.py

In [96]:
fixed_texts_subgraphs = []
for subgraphs in texts_subgraphs:    
    fixed_subgraphs = []
    for subgraph in subgraphs:
        fixed_subgraph = fix_prompt_output(subgraph)
        fixed_subgraphs.append(fixed_subgraph)
    fixed_texts_subgraphs.append(fixed_subgraphs)

### Parsing Subgraphs and Merging Them

In [97]:
parsed_texts_subgraphs = []
for fixed_subgraphs in fixed_texts_subgraphs:  
    parsed_subgraphs = []
    for fixed_subgraph in fixed_subgraphs:
        try:
            parsed_subgraph = json.loads(fixed_subgraph)
        except json.decoder.JSONDecodeError as e:
            print(e)
            continue
        except Exception as e:
            print("Unexpected Exception:\n", e)
            continue
        parsed_subgraphs.append(parsed_subgraph)
    parsed_texts_subgraphs.append(parsed_subgraphs)

In [98]:
parsed_graphs = []
for parsed_subgraphs in parsed_texts_subgraphs:
    parsed_graph = list(chain(*parsed_subgraphs))  # join the subgraphs to create a whole graph
    parsed_graphs.append(parsed_graph)

In [99]:
parsed_graphs[-1]

[{'node_1': 'Jacqueline Fernandez',
  'node_2': 'Sri Lankan actress',
  'edge': 'is a'},
 {'node_1': 'Jacqueline Fernandez',
  'node_2': 'former model',
  'edge': 'was also a'},
 {'node_1': 'Miss Universe Sri Lanka',
  'node_2': 'Jacqueline Fernandez',
  'edge': 'represented her country at the 2006 world Miss Universe pageant'},
 {'node_1': 'Jacqueline Fernandez',
  'node_2': 'University of Sydney',
  'edge': 'graduated with a degree in mass communication from'},
 {'node_1': "Sujoy Ghosh's Aladin",
  'node_2': 'Jacqueline Fernandez',
  'edge': 'marked her acting debut as an actress'},
 {'node_1': "Mohit Suri's Murder 2",
  'node_2': 'Jacqueline Fernandez',
  'edge': "was Fernandez' breakthrough role, a commercial success and garnered her an IIFA Award for Best Supporting Actress nomination"},
 {'node_1': 'Housefull 2',
  'node_2': 'Jacqueline Fernandez',
  'edge': 'Fernandez played a glamorous role in the ensemble @-@ comedy'},
 {'node_1': 'IIFA Award for Star Debut of the Year – Femal

#### Wrongly Added Additional Properties by LLM

After doing a manual investigation, sometimes, the LLM may not be consistent and wrongly add additional properties (e.x like "relationType") that are outside of the standard triplets __node_1__, __node_2__ and __edge__.

These additional properties are found (if any) below and are removed.

In [100]:
for i, graph in enumerate(parsed_graphs):
    for triplet in graph:
        if len(triplet) > 3:
            illegal_properties = {k: v for k, v in triplet.items() if k not in ("node_1", "node_2", "edge")}
            for key in illegal_properties.keys():
                del triplet[key]  # Delete illegal property
            print(f"Graph {i}: Found these illegal properties: {illegal_properties}")
print("All illegal properties deleted (if any)")

Graph 2: Found these illegal properties: {'relationType': 'published in 1917'}
Graph 2: Found these illegal properties: {'relationType': 'published in 1920'}
All illegal properties deleted (if any)


In [101]:
parsed_graphs_dfs_raw = [pd.DataFrame(pg) for pg in parsed_graphs]

In [102]:
parsed_graphs_dfs_raw[0]

Unnamed: 0,node_1,node_2,edge
0,Valkyria Chronicles III,Senjō no Valkyria 3,alternate name for the game
1,Valkyria Chronicles III,PlayStation Portable,platform on which the game was developed and released
2,Valkyria Chronicles III,Valkyria series,part of a larger series of games
3,Valkyria Chronicles III,January 2011,release date in Japan
4,Nameless,penal military unit,type of military unit
...,...,...,...
202,two volumes,Senjō no Valkyria 3 : Namo naki Chikai no Hana,released in
203,2011 and 2012,Senjō no Valkyria 3 : Namo naki Chikai no Hana,between
204,Senjō no Valkyria 3 : Akaki Unmei no Ikusa Otome,Valkyria of the Battlefield 3,alternate adaptation
205,Mizuki Tsuge,Senjō no Valkyria 3 : Akaki Unmei no Ikusa Otome,illustrated by


#### Remove Duplicates

In [103]:
parsed_graphs_dfs_nodups = [df.drop_duplicates() for df in parsed_graphs_dfs_raw]

In [104]:
parsed_graphs_dfs_nodups[0]

Unnamed: 0,node_1,node_2,edge
0,Valkyria Chronicles III,Senjō no Valkyria 3,alternate name for the game
1,Valkyria Chronicles III,PlayStation Portable,platform on which the game was developed and released
2,Valkyria Chronicles III,Valkyria series,part of a larger series of games
3,Valkyria Chronicles III,January 2011,release date in Japan
4,Nameless,penal military unit,type of military unit
...,...,...,...
202,two volumes,Senjō no Valkyria 3 : Namo naki Chikai no Hana,released in
203,2011 and 2012,Senjō no Valkyria 3 : Namo naki Chikai no Hana,between
204,Senjō no Valkyria 3 : Akaki Unmei no Ikusa Otome,Valkyria of the Battlefield 3,alternate adaptation
205,Mizuki Tsuge,Senjō no Valkyria 3 : Akaki Unmei no Ikusa Otome,illustrated by


In [105]:
print("Number of Triplets of each parsed graph:")
print("\t\tRaw\tNo Duplicates\tDuplicates Removed")
for i, (df_raw, df_nodups) in enumerate(zip(parsed_graphs_dfs_raw, parsed_graphs_dfs_nodups)):
    print(f"Graph {i}: \t{len(df_raw)}\t{len(df_nodups)}\t\t{len(df_raw) - len(df_nodups)}")

Number of Triplets of each parsed graph:
		Raw	No Duplicates	Duplicates Removed
Graph 0: 	207	207		0
Graph 1: 	182	182		0
Graph 2: 	176	176		0
Graph 3: 	63	63		0
Graph 4: 	148	148		0
Graph 5: 	73	73		0
Graph 6: 	43	43		0
Graph 7: 	37	37		0
Graph 8: 	133	133		0
Graph 9: 	151	151		0


#### Generated Graph Creation

In [106]:
generated_graphs_dfs = parsed_graphs_dfs_nodups
generated_graphs_dfs_dict = {pair.center_node: df for pair, df in zip(parsed_pairs, generated_graphs_dfs)}

In [107]:
generated_df = pd.concat(generated_graphs_dfs_dict, names=["center_node"])

In [108]:
generated_graphs_dfs_dict['ns/m.0ddd390']

Unnamed: 0,node_1,node_2,edge
0,Valkyria Chronicles III,Senjō no Valkyria 3,alternate name for the game
1,Valkyria Chronicles III,PlayStation Portable,platform on which the game was developed and released
2,Valkyria Chronicles III,Valkyria series,part of a larger series of games
3,Valkyria Chronicles III,January 2011,release date in Japan
4,Nameless,penal military unit,type of military unit
...,...,...,...
202,two volumes,Senjō no Valkyria 3 : Namo naki Chikai no Hana,released in
203,2011 and 2012,Senjō no Valkyria 3 : Namo naki Chikai no Hana,between
204,Senjō no Valkyria 3 : Akaki Unmei no Ikusa Otome,Valkyria of the Battlefield 3,alternate adaptation
205,Mizuki Tsuge,Senjō no Valkyria 3 : Akaki Unmei no Ikusa Otome,illustrated by


In [109]:
print(parsed_pairs[7].text)


 = Nebraska Highway 88 = 
 
 Nebraska Highway 88 ( N @-@ 88 ) is a highway in northwestern Nebraska . It has a western terminus at Wyoming Highway 151 ( WYO 151 ) at the Wyoming – Nebraska state line . The road travels eastward to N @-@ 71 , where it turns south . N @-@ 88 continues east to south of Bridgeport . The road turns north , ends at an intersection with U.S. Highway 385 ( US 385 ) and N @-@ 92 in Bridgeport . The route was designated in 1937 , before the official state highway system was created . It was extended to the state line in 1986 . 
 
 = = Route description = = 
 
 N @-@ 88 starts at the Nebraska – Wyoming state line in Banner County , where WYO 151 ends , and travels northeast . The road quickly bends east after less than one mile ( 1 @.@ 6 km ) , and continues in a straight line . For the next twenty miles ( 32 km ) , N @-@ 88 intersects minor streets , through rural farmland . The route turns south at N @-@ 71 , and becomes concurrent . Four miles ( 6 @.@ 4 km ) 

In [110]:
parsed_graphs_dfs_raw[7]

Unnamed: 0,node_1,node_2,edge
0,Nebraska Highway 88,Wyoming Highway 151,has western terminus at
1,N @-@ 88,N @-@ 71,turns south to
2,N @-@ 88,Bridgeport,continues east to south of
3,N @-@ 88,U.S. Highway 385,ends at an intersection with
4,Nebraska Highway 88,1937,was designated in
5,Nebraska Highway 88,1986,was extended to the state line in
6,Nebraska – Wyoming,state line,located at
7,WYO 151,ends,terminates at
8,N @-@ 88,travels northeast,direction
9,farmland,passes through,location


In [111]:
generated_graphs_dfs[7]

Unnamed: 0,node_1,node_2,edge
0,Nebraska Highway 88,Wyoming Highway 151,has western terminus at
1,N @-@ 88,N @-@ 71,turns south to
2,N @-@ 88,Bridgeport,continues east to south of
3,N @-@ 88,U.S. Highway 385,ends at an intersection with
4,Nebraska Highway 88,1937,was designated in
5,Nebraska Highway 88,1986,was extended to the state line in
6,Nebraska – Wyoming,state line,located at
7,WYO 151,ends,terminates at
8,N @-@ 88,travels northeast,direction
9,farmland,passes through,location


In [112]:
generated_df

Unnamed: 0_level_0,Unnamed: 1_level_0,node_1,node_2,edge
center_node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ns/m.0ddd390,0,Valkyria Chronicles III,Senjō no Valkyria 3,alternate name for the game
ns/m.0ddd390,1,Valkyria Chronicles III,PlayStation Portable,platform on which the game was developed and released
ns/m.0ddd390,2,Valkyria Chronicles III,Valkyria series,part of a larger series of games
ns/m.0ddd390,3,Valkyria Chronicles III,January 2011,release date in Japan
ns/m.0ddd390,4,Nameless,penal military unit,type of military unit
...,...,...,...,...
ns/m.02q1mmh,146,Elle,magazines,cover model for many Indian editions of
ns/m.02q1mmh,147,Verve,magazines,cover model for many Indian editions of
ns/m.02q1mmh,148,Harper 's Bazaar,magazines,cover model for many Indian editions of
ns/m.02q1mmh,149,Women's Health,magazines,cover model for many Indian editions of


## Data Exploration

### Partial Names

#### Jacqueline Fernandez Example

In [113]:
partial_names_example_graph = parsed_graphs[-1]
partial_names_example_graph

[{'node_1': 'Jacqueline Fernandez',
  'node_2': 'Sri Lankan actress',
  'edge': 'is a'},
 {'node_1': 'Jacqueline Fernandez',
  'node_2': 'former model',
  'edge': 'was also a'},
 {'node_1': 'Miss Universe Sri Lanka',
  'node_2': 'Jacqueline Fernandez',
  'edge': 'represented her country at the 2006 world Miss Universe pageant'},
 {'node_1': 'Jacqueline Fernandez',
  'node_2': 'University of Sydney',
  'edge': 'graduated with a degree in mass communication from'},
 {'node_1': "Sujoy Ghosh's Aladin",
  'node_2': 'Jacqueline Fernandez',
  'edge': 'marked her acting debut as an actress'},
 {'node_1': "Mohit Suri's Murder 2",
  'node_2': 'Jacqueline Fernandez',
  'edge': "was Fernandez' breakthrough role, a commercial success and garnered her an IIFA Award for Best Supporting Actress nomination"},
 {'node_1': 'Housefull 2',
  'node_2': 'Jacqueline Fernandez',
  'edge': 'Fernandez played a glamorous role in the ensemble @-@ comedy'},
 {'node_1': 'IIFA Award for Star Debut of the Year – Femal

In [114]:
print(parsed_pairs[GENERATED_GRAPHS_SAMPLE_SIZE - 1].text)


 = Jacqueline Fernandez = 
 
 Jacqueline Fernandez ( born 11 August 1985 ) is a Sri Lankan actress , former model , and the winner of the 2006 Miss Universe Sri Lanka pageant . As Miss Universe Sri Lanka she represented her country at the 2006 world Miss Universe pageant . She graduated with a degree in mass communication from the University of Sydney , and worked as a television reporter in Sri Lanka . 
 While on a modelling assignment in India in 2009 , Fernandez successfully auditioned for Sujoy Ghosh 's fantasy drama Aladin , which marked her acting debut . Fernandez ' breakthrough role was in Mohit Suri 's psychological thriller Murder 2 ( 2011 ) , her first commercial success . This was followed by glamorous roles in the ensemble @-@ comedy Housefull 2 ( 2012 ) and its sequel Housefull 3 , and the action thriller Race 2 ( 2013 ) , all of which were box @-@ office successes . Her performance in the first of these garnered her an IIFA Award for Best Supporting Actress nomination .

## Standardization

In this step of the process, the descriptive names given by the LLM will be converted to the standard names as defined by Freebase, with the help of the Names and Predicates Vector Databases.