## All movies

In [None]:
import os
import random

import numpy as np
import pandas as pd
# Build the actor / movie / director / costar table and write it out as JSON.

# Load the movie table (tab-separated).
movies_csv_path = 'path'
movies_df = pd.read_csv(movies_csv_path, sep='\t', low_memory=False)

actors_csv_path = 'path'
actors_df = pd.read_csv(actors_csv_path, sep='\t')
# "\N" marks an unknown value. Keep only people whose birth year, known titles
# and profession are all known, and whose death year is unknown.
has_required_fields = (
    (actors_df["birthYear"] != "\\N")
    & (actors_df["knownForTitles"] != "\\N")
    & (actors_df["primaryProfession"] != "\\N")
    & (actors_df["deathYear"] == "\\N")
)
actors_df = actors_df[has_required_fields]
# Summaries of both tables; the return values are not stored or printed here.
movies_df.describe(include="all")
actors_df.describe(include="all")

# Drop duplicate people and keep only rows whose profession mentions
# actor/actress. na=False treats a missing profession as "no match"; without
# it the mask contains NaN and boolean indexing raises ValueError.
actors_df = actors_df.drop_duplicates(subset=["primaryName", "birthYear", "deathYear"])
actors_df = actors_df[
    actors_df["primaryProfession"].str.contains("actor|actress", case=False, na=False)
]

# One row per (person, known title): split the comma-separated id list and
# explode it into the merge key `tconst`.
actors_df["tconst"] = actors_df["knownForTitles"].str.split(",")
actors_df = actors_df.explode("tconst")

# Attach movie information to every (person, title) row.
merged_df = pd.merge(actors_df, movies_df, on="tconst", how="left")
# Only the first listed genre is kept as the movie's main genre.
merged_df["genres"] = merged_df["genres"].str.split(",").str[0]

# Keep the columns of interest and give them descriptive names. A single
# non-inplace rename avoids mutating a column-subset of the merged frame.
var = ["nconst", "tconst", "primaryName", "primaryTitle", "startYear", "genres"]
merged_df = merged_df[var].rename(
    columns={
        "nconst": "actor_id",
        "tconst": "movie_id",
        "primaryName": "actor_name",
        "primaryTitle": "movie_name",
        "startYear": "movie_year",
        "genres": "main_genre",
    }
)

# Create the director dataset
directors_csv_path = 'path'
directors_df = pd.read_csv(directors_csv_path, sep='\t')
names_csv_path = 'path'
names_df = pd.read_csv(names_csv_path, sep='\t')
# na=False for the same reason as in the actor filter above.
names_df = names_df[
    names_df["primaryProfession"].str.contains("director|producer", case=False, na=False)
]
# Look up the director's name by person id.
# NOTE(review): this matches only rows whose `directors` value is exactly one
# id; if that column can hold a comma-separated list, those titles get no name
# and are dropped below -- confirm this is intended.
merged_df_director = pd.merge(
    directors_df, names_df, left_on="directors", right_on="nconst", how="left"
)

# Keep just the name and the movie id, renamed to match the actor table.
var_director = ["primaryName", "tconst"]
merged_df_director = merged_df_director[var_director].rename(
    columns={"tconst": "movie_id", "primaryName": "director_name"}
)

# Merge director and actor datasets, then drop rows without a director name.
dataset = pd.merge(merged_df, merged_df_director, on="movie_id", how="left")
dataset = dataset.dropna(subset=["director_name"])

# Create costars: for each row, every other actor name attached to the same movie.
movie_to_actors = dataset.groupby("movie_id")["actor_name"].apply(set).to_dict()
# Sets of strings iterate in an order that varies between interpreter runs, so
# the list is sorted to make the chosen costar reproducible. key=str keeps the
# sort safe if a name is missing (NaN) rather than a string.
dataset["costars"] = dataset.apply(
    lambda row: sorted(movie_to_actors[row["movie_id"]] - {row["actor_name"]}, key=str),
    axis=1,
)
# First costar of each list; an empty list yields NaN, removed just below.
dataset["costar"] = dataset["costars"].str[0]
dataset = dataset.dropna(subset=["costar"])

# Write out the data
output_dir = 'path'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'data_actors.json')
dataset.to_json(output_file, orient='records')
print(f"\nTotal number of statements: {len(dataset)}")

## Dataset Construction

In [None]:
from unidecode import unidecode
import os
# Target statement: X co-starred with Y in a film by director Z released in the year W
dataset = pd.read_json('path')


def _build_fact(record):
    """Return the comma-separated fact string for one row, or None.

    None is returned when the costar, director name or movie year is missing.
    Text fields are passed through ``unidecode`` after conversion to ``str``;
    the year is appended as-is.
    """
    if (
        pd.isna(record['costar'])
        or pd.isna(record['director_name'])
        or pd.isna(record['movie_year'])
    ):
        return None
    text_columns = ('actor_name', 'costar', 'movie_name', 'director_name', 'main_genre')
    pieces = [unidecode(str(record[column])) for column in text_columns]
    pieces.append(f"{record['movie_year']}")
    return ", ".join(pieces)


# One formatted statement per row (None where required fields are missing).
dataset['fact'] = dataset.apply(_build_fact, axis=1)
# Keep only the statement column, without the rows that produced no statement.
statements_df = dataset.loc[dataset['fact'].notna(), ['fact']]

# Save the statements as a JSON list of records.
output_dir = 'path'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'F_data_actors_short_nofiller.json')
statements_df.to_json(output_file, orient='records')
print(statements_df.head())
print(f"\nTotal number of statements: {len(statements_df)}")