In [1]:
import pandas as pd
import re
import ast

In [2]:
def load_tsv(file):
    r"""Read a tab-separated file into a DataFrame whose columns are all strings.

    Args:
        file: Path or file-like object handed straight to ``pd.read_csv``.

    Returns:
        A DataFrame read with ``dtype=str``; cells equal to the literal
        marker ``\N`` are loaded as missing values.
    """
    read_options = {"sep": "\t", "na_values": '\\N', "dtype": str}
    return pd.read_csv(file, **read_options)
def cleanCharactersFromPrincipal(characters):
    """Extract and clean the first character name from a ``characters`` cell.

    The cell is expected to hold the text of a list literal of strings, for
    example ``'["Mr. Hamilton"]'``. Only the first entry is used; every
    character other than ASCII letters, digits and spaces is removed and the
    result is stripped of surrounding whitespace.

    Args:
        characters: Raw cell value (normally a str; anything else is treated
            as missing).

    Returns:
        str: The cleaned first character name, or ``"not available"`` when the
        value is not a string, cannot be parsed, or is not a non-empty list
        whose first element is a string. The function never returns ``None``.
    """
    if not isinstance(characters, str):
        return "not available"
    try:
        parsed = ast.literal_eval(characters)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval documents for malformed input.
        return "not available"
    if isinstance(parsed, list) and parsed and isinstance(parsed[0], str):
        return re.sub(r"[^A-Za-z0-9 ]+", "", parsed[0]).strip()
    # Parsed fine but not a usable list of names: use the same sentinel as the
    # other failure paths instead of falling through to an implicit None.
    return "not available"

In [3]:
# Load the five raw tables used by the rest of the notebook. load_tsv reads
# every column as str, so numeric columns are still text at this point.
# Each name is bound as soon as its own file has been read.
basics = load_tsv("data/title.basics.tsv")
ratings = load_tsv("data/title.ratings.tsv")
crew = load_tsv("data/title.crew.tsv")
principals = load_tsv("data/title.principals.tsv")
names = load_tsv("data/name.basics.tsv")    

In [4]:
# Keep only rows whose titleType is 'movie'. The .copy() makes the filtered
# frame independent, so the column assignment below does not write into a
# slice of the loaded table (which raises SettingWithCopyWarning).
basics = basics[basics['titleType'] == 'movie'].copy()
# Columns were loaded as str; unparseable values become NaN via errors='coerce'.
basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')
# Build the numeric rating columns on a new frame rather than mutating the
# loaded ratings object in place.
ratings = ratings.assign(
    averageRating=pd.to_numeric(ratings['averageRating'], errors='coerce'),
    numVotes=pd.to_numeric(ratings['numVotes'], errors='coerce'),
)
# Default (inner) merges: only titles present in all three tables remain.
basics_merged = basics.merge(ratings, on='tconst').merge(crew, on='tconst')

In [5]:
# Clean the principals table: keep acting credits (actor / actress) that have
# a characters value and belong to a title in basics_merged, then normalise
# the character names and attach the performer's name.
# basics_merged was built from movie rows only, so no titleType filter is
# needed here.
movie_titles = set(basics_merged['tconst'])
is_movie = principals['tconst'].isin(movie_titles)
# Restricting to actor/actress also excludes the "self" category.
is_acting = principals['category'].isin(['actor', 'actress'])
has_characters = principals['characters'].notna()
# Filter first so the per-row cleaner only runs on rows that are kept, and
# use assign on the filtered result instead of writing into a slice
# (avoids SettingWithCopyWarning).
principals_new = (
    principals.loc[is_movie & is_acting & has_characters]
    .drop(columns='job')
    .assign(characters=lambda frame: frame['characters'].apply(cleanCharactersFromPrincipal))
    .dropna()  # drops any row still missing a value in a remaining column
    .reset_index(drop=True)
)
names_df = names[['nconst', 'primaryName']]
# Default (inner) merge: credits whose nconst is absent from names are dropped.
principals_new = principals_new.merge(names_df, on='nconst')
principals_new

Unnamed: 0,tconst,ordering,nconst,category,characters,primaryName
0,tt0000009,1,nm0063086,actress,Miss Geraldine Holbrook Miss Jerry,Blanche Bayliss
1,tt0000009,2,nm0183823,actor,Mr Hamilton,William Courtenay
2,tt0000009,3,nm1309758,actor,Chauncey Depew the Director of the New York C...,Chauncey Depew
3,tt0000574,1,nm0846887,actress,Kate Kelly,Elizabeth Tait
4,tt0000574,2,nm0846894,actor,School Master,John Tait
...,...,...,...,...,...,...
2170386,tt9916538,7,nm2999289,actor,Parjo,Ence Bagus
2170387,tt9916538,8,nm4774809,actor,Mbah Jiwo,T.M. Tarsan
2170388,tt9916538,9,nm3369416,actress,Nenek,Yati Pesek
2170389,tt9916538,10,nm3370435,actor,Brel,Marwoto


In [6]:
# Display the merged row(s) for a single title id.
basics_merged.loc[basics_merged['tconst'].eq('tt0111161')]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
62848,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994.0,,142,Drama,9.3,3038716,nm0001104,"nm0000175,nm0001104"


In [7]:
def get_actors(tconst, cast_df=None):
    """Describe the cast of one title as a single quoted string.

    Args:
        tconst: Title id to look up in the ``tconst`` column.
        cast_df: DataFrame with ``tconst``, ``characters``, ``category`` and
            ``primaryName`` columns. Defaults to the notebook-level
            ``principals_new``, looked up when the function is called.

    Returns:
        str: ``'"' + entries joined by ", " + '"'`` where each entry reads
        ``Character "<characters>" played by <category> "<primaryName>"``.
        A title with no matching rows yields ``'""'``.
    """
    if cast_df is None:
        cast_df = principals_new
    cast = cast_df[cast_df['tconst'] == tconst]
    # Values are unpacked into plain names so the f-string never reuses its own
    # quote character inside {} (a SyntaxError before Python 3.12).
    actDetails = [
        f'Character "{character}" played by {category} "{performer}"'
        for character, category, performer in zip(
            cast['characters'], cast['category'], cast['primaryName']
        )
    ]
    return '"' + ", ".join(actDetails) + '"'

def get_names(ids, name_df):
    """Resolve a comma-separated string of ``nconst`` ids to a list of names.

    Args:
        ids: Comma-separated ids, or a null value.
        name_df: DataFrame with ``nconst`` and ``primaryName`` columns.

    Returns:
        list: Non-null ``primaryName`` values of the matching rows, in the
        row order of ``name_df``; an empty list when ``ids`` is null.
    """
    if pd.isnull(ids):
        return []
    wanted = ids.split(',')
    matched = name_df.loc[name_df['nconst'].isin(wanted), 'primaryName']
    return matched.dropna().tolist()

In [15]:
import pandas as pd

def get_names(ids, name_df):
    """Resolve a comma-separated string of ``nconst`` ids to one quoted string.

    Args:
        ids: Comma-separated ids, or a null value.
        name_df: DataFrame with ``nconst`` and ``primaryName`` columns.

    Returns:
        str: The non-null ``primaryName`` values of the matching rows (in the
        row order of ``name_df``) joined by ", " and wrapped in double quotes,
        or ``"data not available"`` when ``ids`` is null.
    """
    if pd.isnull(ids):
        return "data not available"
    wanted = ids.split(',')
    matched = name_df.loc[name_df['nconst'].isin(wanted), 'primaryName']
    joined = ", ".join(matched.dropna().tolist())
    return "".join(['"', joined, '"'])

def get_actors(tconst, cast_df=None):
    """Describe the cast of one title as a single quoted string.

    Args:
        tconst: Title id to look up in the ``tconst`` column.
        cast_df: DataFrame with ``tconst``, ``characters``, ``category`` and
            ``primaryName`` columns. Defaults to the notebook-level
            ``principals_new``, looked up when the function is called.

    Returns:
        str: ``'"' + entries joined by ", " + '"'`` where each entry reads
        ``Character "<characters>" played by <category> "<primaryName>"``.
        A title with no matching rows yields ``'""'``.
    """
    if cast_df is None:
        cast_df = principals_new
    cast = cast_df[cast_df['tconst'] == tconst]
    # Values are unpacked into plain names so the f-string never reuses its own
    # quote character inside {} (a SyntaxError before Python 3.12).
    actDetails = [
        f'Character "{character}" played by {category} "{performer}"'
        for character, category, performer in zip(
            cast['characters'], cast['category'], cast['primaryName']
        )
    ]
    return '"' + ", ".join(actDetails) + '"'

def _is_present(value):
    """Return True when ``value`` carries real data worth printing."""
    if isinstance(value, str):
        # '""' is what the name/cast helpers return when nothing matched.
        return value not in ("", '""')
    if isinstance(value, (list, tuple)):
        return len(value) > 0
    # NaN is truthy in Python, so missing cells need an explicit null check.
    return bool(pd.notna(value))


def build_movie_records(min_votes=1000000, top_n=500):
    """Build one descriptive text per top-rated movie.

    Reads the notebook-level ``basics_merged`` and ``names`` frames and calls
    ``get_names`` / ``get_actors`` for the crew and cast text.

    Args:
        min_votes: Minimum ``numVotes`` a movie needs to be considered.
        top_n: Number of movies kept after sorting by ``averageRating``
            in descending order.

    Returns:
        dict: ``tconst`` -> description string. Clauses whose underlying value
        is missing (NaN, empty, or an empty quoted list) are left out.
    """
    top_movies = (
        basics_merged[basics_merged['numVotes'] >= min_votes]
        .sort_values('averageRating', ascending=False)
        .head(top_n)
    )
    final_data = {}
    for _, row in top_movies.iterrows():
        tconst = row["tconst"]
        title = row["primaryTitle"]
        year = int(row["startYear"]) if pd.notnull(row["startYear"]) else 'not available'
        genres = row["genres"]
        averageRating = row["averageRating"]
        numVotes = row["numVotes"]
        directors = get_names(row["directors"], names)
        writers = get_names(row["writers"], names)
        runtime = row['runtimeMinutes']
        topActors = get_actors(tconst)

        parts = [f'Movie "{title}" released in year ({year})']
        if _is_present(genres):
            parts.append(f"is a {genres} film")
        if _is_present(directors):
            parts.append(f"directed by {directors}")
        if _is_present(writers):
            parts.append(f"written by {writers}")
        if _is_present(runtime):
            parts.append(f"with a runtime of {runtime} minutes")
        if _is_present(averageRating):
            parts.append(f"rated {averageRating}/10 on IMDb")
        if _is_present(numVotes):
            parts.append(f"based on {numVotes} votes.")
        if _is_present(topActors):
            parts.append(f"The top casting is as follow : {topActors}")

        final_data[tconst] = ", ".join(parts)

    return final_data


In [16]:
# Build the id -> description mapping, turn it into a two-column frame and
# write it to movies.csv without the index column.
records = build_movie_records()
df = pd.DataFrame(list(records.items()), columns=["mov_id", "mov_details"])
df.to_csv(path_or_buf='movies.csv', index=False)

In [18]:
# Read the exported CSV back from disk.
mymovies = pd.read_csv(filepath_or_buffer='movies.csv')


In [20]:
# Show the description column as a raw array.
mymovies.loc[:, 'mov_details'].values

array(['Movie "The Shawshank Redemption" released in year (1994), is a Drama film, directed by "Frank Darabont", written by "Stephen King, Frank Darabont", with a runtime of 142 minutes, rated 9.3/10 on IMDb, based on 3038716 votes., The top casting is as follow : "Character "Andy Dufresne" played by actor "Tim Robbins", Character "Ellis Boyd Red Redding" played by actor "Morgan Freeman", Character "Warden Norton" played by actor "Bob Gunton", Character "Heywood" played by actor "William Sadler", Character "Captain Hadley" played by actor "Clancy Brown", Character "Tommy" played by actor "Gil Bellows", Character "Bogs Diamond" played by actor "Mark Rolston", Character "Brooks Hatlen" played by actor "James Whitmore", Character "1946 DA" played by actor "Jeffrey DeMunn", Character "Skeet" played by actor "Larry Brandenburg""',
       'Movie "The Godfather" released in year (1972), is a Crime,Drama film, directed by "Francis Ford Coppola", written by "Francis Ford Coppola, Mario Puzo", w