In [1]:
import pandas as pd
import csv

# --- Movies: load title.basics, keep feature films with a title and a year ---
# The IMDb TSV dumps are UTF-8. Decoding them as latin1 never raises, but it
# turns every non-ASCII title into mojibake that is then written to the CSV.
movies = pd.read_csv(
    "title.basics (1).tsv.gz",
    sep="\t",
    na_values='\\N',          # IMDb marks missing values with a literal \N
    encoding='utf-8',
    compression='gzip',
    quoting=csv.QUOTE_NONE,   # titles may contain bare double quotes
    on_bad_lines='warn'       # still skip malformed rows, but report them
)

movies = (
    movies.loc[movies['titleType'] == 'movie']
    # A row is only usable downstream if it has both a title and a year.
    .dropna(subset=['primaryTitle', 'startYear'])
    .rename(columns={
        'tconst': 'movie_id',
        'primaryTitle': 'title',
        'startYear': 'release_year',
    })
    # Safe to cast: rows with a missing year were dropped above.
    .astype({'release_year': int})
)

movies.to_csv("movies_cleaned.csv", index=False)
print("✅ Saved: movies_cleaned.csv")


# --- Ratings: load title.ratings and normalise column names and dtypes ---
ratings = pd.read_csv(
    "title.ratings.tsv.gz",
    sep="\t",
    encoding='utf-8',         # IMDb dumps are UTF-8; kept consistent across loads
    compression='gzip',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='warn'       # still skip malformed rows, but report them
)

ratings = (
    ratings
    .rename(columns={
        'tconst': 'movie_id',
        'averageRating': 'rating',
        'numVotes': 'num_votes',
    })
    # The casts double as validation: a non-numeric value raises here
    # instead of leaking into the exported CSV.
    .astype({'rating': float, 'num_votes': int})
)

ratings.to_csv("ratings_cleaned.csv", index=False)
print("✅ Saved: ratings_cleaned.csv")


# --- People: load name.basics and export id / name / birth year ---
# The IMDb TSV dumps are UTF-8. Decoding them as latin1 never raises, but it
# corrupts every accented or non-Latin name before it is written to the CSV.
names = pd.read_csv(
    "name.basics.tsv.gz",
    sep="\t",
    na_values='\\N',          # IMDb marks missing values with a literal \N
    encoding='utf-8',
    compression='gzip',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='warn'       # still skip malformed rows, but report them
)

# NOTE(review): no profession filter is applied here, so this table holds every
# person with a known birth year, not only directors. Confirm whether the
# downstream join is expected to do the narrowing.
directors = (
    names[['nconst', 'primaryName', 'birthYear']]
    .rename(columns={
        'nconst': 'director_id',
        'primaryName': 'director_name',
        'birthYear': 'birth_year',
    })
    # People without a birth year are dropped, which makes the int cast safe.
    .dropna(subset=['birth_year'])
    .astype({'birth_year': int})
)

directors.to_csv("directors_cleaned.csv", index=False)
print("✅ Saved: directors_cleaned.csv")


✅ Saved: movies_cleaned.csv
✅ Saved: ratings_cleaned.csv
✅ Saved: directors_cleaned.csv
