In [154]:
import os
import shutil
from kagglehub import dataset_download
import pandas as pd
import numpy as np

In [155]:
# List of dataset paths on Kaggle
paths = [
    "asaniczka/tmdb-movies-dataset-2023-930k-movies"
]

# Destination path where datasets will be stored
destination_path = "../data/raw/"

# Ensure the destination directory exists
os.makedirs(destination_path, exist_ok=True)

for path in paths:
    # Download the dataset
    source = dataset_download(path)
    
    # Move files or directories to the destination
    if os.path.isdir(source):
        # Move all files in the directory
        for filename in os.listdir(source):
            file_path = os.path.join(source, filename)
            dest_file_path = os.path.join(destination_path, filename)
            # Overwrite if file exists
            if os.path.exists(dest_file_path):
                os.remove(dest_file_path)
            shutil.move(file_path, destination_path)
        print(f"Moved all files from {source} to {destination_path}")
    else:
        # Move a single file
        dest_file_path = os.path.join(destination_path, os.path.basename(source))
        # Overwrite if file exists
        if os.path.exists(dest_file_path):
            os.remove(dest_file_path)
        shutil.move(source, destination_path)
        print(f"Moved {source} to {destination_path}")

Moved all files from C:\Users\osman\.cache\kagglehub\datasets\asaniczka\tmdb-movies-dataset-2023-930k-movies\versions\444 to ../data/raw/


In [156]:
'''
tmdb_data = pd.read_csv("../data/raw/TMDB_movie_dataset_v11.csv")

imdb_data = pd.read_csv("../data/raw/title.basics.tsv", sep="\t", low_memory= False)
'''
imdb_rating = pd.read_csv("../data/raw/title.ratings.tsv", sep="\t")

In [157]:
'''
cpy_tmdb_data = tmdb_data.copy()
cpy_imdb_data = imdb_data.copy()
'''

'\ncpy_tmdb_data = tmdb_data.copy()\ncpy_imdb_data = imdb_data.copy()\n'

In [158]:
tmdb_data = cpy_tmdb_data
imdb_data = cpy_imdb_data

In [159]:
tmdb_data.sample()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
272907,1199231,Werewolf By Night In Color,8.0,1,Released,2023-10-20,0,0,False,,...,Werewolf By Night In Color,,0.0,/6xjL2AxQ9KaLGxyDfSGr9bSWRkF.jpg,,"Action, Horror, Fantasy","Marvel Studios, Kevin Feige Productions",United States of America,English,


In [160]:
tmdb_data.replace({"\\N": np.nan, "": np.nan}, inplace=True)

In [161]:
tmdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157425 entries, 0 to 1157424
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1157425 non-null  int64  
 1   title                 1157412 non-null  object 
 2   vote_average          1157425 non-null  float64
 3   vote_count            1157425 non-null  int64  
 4   status                1157425 non-null  object 
 5   release_date          961571 non-null   object 
 6   revenue               1157425 non-null  int64  
 7   runtime               1157425 non-null  int64  
 8   adult                 1157425 non-null  bool   
 9   backdrop_path         305480 non-null   object 
 10  budget                1157425 non-null  int64  
 11  homepage              122170 non-null   object 
 12  imdb_id               607691 non-null   object 
 13  original_language     1157425 non-null  object 
 14  original_title        1157412 non-

In [162]:
imdb_data.sample()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,imdb_averageRating,imdb_numVotes
6889934,tt28625325,tvEpisode,Episode #1.508,Episode #1.508,0,2023,,,Drama,0.0,0.0


In [163]:
imdb_data.replace({"\\N": np.nan, "": np.nan}, inplace=True)

In [164]:
imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11348647 entries, 0 to 11348646
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   tconst              object 
 1   titleType           object 
 2   primaryTitle        object 
 3   originalTitle       object 
 4   isAdult             object 
 5   startYear           object 
 6   endYear             object 
 7   runtimeMinutes      object 
 8   genres              object 
 9   imdb_averageRating  float64
 10  imdb_numVotes       float64
dtypes: float64(2), object(9)
memory usage: 952.4+ MB


In [165]:
imdb_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1519067 entries, 0 to 1519066
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1519067 non-null  object 
 1   averageRating  1519067 non-null  float64
 2   numVotes       1519067 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 34.8+ MB


### Merge

In [166]:
imdb_data.loc[imdb_data["tconst"].isin(imdb_rating["tconst"]), "imdb_averageRating"] = imdb_rating["averageRating"]
imdb_data.loc[imdb_data["tconst"].isin(imdb_rating["tconst"]), "imdb_numVotes"] = imdb_rating["numVotes"]

In [167]:
imdb_data.loc[imdb_data["imdb_numVotes"].isna(), "imdb_numVotes"] = 0
imdb_data.loc[imdb_data["imdb_averageRating"].isna(), "imdb_averageRating"] = 0

In [168]:
merged_df = tmdb_data.merge(imdb_data, 
                          left_on='imdb_id', 
                          right_on='tconst', 
                          how='left', 
                          suffixes=('_tmdb', '_imdb'))



In [169]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157425 entries, 0 to 1157424
Data columns (total 35 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1157425 non-null  int64  
 1   title                 1157412 non-null  object 
 2   vote_average          1157425 non-null  float64
 3   vote_count            1157425 non-null  int64  
 4   status                1157425 non-null  object 
 5   release_date          961571 non-null   object 
 6   revenue               1157425 non-null  int64  
 7   runtime               1157425 non-null  int64  
 8   adult                 1157425 non-null  bool   
 9   backdrop_path         305480 non-null   object 
 10  budget                1157425 non-null  int64  
 11  homepage              122170 non-null   object 
 12  imdb_id               607691 non-null   object 
 13  original_language     1157425 non-null  object 
 14  original_title        1157412 non-

In [170]:
merged_df["genres_tmdb"].isna().sum()

np.int64(467769)

In [171]:
merged_df["genres_tmdb"] = merged_df["genres_tmdb"].fillna(merged_df["genres_imdb"])

In [172]:
merged_df["genres_tmdb"].isna().sum()

np.int64(325266)

In [173]:
merged_df["runtime"].isna().sum()

np.int64(0)

In [174]:
merged_df["release_date"].isna().sum()

np.int64(195854)

In [175]:
merged_df.loc[merged_df["release_date"].isna() & merged_df["startYear"].notna(), "release_date"] = merged_df["startYear"] + "-01-01"

In [176]:
merged_df["release_date"].isna().sum()

np.int64(154048)

In [179]:
merged_df["release_date"] = pd.to_datetime(merged_df['release_date'])

In [177]:
merged_df.drop(axis=1, columns=["startYear", "endYear", "original_title", "backdrop_path", "runtimeMinutes", "genres_imdb", "spoken_languages", "homepage", "imdb_id","tconst","primaryTitle", "originalTitle", "isAdult"], inplace=True)

In [190]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 668237 entries, 0 to 1157420
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   id                    668237 non-null  int64         
 1   title                 668237 non-null  object        
 2   vote_average          668237 non-null  float64       
 3   vote_count            668237 non-null  int64         
 4   status                668237 non-null  object        
 5   release_date          668237 non-null  datetime64[ns]
 6   revenue               668237 non-null  int64         
 7   runtime               668237 non-null  int64         
 8   adult                 668237 non-null  bool          
 9   budget                668237 non-null  int64         
 10  original_language     668237 non-null  object        
 11  overview              668237 non-null  object        
 12  popularity            668237 non-null  float64       
 13  pos

In [191]:
merged_df.isna().sum()

id                           0
title                        0
vote_average                 0
vote_count                   0
status                       0
release_date                 0
revenue                      0
runtime                      0
adult                        0
budget                       0
original_language            0
overview                     0
popularity                   0
poster_path             146355
tagline                 536807
genres_tmdb                  0
production_companies    302403
production_countries    209518
keywords                430320
titleType               153371
imdb_averageRating      153371
imdb_numVotes           153371
dtype: int64

In [192]:
merged_df.dropna(subset=["overview", "title", "genres_tmdb", "release_date"], inplace=True)

In [199]:
merged_df["genres"] = merged_df["genres_tmdb"]
merged_df.drop(axis=1,columns=["genres_tmdb"], inplace=True)

In [201]:
merged_df.to_csv("../data/processed/myData.csv", index=False)