# TMDB Data Preprocessing

## Import Requiremets

#### Libraries

In [1]:
import os
import shutil
from kagglehub import dataset_download
import subprocess

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

  from .autonotebook import tqdm as notebook_tqdm


#### Download Dataset

In [2]:
# List of dataset paths on Kaggle
paths = [
    "ursmaheshj/iso-639-1-language-codes",
    "ashirwadsangwan/imdb-dataset",
    "asaniczka/tmdb-movies-dataset-2023-930k-movies"
]

# Destination path where datasets will be stored
destination_path = "../data/raw/"

# Ensure the destination directory exists
os.makedirs(destination_path, exist_ok=True)

for path in paths:
    # Download the dataset
    source = dataset_download(path)
    
    # Move files or directories to the destination
    if os.path.isdir(source):
        # Move all files in the directory
        for filename in os.listdir(source):
            file_path = os.path.join(source, filename)
            dest_file_path = os.path.join(destination_path, filename)
            # Overwrite if file exists
            if os.path.exists(dest_file_path):
                os.remove(dest_file_path)
            shutil.move(file_path, destination_path)
        print(f"Moved all files from {source} to {destination_path}")
    else:
        # Move a single file
        dest_file_path = os.path.join(destination_path, os.path.basename(source))
        # Overwrite if file exists
        if os.path.exists(dest_file_path):
            os.remove(dest_file_path)
        shutil.move(source, destination_path)
        print(f"Moved {source} to {destination_path}")

Moved all files from C:\Users\osman\.cache\kagglehub\datasets\ursmaheshj\iso-639-1-language-codes\versions\3 to ../data/raw/
Moved all files from C:\Users\osman\.cache\kagglehub\datasets\ashirwadsangwan\imdb-dataset\versions\817 to ../data/raw/
Moved all files from C:\Users\osman\.cache\kagglehub\datasets\asaniczka\tmdb-movies-dataset-2023-930k-movies\versions\444 to ../data/raw/


## Exploratory Data Analysis (EDA)

In [3]:
data = pd.read_csv("../data/raw/TMDB_movie_dataset_v11.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157425 entries, 0 to 1157424
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1157425 non-null  int64  
 1   title                 1157412 non-null  object 
 2   vote_average          1157425 non-null  float64
 3   vote_count            1157425 non-null  int64  
 4   status                1157425 non-null  object 
 5   release_date          961571 non-null   object 
 6   revenue               1157425 non-null  int64  
 7   runtime               1157425 non-null  int64  
 8   adult                 1157425 non-null  bool   
 9   backdrop_path         305480 non-null   object 
 10  budget                1157425 non-null  int64  
 11  homepage              122170 non-null   object 
 12  imdb_id               607691 non-null   object 
 13  original_language     1157425 non-null  object 
 14  original_title        1157412 non-

In [5]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,1157425.0,752508.680656,401988.3,2.0,417128.0,758132.0,1110380.0,1413208.0
vote_average,1157425.0,1.850443,3.008217,0.0,0.0,0.0,4.6,10.0
vote_count,1157425.0,18.532126,315.6474,0.0,0.0,0.0,1.0,34495.0
revenue,1157425.0,690134.044259,18537820.0,-12.0,0.0,0.0,0.0,5000000000.0
runtime,1157425.0,47.456232,61.65602,-28.0,0.0,22.0,88.0,14400.0
budget,1157425.0,265713.816114,5076877.0,0.0,0.0,0.0,0.0,1000000000.0
popularity,1157425.0,1.221146,7.519671,0.0,0.6,0.6,0.874,2994.357


### Visualization

#### Genre Distribution Visualization

#### Revenue vs Budget Relationship

#### Popularity Over Time

## Handle Miss Values

### Handle Missing Values in Genres

In [24]:
data["genres"] = data["genres"].str.strip()
data.loc[(data["genres"] == "") | 
         (data["genres"] == "\\N") | 
         (data["genres"] == "N/A") | 
         (data["genres"] == "null"), "genres"] = np.nan

In [7]:
befor_missing = data["genres"].isna().sum()
befor_missing

np.int64(467769)

In [8]:
imdb_data = pd.read_csv("../data/raw/title.basics.tsv", sep="\t")

  imdb_data = pd.read_csv("../data/raw/title.basics.tsv", sep="\t")


In [9]:
imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11348647 entries, 0 to 11348646
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 779.2+ MB


In [10]:
# Step 1: Identify rows with missing genres but existing imdb_id
missing_genres = data[(data["genres"].isna()) & (data["imdb_id"].notna())]

# Step 2: Extract imdb_ids that need genre data
imdb_ids_to_fill = missing_genres["imdb_id"].tolist()

# Step 3: Retrieve corresponding genres from imdb_data
key = imdb_data[imdb_data["tconst"].isin(imdb_ids_to_fill)][["tconst", "genres"]]

# Step 4: Merge eda.data with key on imdb_id and tconst
merged_df = data.merge(key, left_on="imdb_id", right_on="tconst", how="left")

# Step 5: Fill missing genres in genres_y with genres_x
merged_df["genres_y"] = merged_df["genres_y"].fillna(merged_df["genres_x"])

# Step 6: Update the original genres column with filled data
data["genres"] = merged_df["genres_y"]

In [11]:
after_missing = data["genres"].isna().sum()
after_missing

np.int64(315234)

In [12]:
befor_missing - after_missing # 152535 veri güvenilir bir kaynak olan imdb veri setinden dolduruldu

np.int64(152535)

In [13]:
copy_data = data.copy()

In [14]:
def fill_missing_genres_with_supervised_learning(data):
    # Handle missing values
    data['title'] = data['title'].fillna('')
    data['overview'] = data['overview'].fillna('')
    data['tagline'] = data['tagline'].fillna('')
    data['combined_features'] = data['title'] + ' ' + data['overview'] + ' ' + data['tagline']
    
    # Split data into records with known and missing genres
    missing_genres = data[data['genres'].isna()]
    known_genres = data[~data['genres'].isna()]
    
    # Ensure 'genres' column is of string type, convert NaN to empty strings
    known_genres['genres'] = known_genres['genres'].astype(str)
    
    # Split by comma and strip whitespace, remove empty strings
    known_genres['genres'] = known_genres['genres'].apply(
        lambda x: [genre.strip() for genre in x.split(',') if genre.strip()]
    )
    
    # Encode genres using MultiLabelBinarizer
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    genre_matrix = mlb.fit_transform(known_genres['genres'])
    
    # Split known_genres into train and validation sets
    from sklearn.model_selection import train_test_split
    X_train_text, X_val_text, y_train, y_val = train_test_split(
        known_genres['combined_features'],
        genre_matrix,
        test_size=0.2,
        random_state=42
    )
    
    # Define the pipeline
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.multioutput import MultiOutputClassifier
    from sklearn.linear_model import LogisticRegression
    
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 1))),
        ('svd', TruncatedSVD(n_components=50)),
        ('clf', MultiOutputClassifier(LogisticRegression(max_iter=1000, solver='lbfgs', C=1.0)))
    ])
    
    # Specify a more limited parameter grid
    param_grid = {
        'tfidf__max_features': [5000],
        'tfidf__ngram_range': [(1, 1)],
        'svd__n_components': [50],
        'clf__estimator__C': [1.0],
        'clf__estimator__penalty': ['l2'],
        'clf__estimator__solver': ['lbfgs']
    }
    
    # Perform Grid Search with fewer parameters
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
    grid_search.fit(X_train_text, y_train)
    
    # Evaluate the best model
    best_model = grid_search.best_estimator_
    y_pred_val = best_model.predict(X_val_text)
    print(classification_report(y_val, y_pred_val))
    
    # Prepare missing genres data
    X_missing = best_model.named_steps['tfidf'].transform(missing_genres['combined_features'])
    X_missing = best_model.named_steps['svd'].transform(X_missing)
    y_pred_missing = best_model.named_steps['clf'].predict(X_missing)
    
    # Convert predictions back to genre labels
    predicted_genres = mlb.inverse_transform(y_pred_missing)
    
    # Assign predicted genres back to the original dataset
    data.loc[missing_genres.index, 'genres'] = [','.join(genres) for genres in predicted_genres]

In [21]:
fill_missing_genres_with_supervised_learning(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known_genres['genres'] = known_genres['genres'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  known_genres['genres'] = known_genres['genres'].apply(


Fitting 3 folds for each of 1 candidates, totalling 3 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.83      0.00      0.00     10527
           1       0.86      0.54      0.66      6556
           2       1.00      0.00      0.00      5581
           3       0.61      0.02      0.05     12713
           4       0.00      0.00      0.00       603
           5       0.53      0.03      0.05     33263
           6       0.43      0.00      0.00      7549
           7       0.76      0.54      0.63     45614
           8       0.62      0.36      0.46     58111
           9       0.37      0.00      0.00      6424
          10       0.00      0.00      0.00      5366
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         8
          13       0.28      0.00      0.01      3845
          14       0.43      0.01      0.01     12158
          15       0.84      0.30      0.45     12281
          16       0.00      0.00      0.00       420
          17       0.00    

In [25]:
data["genres"].isna().sum()

np.int64(218400)

In [23]:
data.genres.value_counts()[:58]

genres
                          218400
Documentary               194697
Drama                     143738
Comedy                     68614
Adult                      41750
Music                      33847
Animation                  32821
Horror                     24269
Short                      12843
Drama, Romance             10709
Drama,Short                10471
Action                     10230
Comedy, Drama               9590
Romance                     9003
Thriller                    8766
Drama, Comedy               6272
Documentary,Short           6072
Comedy, Romance             5542
Documentary, Music          5516
Western                     4919
Comedy,Short                4889
Crime                       4601
Romance, Drama              4342
Family                      4123
Drama, Thriller             3880
Drama, TV Movie             3628
Science Fiction             3619
Horror, Thriller            3558
Crime, Drama                3355
Fantasy                     3201
Mus

#### Filling Missing Genres by Imdb Dataset

### other

## Handle Outliers