In [23]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import ast

# Load the raw metadata. low_memory=False avoids chunked parsing and the
# mixed-dtype column inference that can come with it.
movies = pd.read_csv('movies_metadata.csv', low_memory=False)

# Drop columns that are not used by the rest of this cell.
movies.drop(columns=['homepage', 'tagline'], inplace=True)

# Missing values in the numeric columns are replaced with 0 rather than
# dropping incomplete rows (a blanket movies.dropna() is left disabled).
for column in ('budget', 'revenue', 'runtime', 'vote_average', 'vote_count'):
    movies[column] = movies[column].fillna(0)

# Strip '$' and ',' from budget strings, then convert to numbers; anything
# that still is not numeric becomes NaN because of errors='coerce'.
# The pattern is a raw string: '\$' in a normal literal is an invalid escape
# sequence and triggers a SyntaxWarning on recent Python versions.
movies['budget'] = pd.to_numeric(
    movies['budget'].replace(r'[\$,]', '', regex=True),
    errors='coerce',
)
movies['popularity'] = pd.to_numeric(movies['popularity'], errors='coerce')

# Second fill: replaces the NaN values that to_numeric just produced for
# unparseable entries (and any popularity values that were missing).
for column in ('budget', 'popularity'):
    movies[column] = movies[column].fillna(0)

# Convert JSON-like strings to Python objects and keep only the names.
def _parse_literal(value, default):
    """Parse a Python-literal string; return *default* if it cannot be parsed.

    Non-string values (e.g. NaN for a missing cell) also yield *default*.
    """
    if not isinstance(value, str):
        return default
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # A single malformed cell should not abort the whole cleanup.
        return default


def _collection_name(value):
    """Return the 'name' of a collection dict literal, or '' when unavailable."""
    parsed = _parse_literal(value, {})
    return parsed.get('name', '') if isinstance(parsed, dict) else ''


def _entry_names(value):
    """Return the 'name' of each dict in a list literal, or [] when unavailable."""
    parsed = _parse_literal(value, [])
    if not isinstance(parsed, list):
        # Literals that parse to something else (e.g. a bool or a number).
        return []
    # Entries that are not dicts or carry no 'name' key are skipped.
    return [
        item['name']
        for item in parsed
        if isinstance(item, dict) and 'name' in item
    ]


# Extract relevant information: a single name for the collection, a list of
# names for the multi-valued columns.
movies['belongs_to_collection'] = movies['belongs_to_collection'].apply(_collection_name)
for column in ('production_companies', 'production_countries', 'spoken_languages'):
    movies[column] = movies[column].apply(_entry_names)

### Optional: normalize the numeric values (disabled; uncomment the lines below to enable)
# Select numeric columns for normalization
#numeric_columns = ['budget', 'revenue', 'runtime', 'popularity', 'vote_average', 'vote_count']
# check the column data types
#print(movies[numeric_columns].info())
#print(movies[numeric_columns].describe())

# Initialize the scaler
#scaler = MinMaxScaler()
# Apply normalization
#movies[numeric_columns] = scaler.fit_transform(movies[numeric_columns])

# Extract the 'name' values from a JSON-like list of dicts
def extract_names(obj):
    """Return the ``'name'`` value of every dict in a list literal string.

    Args:
        obj: A string holding a Python literal such as
            ``"[{'id': 18, 'name': 'Drama'}]"``. Any other value (for
            example NaN for a missing cell) is treated as "no names".

    Returns:
        A list of the ``'name'`` values in their original order. An empty
        list is returned when ``obj`` is not a string, cannot be parsed, or
        does not parse to a list/tuple. Entries that are not dicts or have
        no ``'name'`` key are skipped instead of raising.
    """
    if not isinstance(obj, str):
        return []
    try:
        parsed = ast.literal_eval(obj)
    except (ValueError, SyntaxError, TypeError):
        return []
    if not isinstance(parsed, (list, tuple)):
        # e.g. a bare number or a single dict: nothing to iterate over safely.
        return []
    return [d['name'] for d in parsed if isinstance(d, dict) and 'name' in d]
# Replace each raw genre string with the list of genre names it contains.
genre_names = movies['genres'].apply(extract_names)
movies['genres'] = genre_names

# Persist the cleaned table; the DataFrame index is not written to the file.
movies.to_csv('movies_metadata_v2.csv', index=False)

# Print two randomly sampled rows as a quick visual check.
print(movies.sample(n=2))

       adult belongs_to_collection  budget  \
6630   False                         118.0   
27171  False                           0.0   

                                    genres     id    imdb_id  \
6630              [Comedy, Drama, Romance]  37672  tt0078199   
27171  [Action, Science Fiction, Thriller]  92371  tt0105178   

      original_language         original_title  \
6630                 en   Same Time, Next Year   
27171                en  Project: Shadowchaser   

                                                overview  popularity  ...  \
6630   A man and woman meet by chance at a romantic i...    2.103581  ...   
27171  Set in the future, a group of terrorists with ...    0.217767  ...   

                     production_countries release_date revenue runtime  \
6630           [United States of America]   1978-04-23    19.0   119.0   
27171  [Canada, United States of America]   1992-07-02     0.0    97.0   

       spoken_languages    status                  title  vide