We loaded the movie data that we previously retrieved from TMDB.com.

In [171]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Read the movie CSV into a DataFrame and preview its first rows.
# NOTE(review): the recorded cell output echoes the read_csv line, which looks
# like a pandas warning (possibly mixed-type columns) — confirm, and consider
# passing explicit dtypes once the expected schema is known.
movie_data_path = Path("source_data/movie_data.csv")
movie_data_df = pd.read_csv(filepath_or_buffer=movie_data_path)
movie_data_df.head(n=5)


  movie_data_df = pd.read_csv(movie_data_path)


Unnamed: 0.1,Unnamed: 0,adult,backdrop_path,belongs_to_collection_id,belongs_to_collection_name,belongs_to_collection_poster_path,belongs_to_collection_backdrop_path,budget,genres_0_id,genres_0_name,...,production_companies_19_id,production_companies_19_logo_path,production_companies_19_name,production_companies_19_origin_country,production_companies_20_id,production_companies_20_logo_path,production_companies_20_name,production_companies_20_origin_country,origin_country_6,origin_country_7
0,0,False,/417tYZ4XUyJrtyZXj7HpvWf1E8f.jpg,1370345.0,The Wild Robot Collection,/cioNnsPSHJH9gsUSETPFHh0m6MT.jpg,/f6G8QPeod5ngQMs5Fe1O4LdphB7.jpg,78000000,16.0,Animation,...,,,,,,,,,,
1,1,False,/3V4kLQg0kSqPLctI5ziYWabAZYF.jpg,558216.0,Venom Collection,/hoTLlTIohrzQ13HQVkZrDlvffuT.jpg,/vq340s8DxA5Q209FT8PHA6CXYOx.jpg,120000000,28.0,Action,...,,,,,,,,,,
2,2,False,/9SSEUrSqhljBMzRe4aBTh17rUaC.jpg,,,,,80000000,878.0,Science Fiction,...,,,,,,,,,,
3,3,False,/7h6TqPB3ESmjuVbxCxAeB1c9OB1.jpg,,,,,17500000,27.0,Horror,...,,,,,,,,,,
4,4,False,/xlkclSE4aq7r3JsFIJRgs21zUew.jpg,727761.0,Terrifier Collection,/4xIzrMcEvCzJm5qAl92WMHLSIeM.jpg,/zREjCmCHIHdEF6ufPoDQjhl4Wdm.jpg,2000000,27.0,Horror,...,,,,,,,,,,


We cleaned the data: we kept only the columns we needed, one-hot encoded the categorical genre columns, dropped the encoded columns we did not want, and removed rows with missing values.

In [172]:
# Keep only the columns used downstream: five numeric features plus the
# first three genre slots.
movie_data_df = movie_data_df[['budget', 'revenue', 'runtime', 'popularity', 'vote_average', 'genres_0_name', 'genres_1_name', 'genres_2_name']]

# One-hot encode the categorical genre columns
ohe = OneHotEncoder()

categorical_cols = ['genres_0_name', 'genres_1_name', 'genres_2_name']
encoded_genres = ohe.fit_transform(movie_data_df[categorical_cols]).toarray()

# Build the encoded frame on the SAME index as the source rows, so the
# axis=1 concat below aligns row-for-row explicitly instead of relying on
# both frames happening to share a default RangeIndex.
encoded_genres_df = pd.DataFrame(
    encoded_genres,
    columns=ohe.get_feature_names_out(categorical_cols),
    index=movie_data_df.index,
)

# Replace the original categorical columns with their encoded counterparts
movie_data_df = pd.concat([movie_data_df.drop(columns=categorical_cols), encoded_genres_df], axis=1)

# Remove encoded columns that do not represent a genre: the missing-value
# indicator for the third slot and two date-like categories (presumably from
# malformed rows where a date landed in the genre field — TODO confirm).
# errors='ignore' keeps the cell runnable when a refreshed export no longer
# produces these exact categories.
# NOTE(review): a 'genres_1_name_nan' column, if produced, is kept — confirm
# that is intentional.
movie_data_df = movie_data_df.drop(
    columns=['genres_2_name_nan', 'genres_0_name_2016-09-01', 'genres_0_name_2024-10-26'],
    errors='ignore',
)

# Discard rows that still have missing values in any remaining column
movie_data_df = movie_data_df.dropna()

# Preview the cleaned frame; as the last expression it is actually displayed
movie_data_df.head()



We calculated the profit and ROI columns and identified the 75th-percentile thresholds for vote average and popularity.

In [175]:
# Create the profit and ROI columns.
# Convert budget/revenue to numeric once, up front; the stored columns
# themselves are left untouched.
budget = pd.to_numeric(movie_data_df['budget'])
revenue = pd.to_numeric(movie_data_df['revenue'])

# Build the result as one chain of new frames. Assigning a column onto a
# boolean-filtered slice (and sorting it in place) is the pattern that
# triggers pandas' SettingWithCopyWarning; .assign/.loc/.sort_values each
# return a fresh frame instead.
movie_data_df = (
    movie_data_df
    .assign(profit=revenue - budget)
    # ROI divides by budget, so only rows with a positive budget are kept
    .loc[budget > 0]
    .assign(roi=lambda df: df['profit'] / pd.to_numeric(df['budget']))
    # Sort the data by profit, vote_average and popularity (all descending)
    .sort_values(by=['profit', 'vote_average', 'popularity'], ascending=False)
)

# Store threshold values (75th percentile of each metric).
# NOTE(review): "theshold" is a misspelling of "threshold"; the names are kept
# unchanged because cells outside this view may reference them.
vote_average_theshold = movie_data_df['vote_average'].quantile(0.75)
popularity_theshold = movie_data_df['popularity'].quantile(0.75)

movie_data_df.head()


Unnamed: 0,budget,revenue,runtime,popularity,vote_average,genres_0_name_Action,genres_0_name_Adventure,genres_0_name_Animation,genres_0_name_Comedy,genres_0_name_Crime,...,genres_2_name_Music,genres_2_name_Mystery,genres_2_name_Romance,genres_2_name_Science Fiction,genres_2_name_TV Movie,genres_2_name_Thriller,genres_2_name_War,genres_2_name_Western,profit,roi
322,237000000,2923706000.0,162.0,129.244,7.583,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2686706000.0,11.336312
275,356000000,2799439000.0,181.0,141.577,8.25,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2443439000.0,6.863593
199,200000000,2264162000.0,194.0,166.361,7.906,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2064162000.0,10.320812
250,460000000,2320250000.0,192.0,148.255,7.62,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1860250000.0,4.044022
1205,245000000,2068224000.0,136.0,64.763,7.3,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1823224000.0,7.441729


Classification - Define success
Turn target into binary - use model to figure it out


We split the data into training and testing sets and saved each set to a CSV file.

In [174]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
movie_data_train, movie_data_test = train_test_split(
    movie_data_df,
    test_size=0.2,
    random_state=42,
)

# Write both partitions to CSV without the DataFrame index
movie_data_train.to_csv("source_data/movie_data_train.csv", index=False)
movie_data_test.to_csv("source_data/movie_data_test.csv", index=False)
