# Predict film popularity using machine learning

A machine learning model to predict the popularity of a film based on its characteristics

In [523]:
## import tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay

In [524]:
my_data = pd.read_csv("data/test_kaggle.csv")

## Data exploration
idx_index = my_data.title_index
my_data = my_data.drop("title_index", axis = 1)

In [525]:
%%capture
my_data["country"] = my_data["country"].fillna("Unknown")
my_data["language"] = my_data["language"].fillna("Unknown")
my_data["director"].fillna("Unknown", inplace = True)
my_data["writer"].fillna("Unknown", inplace = True)
my_data["production_company"].fillna("Unknown", inplace = True)
my_data["actors"].fillna("Unknown", inplace = True)
my_data["description"].fillna("Unknown", inplace = True)

# Fix us_gross_income col

In [526]:
my_data['usa_gross_income'] = my_data['usa_gross_income'].str.replace('$', "", regex = False)

In [527]:
index = 0
val = []
for data in my_data['usa_gross_income']:
    #print(data)
    if pd.notnull(data) and 'EUR' in data:
        print(data)
        val.append(index)
    index = index + 1
print(val)
#print(my_data['usa_gross_income'][10998])

[]


In [528]:
%%capture
my_data["usa_gross_income"][10344] = '272254'
my_data["usa_gross_income"][10344]

In [529]:
index = 0
val = []
for data in my_data['usa_gross_income']:
    #print(data)
    if pd.notnull(data) and 'GBP' in data:
        print(data)
        val.append(index)
    index = index + 1
print(val)

[]


In [530]:
%%capture
for index in val:
    my_data['usa_gross_income'][index] = my_data['usa_gross_income'][index].replace("GBP", "")
    my_data['usa_gross_income'][index] = int(my_data['usa_gross_income'][index])*126/100
    print(my_data['usa_gross_income'][index])

In [531]:
my_data['usa_gross_income'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 20000 entries, 0 to 19999
Series name: usa_gross_income
Non-Null Count  Dtype 
--------------  ----- 
3580 non-null   object
dtypes: object(1)
memory usage: 156.4+ KB


In [532]:
my_data['usa_gross_income'] = pd.to_numeric(my_data['usa_gross_income'],errors='coerce' )

In [533]:
mean = my_data['usa_gross_income'].mean()
my_data['usa_gross_income'].fillna(mean, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  my_data['usa_gross_income'].fillna(mean, inplace = True)


# Fix worldwide_gross_income col


In [534]:
my_data['worlwide_gross_income'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 20000 entries, 0 to 19999
Series name: worlwide_gross_income
Non-Null Count  Dtype 
--------------  ----- 
7312 non-null   object
dtypes: object(1)
memory usage: 156.4+ KB


In [535]:
my_data['worlwide_gross_income'] = my_data['worlwide_gross_income'].str.replace('$', "", regex = False)

In [536]:
index = 0
val = []
for data in my_data['worlwide_gross_income']:
    #print(data)
    if type(data) == str and ('NPR' in data):
        #print(data)
        val.append(index)
    index = index + 1
print(val)
print(my_data['worlwide_gross_income'][val])

[]
Series([], Name: worlwide_gross_income, dtype: object)


In [537]:
%%capture
for index in val:
    my_data["worlwide_gross_income"][index] = my_data['worlwide_gross_income'][index].replace('NPR','')
    my_data["worlwide_gross_income"][index] = int(my_data["worlwide_gross_income"][index])/133.18

In [538]:
%%capture
index = 0
val = []
for data in my_data['worlwide_gross_income']:
    #print(data)
    if type(data) == str and ('INR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["worlwide_gross_income"][index] = my_data['worlwide_gross_income'][index].replace('INR','')
    my_data["worlwide_gross_income"][index] = int(my_data["worlwide_gross_income"][index])*0.012

In [539]:
%%capture
index = 0
val = []
for data in my_data['worlwide_gross_income']:
    #print(data)
    if type(data) == str and ('GBP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["worlwide_gross_income"][index] = my_data['worlwide_gross_income'][index].replace('GBP','')
    my_data["worlwide_gross_income"][index] = int(my_data["worlwide_gross_income"][index])*1.26

In [540]:
%%capture
index = 0
val = []
for data in my_data['worlwide_gross_income']:
    #print(data)
    if type(data) == str and ('PKR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["worlwide_gross_income"][index] = my_data['worlwide_gross_income'][index].replace('PKR','')
    my_data["worlwide_gross_income"][index] = int(my_data["worlwide_gross_income"][index])*0.0036

In [541]:
my_data['worlwide_gross_income'] = pd.to_numeric(my_data['worlwide_gross_income'])

In [542]:
%%capture
mean = my_data['worlwide_gross_income'].mean()
my_data['worlwide_gross_income'].fillna(mean, inplace = True)

# Fix budget col

In [543]:
my_data['budget']

0             NaN
1        $ 300000
2             NaN
3             NaN
4             NaN
           ...   
19995         NaN
19996         NaN
19997         NaN
19998         NaN
19999         NaN
Name: budget, Length: 20000, dtype: object

In [544]:
my_data['budget'] = my_data['budget'].str.replace('$', "", regex = False)

In [545]:
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('FRF' in data):
        #print(data)
        val.append(index)
    index = index + 1
print(val)
print(my_data['budget'][val])

[338, 634, 1221, 1239, 1290, 1572, 1707, 2967, 3322, 4161, 6144, 6356, 6763, 7036, 7086, 7269, 9291, 10662, 10901, 11699, 11844, 13169, 13417, 13812, 14170, 14539, 14635, 14674, 14906, 15207, 16567, 17893, 18061, 18628, 18987]
338       FRF 50000000
634       FRF 80000000
1221      FRF 53000000
1239      FRF 25000000
1290       FRF 4000000
1572      FRF 12000000
1707       FRF 6000000
2967      FRF 75000000
3322       FRF 5300000
4161     FRF 110000000
6144      FRF 50000000
6356      FRF 37000000
6763      FRF 15000000
7036      FRF 39000000
7086      FRF 71500000
7269       FRF 3100000
9291     FRF 164000000
10662     FRF 30000000
10901        FRF 40000
11699     FRF 17000000
11844     FRF 53000000
13169     FRF 21000000
13417     FRF 60000000
13812     FRF 26000000
14170     FRF 65000000
14539    FRF 116000000
14635     FRF 17000000
14674     FRF 60000000
14906     FRF 30000000
15207       FRF 248000
16567     FRF 22500000
17893     FRF 82000000
18061       FRF 200000
18628     FRF 

In [546]:
%%capture
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('FRF','')
    my_data["budget"][index] = int(my_data["budget"][index])/6.18

In [547]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('EUR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('EUR','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.08

In [548]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('GBP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('GBP','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.26

In [549]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('CAD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('CAD','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.35

In [550]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('AUD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('AUD','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.53

In [551]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('JPY' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('JPY','')
    my_data["budget"][index] = int(my_data["budget"][index])/151.44

In [552]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('EGP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('EGP','')
    my_data["budget"][index] = int(my_data["budget"][index])/47.35

In [553]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('TRL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('TRL','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.031

In [554]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('SEK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('SEK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.094

In [555]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('IRR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('IRR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.000024

In [556]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('PTE' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('PTE','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00538522

In [557]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('NOK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('NOK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.092

In [558]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('MXN' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('MXN','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.060

In [559]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('HUF' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('HUF','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0027

In [560]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('DKK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('DKK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.14

In [561]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('DEM' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('DEM','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0035

In [562]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('THB' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('THB','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.027

In [563]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ITL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ITL','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00054

In [564]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('IDR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('IDR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.000063

In [565]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('CNY' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('CNY','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.14

In [566]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('CZK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('CZK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.043

In [567]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ARS' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ARS','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0012

In [568]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('BRL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('BRL','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.20

In [569]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('KRW' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('KRW','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00074

In [570]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('RUR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('RUR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.011

In [571]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('NLG' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('NLG','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.49

In [572]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('DOP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('DOP','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.017

In [573]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('HKD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('HKD','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.13

In [574]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('NZD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('NZD','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.60

In [575]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('AZM' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('AZM','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.59

In [576]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('SGD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('SGD','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.74

In [577]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('PLN' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('PLN','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.25

In [578]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('BDT' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('BDT','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0091

In [579]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ESP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ESP','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.08

In [580]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('MYR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('MYR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.211663

In [581]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('VND' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('VND','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00004

In [582]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('BGL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('BGL','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.55

In [583]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ISK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ISK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0072

In [584]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('EEK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('EEK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0690014

In [585]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('BEF' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('BEF','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0267636

In [586]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ROL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ROL','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.22

In [587]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('COP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('COP','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00026

In [588]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('LKR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('LKR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0033

In [589]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('PHP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('PHP','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.018

In [590]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('PKR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('PKR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0036

In [591]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('SIT' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('SIT','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0045114867

In [592]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ZAR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ZAR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.054

In [593]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('CHF' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('CHF','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.11

In [594]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ILS' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ILS','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.27

In [595]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('CLP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('CLP','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0010

In [596]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('LVL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('LVL','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.65

In [597]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('NPR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('NPR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0075

In [598]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('YUM' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('YUM','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.552499

In [599]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('TWD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('TWD','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.031

In [600]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('GEL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('GEL','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.37

In [601]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('AMD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('AMD','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0035

In [602]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('NGN' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('NGN','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00076

In [603]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('HRK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('HRK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.14244

In [604]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('MTL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('MTL','')
    my_data["budget"][index] = int(my_data["budget"][index])*2.51488

In [605]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('MNT' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('MNT','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00030

In [606]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('LTL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('LTL','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.312685

In [607]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('AED' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('AED','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.27

In [608]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('TTD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('TTD','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.15

In [609]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('BND' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('BND','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.74

In [610]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('IEP' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('IEP','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.37086

In [611]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('RON' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('RON','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.22

In [612]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ALL' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ALL','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.35

In [613]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('INR' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('INR','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.012

In [614]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('JOD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('JOD','')
    my_data["budget"][index] = int(my_data["budget"][index])*1.41

In [615]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('FIM' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('FIM','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.18

In [616]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('UAH' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('UAH','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.026

In [617]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('VEB' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('VEB','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0276

In [618]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('SKK' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('SKK','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0360411

In [619]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('ATS' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('ATS','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.078836

In [620]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('PYG' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('PYG','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.00014

In [621]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('JMD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('JMD','')
    my_data["budget"][index] = int(my_data["budget"][index])*0.0065

In [622]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('XAU' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('XAU','')
    my_data["budget"][index] = int(my_data["budget"][index])* 2346.699989

In [623]:
%%capture
index = 0
val = []
for data in my_data['budget']:
    #print(data)
    if type(data) == str and ('GRD' in data):
        #print(data)
        val.append(index)
    index = index + 1
for index in val:
    my_data["budget"][index] = my_data['budget'][index].replace('GRD','')
    my_data["budget"][index] = int(my_data["budget"][index])* 0.00318642

In [624]:
my_data["budget"] = pd.to_numeric(my_data['budget'])

In [625]:
mean = my_data['budget'].mean()
my_data['budget'].fillna(mean, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  my_data['budget'].fillna(mean, inplace = True)


In [626]:
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  20000 non-null  object 
 1   original_title         20000 non-null  object 
 2   year                   20000 non-null  object 
 3   date_published         20000 non-null  object 
 4   genre                  20000 non-null  object 
 5   duration               20000 non-null  int64  
 6   country                20000 non-null  object 
 7   language               20000 non-null  object 
 8   director               20000 non-null  object 
 9   writer                 20000 non-null  object 
 10  production_company     20000 non-null  object 
 11  actors                 20000 non-null  object 
 12  description            20000 non-null  object 
 13  budget                 20000 non-null  float64
 14  usa_gross_income       20000 non-null  float64
 15  wo

# Checkpoint

In [627]:
my_data["change_title"] = (my_data["title"] == my_data["original_title"]).astype(int)

In [628]:
my_data["year"][8137] = 2019

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  my_data["year"][8137] = 2019
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_data["year"][8137] = 2019


In [629]:
my_data["age"] = 2024 - my_data["year"].astype(int)

In [630]:
copy_data = my_data.copy()

In [631]:
# from sklearn.compose import ColumnTransformer
# column_transformer = ColumnTransformer(
#     transformers = [
#         ('description_tranform', CountVectorizer(stop_words = 'english'), 'description')
#     ],
#     remainder = 'drop'
# )
# transformed_data = column_transformer.fit_transform(my_data)

In [632]:
# count_vec = column_transformer.named_transformers_["description_tranform"]
# feature_names = count_vec.get_feature_names_out()
# prefixed_feature_names = ["mat_" + feature for feature in feature_names]

# sparse_mat = pd.DataFrame.sparse.from_spmatrix(transformed_data, columns = prefixed_feature_names)
# cols = []
# for col in my_data.columns:
#     cols.append(col)

In [633]:
# sparse_mat.index = my_data.index

In [634]:
# my_data = pd.concat([my_data[cols], sparse_mat], axis = 1)

In [635]:
copy_data = pd.read_csv("data/train.csv")
huhu = pd.to_datetime(copy_data["date_published"], errors = 'coerce')
huhu = huhu.dt.strftime("%m-%d")

my_data["date_published"] = huhu

In [636]:
refined_data = my_data.drop(["title", "original_title", "year", "description"], axis = 1)
refined_data = refined_data.iloc[:, :15]

In [637]:
# still need to fix country, language, director, writer, production_comp, actors, budget, usa_gross_income, worldwide_gross_income

refined_data.shape
refined_data["genre"] = refined_data["genre"].str.split(', ')

In [638]:
refined_data["country"] = refined_data["country"].str.split(', ')
refined_data["language"] = refined_data["language"].str.split(', ')
refined_data["director"] = refined_data["director"].str.split(', ')
refined_data["writer"] = refined_data["writer"].str.split(', ')
refined_data["production_company"] = refined_data["production_company"].str.split(', ')
refined_data["actors"] = refined_data["actors"].str.split(', ')

In [639]:
refined_data[['month', 'day']] = refined_data['date_published'].str.split('-', expand=True)

refined_data['month'] = pd.to_numeric(refined_data['month'], errors='coerce')
refined_data['day'] = pd.to_numeric(refined_data['day'], errors='coerce')

In [640]:
refined_data["date_published"] = (refined_data['month'] - 1) * 30 + refined_data['day']
refined_data.fillna(-1, inplace = True)
refined_data.head()

Unnamed: 0,date_published,genre,duration,country,language,director,writer,production_company,actors,budget,usa_gross_income,worlwide_gross_income,change_title,age,month,day
0,147.0,[Drama],104,[Austria],"[German, French]",[Monja Art],[Monja Art],[Orbrock Film],"[Elisabeth Wabitsch, Magdalena Wabitsch, Baile...",13391350.0,20690600.0,24083180.0,1,7,5.0,27.0
1,104.0,"[Action, Drama]",88,[USA],"[Italian, German]",[Ari Taub],[Caio Ribeiro],[Hit & Run Productions],"[Daniel Asher, C.J. Barkus, Gianluca Bianco, D...",300000.0,20690600.0,24083180.0,1,14,4.0,14.0
2,315.0,"[Action, Crime, Drama]",100,[USA],[English],[Paul Mones],[Paul Mones],[MDP Worldwide],"[Damian Chapa, Jennifer Rubin, Scott Plank, Wi...",13391350.0,20690600.0,24083180.0,1,30,11.0,15.0
3,178.0,"[Drama, Horror, Mystery]",125,[Thailand],[Thai],[Sophon Sakdaphisit],"[Sopana Chaowwiwatkul, Sophon Sakdaphisit]",[Unknown],"[Saharat Sangkapreecha, Piyathida Woramusik, S...",13391350.0,20690600.0,5755172.0,1,13,6.0,28.0
4,63.0,[Drama],115,[Canada],[French],[Robin Aubert],[Robin Aubert],[Max Films Productions],"[Trystan Bouthillier, Richard Robitaille, Patr...",13391350.0,20690600.0,24083180.0,1,14,3.0,3.0


In [641]:
refined_data = refined_data.drop(["month", "day"], axis=1)

In [642]:
refined_data.iloc[:5,:] 

Unnamed: 0,date_published,genre,duration,country,language,director,writer,production_company,actors,budget,usa_gross_income,worlwide_gross_income,change_title,age
0,147.0,[Drama],104,[Austria],"[German, French]",[Monja Art],[Monja Art],[Orbrock Film],"[Elisabeth Wabitsch, Magdalena Wabitsch, Baile...",13391350.0,20690600.0,24083180.0,1,7
1,104.0,"[Action, Drama]",88,[USA],"[Italian, German]",[Ari Taub],[Caio Ribeiro],[Hit & Run Productions],"[Daniel Asher, C.J. Barkus, Gianluca Bianco, D...",300000.0,20690600.0,24083180.0,1,14
2,315.0,"[Action, Crime, Drama]",100,[USA],[English],[Paul Mones],[Paul Mones],[MDP Worldwide],"[Damian Chapa, Jennifer Rubin, Scott Plank, Wi...",13391350.0,20690600.0,24083180.0,1,30
3,178.0,"[Drama, Horror, Mystery]",125,[Thailand],[Thai],[Sophon Sakdaphisit],"[Sopana Chaowwiwatkul, Sophon Sakdaphisit]",[Unknown],"[Saharat Sangkapreecha, Piyathida Woramusik, S...",13391350.0,20690600.0,5755172.0,1,13
4,63.0,[Drama],115,[Canada],[French],[Robin Aubert],[Robin Aubert],[Max Films Productions],"[Trystan Bouthillier, Richard Robitaille, Patr...",13391350.0,20690600.0,24083180.0,1,14


6. Experimentation

In [643]:
import json
with open('target_mean_mappings.json', 'r') as f:
    loaded_mappings = json.load(f)

In [644]:
def apply_mapping(row, column):
    if isinstance(row[column], list):
        numerical_val = []
        for item in row[column]:
            if item in loaded_mappings[column]["target_mean"]:
                numerical_val.append(loaded_mappings[column]["target_mean"][item])
            else:
                numerical_val.append(loaded_mappings[column]["global_mean"])
        return sum(numerical_val)/len(numerical_val) if numerical_val else loaded_mappings[column]["global_mean"]
    else:
        if row[column] in loaded_mappings[column]["target_mean"]:
            return loaded_mappings[column]["target_mean"][row[column]]
        else:
            return loaded_mappings[column]["global_mean"]

In [645]:
refined_data["genre"] = refined_data.apply(lambda row: apply_mapping(row, 'genre'), axis = 1)
refined_data["country"] = refined_data.apply(lambda row: apply_mapping(row, 'country'), axis = 1)
refined_data["language"] = refined_data.apply(lambda row: apply_mapping(row, 'language'), axis = 1)
refined_data["director"] = refined_data.apply(lambda row: apply_mapping(row, 'director'), axis = 1)
refined_data["writer"] = refined_data.apply(lambda row: apply_mapping(row, 'writer'), axis = 1)
refined_data["production_company"] = refined_data.apply(lambda row: apply_mapping(row, 'production_company'), axis = 1)
refined_data["actors"] = refined_data.apply(lambda row: apply_mapping(row, 'actors'), axis = 1)

In [646]:
refined_data.head()

Unnamed: 0,date_published,genre,duration,country,language,director,writer,production_company,actors,budget,usa_gross_income,worlwide_gross_income,change_title,age
0,147.0,0.672597,104,0.667622,0.67631,0.550959,0.565012,0.0,0.398994,13391350.0,20690600.0,24083180.0,1,7
1,104.0,0.565348,88,0.45585,0.637768,0.0,0.0,0.0,0.255962,300000.0,20690600.0,24083180.0,1,14
2,315.0,0.57563,100,0.45585,0.475802,0.5,0.5,0.375,0.327067,13391350.0,20690600.0,24083180.0,1,30
3,178.0,0.465774,125,0.505338,0.57508,1.0,1.0,0.491182,0.567248,13391350.0,20690600.0,5755172.0,1,13
4,63.0,0.672597,115,0.429866,0.67776,0.5,0.5,0.9375,0.800095,13391350.0,20690600.0,24083180.0,1,14


In [647]:
from joblib import dump, load
model = load('filmPredictModel.joblib')

In [654]:
y_pred = model.predict(refined_data)

In [661]:
def get_popularity(x):
    if x > 0.5:
        return 'popular'
    else:
        return 'unpopular'

In [662]:
result = pd.DataFrame(columns = ['title_index', 'popularity'])

In [663]:
result.title_index = idx_index

In [664]:
result.popularity = [get_popularity(x) for x in y_pred]

In [665]:
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [666]:
result

Unnamed: 0,title_index,popularity
0,0,unpopular
1,1,unpopular
2,2,unpopular
3,3,popular
4,4,popular
...,...,...
19995,19998,popular
19996,19999,popular
19997,20000,unpopular
19998,20001,unpopular


In [667]:
result.to_csv('submission.csv',index=False)