In [1]:
import pandas as pd
import numpy as np

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.multimodal import MultiModalPredictor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
from dateutil.parser import parse

# from sklearn.preprocessing import MultiLabelBinarizer, TargetEncoder, OneHotEncoder, StandardScaler, QuantileTransformer
# from matplotlib import pyplot as plt
# import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('./movie_dataset.csv', index_col=0)

In [3]:
df = df.drop(columns=['id', 'tagline', 'overview', 'vote_average', 'vote_count', 'domestic_opening', 'original_title', 'title', 'spoken_languages'])

df['budget'] = df['budget'].replace({0: np.NAN})
df['revenue'] = df['revenue'].replace({0: np.NAN})
df['runtime'] = df['runtime'].replace({0: np.NAN})

df.dropna(inplace=True)

df = df[df['status'] == 'Released']
df = df.drop(columns=['status'], axis=1)

def get_class(row):
  R = row['revenue']
  B = row['budget']
  if R < B:
    return 0
  elif B <= R < 2*B:
    return 1
  elif 2*B <= R < 4*B:
    return 2
  elif 4*B <= R:
    return 3
  return np.NAN

literal_eval_cols = ['genres', 'production_countries', 'production_companies', 'actors']

for col in literal_eval_cols:
    df[col] = df.apply(lambda x: literal_eval(x[col]), axis=1)

mlb = MultiLabelBinarizer()
dum = mlb.fit_transform(df['genres'])
classes = ['genre_' + x.replace(' ', '_').lower() for x in mlb.classes_]
df = df.join(pd.DataFrame(dum.astype(bool), df.index, classes))

genres = set(mlb.classes_)

df = df.drop(columns=['genres'], axis=1)

df['original_language'] = df.apply(lambda row: 1 if row['original_language'] == 'en' else 0, axis=1)

def cumulative_revenue_replacer(column, iterable=True):
  entities = df[column].explode().tolist()
  entities_revenue = {}
  for e in entities:
    entities_revenue[e] = 0

  for index, row in df.iterrows():
    if iterable:
      for e in row[column]:
        entities_revenue[e] += row['revenue']
    else:
      entities_revenue[row[column]] += row['revenue']

  def entity_to_cumulative_revenue(entity):
    return entities_revenue[entity]

  def f(row):
    if iterable:
      return sum(list(map(entity_to_cumulative_revenue, row[column])))
    else:
      return entities_revenue[row[column]]

  return f

df['production_companies'] = df.apply(cumulative_revenue_replacer('production_companies'), axis=1)

df['production_countries'] = df.apply(cumulative_revenue_replacer('production_countries'), axis=1)

m = df.apply(lambda x: parse(x['release_date']).month, axis=1)
df = df.join(m.rename('release_month'))
df = df.drop(columns=['release_date'], axis=1)

df['domestic_distributor'] = df.apply(cumulative_revenue_replacer('domestic_distributor', iterable=False), axis=1)

to_delete = ['M/PG', 'GP', 'Approved', 'M', 'Not Rated']
df = df[~df['mpaa'].isin(to_delete)]

df = df.join(pd.DataFrame(df['actors'].values.tolist(), df.index, ['actor_1', 'actor_2', 'actor_3']))
df = df.drop(['actors'], axis=1)
df['actor_1'] = df.apply(cumulative_revenue_replacer('actor_1', iterable=False), axis=1)
df['actor_2'] = df.apply(cumulative_revenue_replacer('actor_2', iterable=False), axis=1)
df['actor_3'] = df.apply(cumulative_revenue_replacer('actor_3', iterable=False), axis=1)

df['director'] = df.apply(cumulative_revenue_replacer('director', iterable=False), axis=1)

# -----

df['target'] = df.apply(get_class, axis=1)
df = df.drop(columns=['revenue'])

In [4]:
# df[['imdb_id', 'poster']].to_csv('./to_download_posters.csv', index=False)

In [5]:
df.shape

(4221, 34)

In [6]:
df = df.set_index('imdb_id')

In [7]:
from pathlib import Path

def replace_url_with_path(df):
    posters = Path('./posters')
    imdb_ids = [x for x in posters.glob('*.jpg')]
    print(len(imdb_ids))

posters = Path('./posters')
imdb_ids = [x.name[:-len('.jpg')] for x in posters.glob('*.jpg')]
df = df.loc[imdb_ids]

In [8]:
df['poster'] = df.apply(lambda x: f'./posters/{x.name}.jpg', axis=1)

In [9]:
df.head()

Unnamed: 0_level_0,original_language,production_companies,production_countries,runtime,domestic_distributor,mpaa,budget,director,poster,genre_action,...,genre_romance,genre_science_fiction,genre_thriller,genre_war,genre_western,release_month,actor_1,actor_2,actor_3,target
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0758766,1,18186499121,405176753592,96.0,69384821965,PG-13,40000000.0,324531590,./posters/tt0758766.jpg,False,...,True,False,False,False,False,2,742220348,693098536,145896422,2
tt0120620,1,47443449224,405176753592,100.0,57983183224,PG-13,25000000.0,70138719,./posters/tt0120620.jpg,False,...,False,False,True,False,False,8,161820076,893377753,634522407,0
tt1524137,1,69202599929,486542805498,109.0,48980929643,R,25000000.0,398523084,./posters/tt1524137.jpg,True,...,False,False,True,False,False,1,3344673893,893377753,662944054,2
tt0117011,1,30931559101,405176753592,100.0,40820231602,R,25000000.0,51702483,./posters/tt0117011.jpg,True,...,False,False,True,False,False,9,573299134,70868043,52121444,2
tt4094724,1,51111709348,422156706097,105.0,48980929643,R,10000000.0,316699354,./posters/tt4094724.jpg,True,...,False,False,True,False,False,6,118587880,118587880,118587880,3


In [10]:
df.reset_index(drop=True, inplace=True)

In [11]:
predictor = MultiModalPredictor(label='target').fit(
    train_data=df,
    time_limit=120
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241209_221942"
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #136~20.04.1-Ubuntu SMP Thu Nov 14 16:38:05 UTC 2024
CPU Count:          16
Pytorch Version:    2.5.1+cpu
CUDA Version:       CUDA is not available
Memory Avail:       11.15 GB / 14.95 GB (74.6%)
Disk Space Avail:   16.11 GB / 216.73 GB (7.4%)
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	4 unique label values:  [2, 0, 3, 1]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])

AutoMM starts to create your model. ✨✨✨

To track the learning progress, you can open a terminal and launch Tensorboard:
    ```shell
    # Assume you have ins

Epoch 0:   1%|          | 5/419 [00:38<53:34,  0.13it/s]                   


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined