# Prediction of Gross

### Loading data set cleaned

In [1]:
import ast
import pandas as pd
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [2]:
import os
from urllib.request import urlretrieve

def download_dataset():

    urls = [
        ("1_zkibYlJyj5ZkLulFYtJ4_vp9RtgoUtW",
        "cleaned_film_datset.csv"),
    ]

    for (fileId, filename) in urls:
        urlretrieve(f"https://drive.usercontent.google.com/download?id={fileId}&export=download&authuser=1&confirm=t", f"./{filename}")

download_dataset()

In [3]:
movies_df = pd.read_csv("cleaned_film_datset.csv")
#movies_df.drop(columns=['genre_imdb','spoken_languages','description_tmdb','Unnamed: 0','genre_letterboxd', 'imdb_id', 'original_language','description_letterboxd','tmdb_id','description_imdb'], inplace=True)
movies_df.drop(columns=[
    #'genre_imdb',
    'spoken_languages',
    'popularity',
    'description_tmdb',
    'adult',
    'Unnamed: 0',
    #'genre_letterboxd',
    'imdb_id',
    'original_language',
    'description_letterboxd',
    'tmdb_id',
    'description_imdb'],
               inplace=True)

In [4]:
pd.set_option('display.max_columns', None)

### Adjustment of features for data vizualization for correlation and adjustment for data  prediction

Making sure it has the right format


In [5]:
movies_df['year']=movies_df['year'].astype(int)
movies_df['runtime']=movies_df['runtime'].astype(int)
movies_df['gross']=movies_df['gross'].astype(int)
movies_df['revenue']=movies_df['revenue'].astype(int)
movies_df['budget']=movies_df['budget'].astype(int)
movies_df['vote_count_letterboxd']=movies_df['vote_count_letterboxd'].astype(int)
movies_df['vote_count_imdb']=movies_df['vote_count_imdb'].astype(int)
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])

Separate information with comas into new columns:


*   Director
*   Stars




In [6]:
def extract_ids(column):
    return column.str.extractall('nm(\d+)/').groupby(level=0).agg(','.join)[0]

movies_df['director'] = movies_df['director'].astype(str).str.replace(r" \n", "", regex=True)
movies_df['director_ids'] = extract_ids(movies_df['director_id'])

movies_df['star'] = movies_df['star'].astype(str).str.replace(r" \n", "", regex=True)
movies_df['star_ids'] = extract_ids(movies_df['star_id'])

# movies_df[['first_star', 'second_star']] = movies_df['star'].str.split(',', n=2, expand=True)[[0, 1]]
# movies_df['second_star'] = movies_df['second_star'].str.replace('\n', '', regex=False)
#movies_df[[f'star_id_{i}' for i in range(4)]] = movies_df['star_ids'].str.split(',', expand=True, n=3)  # only starts have more than 1 ID, not directors

movies_df.drop(columns=['star_id','director_id'], inplace=True)
#movies_df.head()

Merge genre instances and eliminate duplicated genres

In [7]:
try:
  # Make genre_letterboxd column has the same format as the other two genre columns
  movies_df['genre_letterboxd'] = movies_df['genre_letterboxd'].astype(str).str.replace(r"[\"\[\]]", "", regex=True)

  # Merge the three genre columns into one general genre column
  movies_df['genre'] = movies_df['genre_letterboxd']+ ', ' + movies_df['genre_tmdb'] + ', ' + movies_df['genre_imdb']

  # Take the genre column values to a standard format by removing spaces and applying lowercase
  movies_df['genre'] = movies_df['genre'].str.lower().str.replace(' ', '')

  # Split the genre column into individual genres, removing the duplicates
  movies_df['genre'] = movies_df['genre'].apply(lambda x: list(set(x.split(','))))

  # Combine unique genres
  movies_df['genre'] = movies_df['genre'].apply(lambda x: ', '.join(x))
  # Drop unnecessary columns
  movies_df.drop(columns=['genre_letterboxd', 'genre_imdb', 'genre_tmdb'], inplace=True)

except:
  print('Genre conditioning already done')

Convert the text string of the production countries column to a Python list

In [8]:
try:
  movies_df['production_countries'] = movies_df['production_countries'].apply(ast.literal_eval)

  max_countries = movies_df['production_countries'].apply(len).max()

  for i in range(max_countries):
      col_name = f'country_{i+1}'
      movies_df[col_name] = movies_df['production_countries'].apply(lambda x: x[i] if i < len(x) else None)

  movies_df = movies_df.drop(columns=[f'country_{i}' for i in range(2, max_countries+1)])
  movies_df.drop('production_countries', axis=1, inplace=True, errors='ignore')
  movies_df.rename(columns={'country_1': 'production_country'}, inplace=True)

except:
  print('Production countries already converted')
movies_df.head()

Unnamed: 0,title,release_date,rating_letterboxd,vote_count_letterboxd,year,rating_imdb,director,star,vote_count_imdb,gross,revenue,runtime,budget,production_companies,director_ids,star_ids,genre,production_country
0,The Christine Jorgensen Story,1970-10-29,4.2,5,1970,5.5,Irving Rapper,"John Hansen,Joan Tompkins,Quinn K. Redeker,Joh...",373,237000,0,98,0,Edward Small Productions,710924,0360896086700007147930385402,"biography, romance, drama",United States of America
1,Say It Isn't So,2001-03-10,4.8,124,2001,5.0,J.B. Rogers,"Chris Klein,Heather Graham,Orlando Jones,Sally...",13462,5516708,12320393,95,25000000,"Say It Isn't So Productions, Conundrum Enterta...",736930,0005098000128704289630000398,"romance, comedy",United States of America
2,Killers,2010-06-04,5.9,1819,2010,5.4,Robert Luketic,"Katherine Heigl,Ashton Kutcher,Tom Selleck,Cat...",93114,47059963,98159963,100,75000000,"Katalyst Films, Lionsgate, Aversano Films",525659,0001337000511000006330001573,"romance, comedy, thriller, action",United States of America
3,In the Shadow of Women,2015-05-14,6.0,37,2015,6.5,Philippe Garrel,"Clotilde Courau,Stanislas Merhar,Lena Paugam,V...",1496,50291,54985,73,0,"SBS Productions, ARTE France Cinéma, Close Up ...",308042,0183660058072063609492169779,"romance, drama",Switzerland
4,The Pope of Greenwich Village,1984-06-22,6.5,91,1984,6.6,Stuart Rosenberg,"Eric Roberts,Mickey Rourke,Daryl Hannah,Gerald...",9212,6139896,6836201,121,8000000,United Artists,742341,0000616000062000004350656183,"crime, comedy, drama, action",United States of America


Lower case of production companies names

In [9]:
movies_df['production_companies'] = movies_df['production_companies'].str.lower()
movies_df['production_companies'] = movies_df['production_companies'].apply(lambda x: list(set(x.split(', '))))
movies_df['production_companies']

0                               [edward small productions]
1        [20th century fox, say it isn't so productions...
2              [lionsgate, katalyst films, aversano films]
3        [arte france cinéma, rts, close up films, sbs ...
4                                         [united artists]
                               ...                        
11235    [halestorm entertainment, blue crow production...
11236          [france 2 cinéma, radar films, studiocanal]
11237         [mappa, dugout, toho, shueisha, sumzap, mbs]
11238                                    [new line cinema]
11239                              [minerva international]
Name: production_companies, Length: 11240, dtype: object

Get months and season from released dates

In [10]:
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

movies_df['season'] = movies_df['release_date'].dt.month.apply(get_season)
movies_df= movies_df.drop('release_date', axis=1)
movies_df.head()

Unnamed: 0,title,rating_letterboxd,vote_count_letterboxd,year,rating_imdb,director,star,vote_count_imdb,gross,revenue,runtime,budget,production_companies,director_ids,star_ids,genre,production_country,season
0,The Christine Jorgensen Story,4.2,5,1970,5.5,Irving Rapper,"John Hansen,Joan Tompkins,Quinn K. Redeker,Joh...",373,237000,0,98,0,[edward small productions],710924,0360896086700007147930385402,"biography, romance, drama",United States of America,Fall
1,Say It Isn't So,4.8,124,2001,5.0,J.B. Rogers,"Chris Klein,Heather Graham,Orlando Jones,Sally...",13462,5516708,12320393,95,25000000,"[20th century fox, say it isn't so productions...",736930,0005098000128704289630000398,"romance, comedy",United States of America,Spring
2,Killers,5.9,1819,2010,5.4,Robert Luketic,"Katherine Heigl,Ashton Kutcher,Tom Selleck,Cat...",93114,47059963,98159963,100,75000000,"[lionsgate, katalyst films, aversano films]",525659,0001337000511000006330001573,"romance, comedy, thriller, action",United States of America,Summer
3,In the Shadow of Women,6.0,37,2015,6.5,Philippe Garrel,"Clotilde Courau,Stanislas Merhar,Lena Paugam,V...",1496,50291,54985,73,0,"[arte france cinéma, rts, close up films, sbs ...",308042,0183660058072063609492169779,"romance, drama",Switzerland,Spring
4,The Pope of Greenwich Village,6.5,91,1984,6.6,Stuart Rosenberg,"Eric Roberts,Mickey Rourke,Daryl Hannah,Gerald...",9212,6139896,6836201,121,8000000,[united artists],742341,0000616000062000004350656183,"crime, comedy, drama, action",United States of America,Summer


### Vizualization for correlation
*** CHECK THIS PART ***


In [11]:
fig = px.scatter(movies_df, x='budget', y='gross', trendline='ols')
fig.update_layout(title='Budget vs Gross')
fig.show()

In [12]:
fig = px.scatter(movies_df, x='runtime', y='gross', trendline='ols')
fig.update_layout(title='Runtime vs Gross Revenue')
fig.show()

In [13]:
#movies_df[movies_df['gross'] > 200]
fig = px.scatter(movies_df, x='rating_imdb', y='gross', trendline='ols')
fig.update_layout(title='IMDB Rating vs Gross')
fig.show()

In [14]:
fig = px.line(movies_df[['year','gross']].groupby('year').mean().reset_index(), x='year', y='gross')
fig.update_layout(title='Average Gross Over Time')
fig.show()

In [15]:
seasonal_gross = movies_df.groupby(['year', 'season'])['gross'].mean().reset_index()

fig = px.line(seasonal_gross, x='year', y='gross', color='season',
              title='Average Gross for Season trough the years',
              labels={'year': 'Year', 'gross': 'Average Gross'},
              category_orders={"Season": ["Winter", "Spring", "Summer", "Fall"]})

fig.update_layout(xaxis_title='Year', yaxis_title='Average Gross', hovermode='x')

fig.show()

In [19]:
gross_by_director = movies_df.groupby('director')['gross'].sum().reset_index()

top_directors = gross_by_director.sort_values('gross', ascending=False).head(20)

df_top_directors = movies_df[movies_df['director'].isin(top_directors['director'])]

fig = px.scatter(df_top_directors,
                 x='budget',
                 y='gross',
                 color='director',  # Color by director to differentiate them
                 size='gross',  # Use 'gross' as bubble size to highlight top-grossing movies
                 hover_name='director',  #Shows the director's name when you mouse over the bubble
                 title=f'Top {df_top_directors.shape[1]} Directors by Gross: Gross vs. Budget')

fig.update_layout(xaxis_title='Budget', yaxis_title='Gross', legend_title='Top 20 Directors by Gross')
fig.show()

In [27]:
director_counts = movies_df.groupby('director').agg({'gross': 'sum', 'title': 'count'}).reset_index()

director_counts.columns = ['director', 'total_gross', 'movie_count']

top_directors = director_counts.sort_values('total_gross', ascending=False).head(20)

fig = px.scatter(top_directors,
                 x='movie_count',
                 y='total_gross',
                 size='total_gross',
                 color='director',
                 hover_name='director',
                 title=f'Top {top_directors.shape[0]} Directors: Number of movies vs. Gross Total')

fig.update_layout(xaxis_title='Number of movies', yaxis_title='Gross Total', legend_title='Director')
fig.show()

In [28]:
movies_df[['first_star', 'second_star']] = movies_df['star'].str.split(',', n=2, expand=True)[[0, 1]]
movies_df['second_star'] = movies_df['second_star'].str.replace('\n', '', regex=False)


actor_stats = movies_df.groupby('first_star').agg({'gross': 'sum', 'budget': 'sum', 'title': 'count'}).reset_index()

actor_stats.columns = ['star', 'total_gross', 'total_budget', 'movie_count']

top_actors = actor_stats.sort_values('total_gross', ascending=False).head(50)

fig = px.scatter(top_actors,
                 x='total_budget',
                 y='total_gross',
                 size='movie_count',
                 color='star',
                 hover_name='star',
                 title=f'Top {top_actors.shape[0]} 1- Actors: Gross Total vs. Budget Total')

fig.update_layout(xaxis_title='Budget Total', yaxis_title='Gross Total', legend_title='Actor')

fig.show()

In [29]:
fig = px.scatter(top_actors,
                 x='movie_count',
                 y='total_gross',
                 size='movie_count',
                 color='star',
                 hover_name='star',
                 title=f'Top {top_actors.shape[0]} 1- Actors: Gross Total vs. Number of Appearances')

fig.update_layout(xaxis_title='Number of Appearances', yaxis_title='Gross Total', legend_title='Actor')
fig.show()

In [30]:
actor_stats = movies_df.groupby('second_star').agg({'gross': 'sum', 'budget': 'sum', 'title': 'count'}).reset_index()

actor_stats.columns = ['star', 'total_gross', 'total_budget', 'movie_count']

top_actors = actor_stats.sort_values('total_gross', ascending=False).head(50)

fig = px.scatter(top_actors,
                 x='total_budget',
                 y='total_gross',
                 size='movie_count',
                 color='star',
                 hover_name='star',
                 title=f'Top {top_actors.shape[0]} 2- Actors: Gross Total vs. Budget Total')

fig.update_layout(xaxis_title='Budget Total', yaxis_title='Gross Total', legend_title='Actor')

fig.show()

In [32]:
fig = px.scatter(top_actors,
                 x='movie_count',
                 y='total_gross',
                 size='movie_count',
                 color='star',
                 hover_name='star',
                 title=f'Top {top_actors.shape[0]} 2- Actors: Gross Total vs. Number of Appearances')

fig.update_layout(xaxis_title='Number of Appearances', yaxis_title='Gross Total', legend_title='Actor')
fig.show()

movies_df.drop(columns=['first_star', 'second_star'], inplace=True)

### Adjust features for data predictions

Add inflation to the data frame

In [33]:
pip install cpi

Collecting cpi
  Downloading cpi-1.1.4-py2.py3-none-any.whl (31.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.7/31.7 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cpi
Successfully installed cpi-1.1.4


In [34]:
import cpi

def adjust_for_inflation(amount, year):
    try:
        return cpi.inflate(amount, year)
    except:
        return amount

movies_df['gross_adjusted'] = movies_df.apply(lambda x: adjust_for_inflation(x['gross'], x['year']), axis=1)
movies_df['budget_adjusted'] = movies_df.apply(lambda x: adjust_for_inflation(x['budget'], x['year']), axis=1)
movies_df['revenue_adjusted'] = movies_df.apply(lambda x: adjust_for_inflation(x['revenue'], x['year']), axis=1)

In [35]:
movies_df[['year','budget', 'gross','revenue','budget_adjusted', 'gross_adjusted','revenue_adjusted']].head()

Unnamed: 0,year,budget,gross,revenue,budget_adjusted,gross_adjusted,revenue_adjusted
0,1970,0,237000,0,0.0,1861195.0,0.0
1,2001,25000000,5516708,12320393,43012700.0,9491541.0,21197340.0
2,2010,75000000,47059963,98159963,104801700.0,65759550.0,137164500.0
3,2015,0,50291,54985,0.0,64652.61,70687.08
4,1984,8000000,6139896,6836201,23461170.0,18006150.0,20048160.0


In [36]:
movies_df.drop(columns=['budget','gross','revenue'], inplace=True)

Add weigth rating

In [37]:
movies_df['rating_letterboxd'].describe()
# Letterboxd rating goes from 0 - 10
# IMDB rating goes from 0 - 10: https://help.imdb.com/article/imdb/track-movies-tv/how-do-i-submit-my-rating-on-imdb/G9R8NF943K39DQDT?ref_=helpsect_pro_2_4#


count    11240.000000
mean         6.291797
std          0.995674
min          0.000000
25%          5.800000
50%          6.400000
75%          6.900000
max         10.000000
Name: rating_letterboxd, dtype: float64

In [38]:
movies_df['Weighted_Rating'] = ((movies_df['rating_letterboxd'] * movies_df['vote_count_letterboxd']) +
                                (movies_df['rating_imdb'] * movies_df['vote_count_imdb'])) / (movies_df['vote_count_letterboxd'] + movies_df['vote_count_imdb'])

movies_df[['rating_letterboxd', 'vote_count_letterboxd', 'rating_imdb', 'vote_count_imdb', 'Weighted_Rating']].head(5)

Unnamed: 0,rating_letterboxd,vote_count_letterboxd,rating_imdb,vote_count_imdb,Weighted_Rating
0,4.2,5,5.5,373,5.482804
1,4.8,124,5.0,13462,4.998175
2,5.9,1819,5.4,93114,5.40958
3,6.0,37,6.5,1496,6.487932
4,6.5,91,6.6,9212,6.599022


In [39]:
movies_df.drop(columns=['rating_letterboxd', 'vote_count_letterboxd', 'rating_imdb', 'vote_count_imdb'], inplace=True)

### HOT ENCODING

#### *Genre hot encoding*

In [40]:
try:
  # Generate one-hot encoding of genres
  one_hot_encoded_genres = movies_df['genre'].str.get_dummies(sep=', ')

  # Treat similar genres as one
  try:
    one_hot_encoded_genres['crime']  = one_hot_encoded_genres['crime'] | one_hot_encoded_genres['film-noir']
    one_hot_encoded_genres['musical']  = one_hot_encoded_genres['music'] | one_hot_encoded_genres['musical']
    one_hot_encoded_genres['sci-fi']  = one_hot_encoded_genres['sci-fi'] | one_hot_encoded_genres['sciencefiction']
    one_hot_encoded_genres.drop(columns=['film-noir', 'music', 'sciencefiction'], inplace=True)
  except:
    print('No similar genres to combine')

  # Integrate one-hot encoding sub set
  one_hot_encoded_genres = one_hot_encoded_genres.add_prefix('genre_')
  if 'movies_encoded' in locals() or 'movies_encoded' in globals():
    if set(one_hot_encoded_genres.columns) not in set(movies_encoded.columns):
      movies_encoded = pd.concat([movies_df.drop(columns=['genre'], axis=1), one_hot_encoded_genres], axis=1)
  else:
    movies_encoded = pd.concat([movies_df.drop(columns=['genre'], axis=1), one_hot_encoded_genres], axis=1)

except:
  print('Hot encoding for genres already done')

movies_encoded.loc[:,'genre_action':].head(10)

Unnamed: 0,genre_action,genre_adult,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,genre_fantasy,genre_history,genre_horror,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_sport,genre_thriller,genre_tvmovie,genre_war,genre_western
0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1
8,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
9,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0


#### *Production Country Hot encoding*

Selection of the firts 14 most important countries for gross

https://www.the-numbers.com/movies/production-countries/#tab=territory

In [41]:
top_countries = [
    "United States of America",
    "United Kingdom",
    "China",
    "France",
    "Japan",
    "Germany",
    "South Korea",
    "Canada",
    "India",
    "Australia",
    "Hong Kong",
    "New Zealand",
    "Italy",
    "Spain"
]
# top_countries= ["United States of America", "United Kingdom", "China", "France", "Japan", "Russia", "India","Germany","Italy", "South Korea","Canada","Spain", "Hong Kong","Australia","New Zealand","Mexico","Belgium","Ireland","Sweden","Netherlands"]
movies_encoded['production_country'] = movies_encoded['production_country'].apply(lambda x: x if x in top_countries else 'Other Countries')
movies_encoded['production_country'].unique()

array(['United States of America', 'Other Countries', 'France',
       'Australia', 'China', 'United Kingdom', 'India', 'Germany',
       'Canada', 'Hong Kong', 'South Korea', 'Spain', 'Japan', 'Italy',
       'New Zealand'], dtype=object)

In [42]:
try:
  one_hot_encoded_countries = pd.get_dummies(movies_encoded['production_country'], prefix='country').astype(int)
  movies_encoded = pd.concat([movies_encoded.drop('production_country', axis=1), one_hot_encoded_countries], axis=1)
except:
  print('Hot encoding for countries already done')
movies_encoded.loc[:, 'country_Australia':].head(10)

Unnamed: 0,country_Australia,country_Canada,country_China,country_France,country_Germany,country_Hong Kong,country_India,country_Italy,country_Japan,country_New Zealand,country_Other Countries,country_South Korea,country_Spain,country_United Kingdom,country_United States of America
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


#### *Directors hot-Encoding*

Selection of the firts 16 most important directors (Elbow of World Wide Box Office analisys made)

https://www.the-numbers.com/box-office-star-records/worldwide/lifetime-specific-technical-role/director

In [43]:
top_directors= [
    "Steven Spielberg",
    "James Cameron",
    "Anthony Russo",
    "Joe Russo",
    "Peter Jackson",
    "Michael Bay",
    "David Yates",
    "Christopher Nolan",
    "J.J. Abrams",
    "Ridley Scott",
    "Tim Burton",
    "Robert Zemeckis",
    "Jon Favreau",
    "Ron Howard",
    "Sam Raimi",
    "James Wan"
]

In [44]:
# This function searches for coincidences of a director name in the DataFrame
def search_director(director, irrelevants:list, not_found:list):

    coincidences = movies_encoded[movies_encoded['director'].str.contains(director, case=True)]
    if not coincidences.empty:

        print(f"\nConcidences found for the director {director}: {coincidences.shape[0]}")

        '''
        Printing recent movies of the director for checking validity
        Sources:
        - https://www.bfi.org.uk/lists/10-times-great-directors-left-really-long-gaps-between-films
        - https://screenrant.com/best-director-comebacks-after-breaks/
        - https://screenrant.com/directors-semi-retired-hiatus-great-movie-comeback/

        Longest period break found: 25 years
        Selected tolerance period: 25 years
        If a director didn't make a movie within this period, they're not relevant anymore
        '''

        print("  Recent movies:")
        relevants = coincidences[coincidences['year'] >= (2022 - 25)]
        if relevants.empty:
            irrelevants.append(director)
        else:
          for relevant in relevants.iterrows():
            print(f"\t{relevant[1]['year']}: {relevant[1]['title']}")


        # Printing the way the director is found (whether alone, or with other actors)
        print("  Director name found as:")
        print(f"\t{coincidences['director'].unique()}")
    else:
        print("\n No coincidences found for this director: ", director)
        not_found.append(director)

    return coincidences.shape[0]

instances = 0
irrelevant_directors = list()
not_found_directors = list()
for director in top_directors:
    instances += search_director(director, irrelevant_directors, not_found_directors)
print(f"\nIrrelevant directors: {irrelevant_directors}")
print(f"\nDirectors not found: {not_found_directors}")
print(f"\nTotal instances: {instances}")


Concidences found for the director Steven Spielberg: 31
  Recent movies:
	1997: The Lost World: Jurassic Park
	2002: Catch Me If You Can
	2002: Minority Report
	2011: The Adventures of Tintin
	2005: War of the Worlds
	1998: Saving Private Ryan
	2015: Bridge of Spies
	2012: Lincoln
	2017: The Post
	2001: A.I. Artificial Intelligence
	2008: Indiana Jones and the Kingdom of the Crystal Skull
	2016: The BFG
	1997: Amistad
	2018: Ready Player One
	2005: Munich
	2004: The Terminal
	2011: War Horse
  Director name found as:
	['Steven Spielberg' 'Joe Dante,John Landis,George Miller,Steven Spielberg']

Concidences found for the director James Cameron: 7
  Recent movies:
	1997: Titanic
	2009: Avatar
  Director name found as:
	['James Cameron']

Concidences found for the director Anthony Russo: 6
  Recent movies:
	2014: Captain America: The Winter Soldier
	2006: You, Me and Dupree
	2018: Avengers: Infinity War
	2016: Captain America: Civil War
	2002: Welcome to Collinwood
	2019: Avengers: Endgam

In [45]:
'''
Counting Anthony Russo and Joe Russo as one: Russo Brothers, as they are together in all the instances
As all the directors in the list are valid, There's no need to remove any from the top_directors list.
'''

try:
  top_directors[top_directors.index('Anthony Russo')] = 'Russo Brothers'
  top_directors.remove('Joe Russo')
  movies_encoded['director'] = movies_encoded['director'].replace('Anthony Russo,Joe Russo', 'Russo Brothers')

  for director in irrelevant_directors:
    top_directors.remove(director)
  for director in not_found_directors:
    top_directors.remove(director)
except:
  print('Directors already removed and/or replaced')



In [46]:
def replace_directors(directors):
    directors_replacement = list()
    for director in directors:
      if director not in top_directors:
        directors_replacement.append('Other Directors')
      else:
        directors_replacement.append(director)

    directors_replacement = set(directors_replacement)

    if ("Other Directors" in directors_replacement) and (len(directors_replacement) > 1):
      directors_replacement.remove("Other Directors")
    return directors_replacement

try:
  movies_encoded['director'] = movies_encoded['director'].apply(lambda x: list(set(x.split(','))))
  movies_encoded['director'] = movies_encoded['director'].apply(replace_directors)

  # Delete duplicates
  movies_encoded['director'] = movies_encoded['director'].apply(lambda x: list(set(x)))
except:
  print('Director names already filtered and replaced')

In [47]:
try:
  one_hot_encoded_directors = movies_encoded['director'].apply(lambda x: ', '.join(x)).str.get_dummies(sep=', ')
  one_hot_encoded_directors = one_hot_encoded_directors.add_prefix('director_')
  movies_encoded = pd.concat([movies_encoded.drop(['director', 'director_ids'], axis=1), one_hot_encoded_directors], axis=1)
except:
  print('Hot encoding for directors already done')
movies_encoded.loc[:, 'director_Christopher Nolan':].head(10)

Unnamed: 0,director_Christopher Nolan,director_David Yates,director_J.J. Abrams,director_James Cameron,director_James Wan,director_Jon Favreau,director_Michael Bay,director_Other Directors,director_Peter Jackson,director_Ridley Scott,director_Robert Zemeckis,director_Ron Howard,director_Russo Brothers,director_Sam Raimi,director_Steven Spielberg,director_Tim Burton
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


#### *Actors Hot-Encoding*

- https://www.the-numbers.com/box-office-star-records/worldwide/lifetime-acting/top-grossing-leading-stars
- https://stephenfollows.com/which-actors-most-frequently-appear-in-excellent-awful-movies/

In [48]:
top_stars = [
    "Samuel L. Jackson" ,
    "Scarlett Johansson" ,
    "Robert Downey Jr." ,
    "Zoe Saldana" ,
    "Chris Pratt" ,
    "Tom Cruise" ,
    "Vin Diesel" ,
    "Chris Hemsworth" ,
    "Bradley Cooper" ,
    "Chris Evans" ,
    "Tom Hanks" ,
    "Johnny Depp" ,
    "Dwayne Johnson" ,
    "Tom Holland" ,
    "Mark Ruffalo" ,
    "Emma Watson" ,
    "Don Cheadle" ,
    "Dave Bautista" ,
    "Jeremy Renner" ,
    "Will Smith" ,
    "Karen Gillan" ,
    "Elizabeth Olsen" ,
    "Josh Brolin" ,
    "Daniel Radcliffe" ,
    "Benedict Cumberbatch" ,
    "Harrison Ford" ,
    "Chadwick Boseman" ,
    "Rupert Grint" ,
    "Letitia Wright" ,
    "Leonardo DiCaprio" ,
    "Steve Carell" ,
    "Sebastian Stan" ,
    "Matt Damon" ,
    "Danai Gurira" ,
    "Tom Hiddleston" ,
    "Brad Pitt" ,
    "Paul Bettany" ,
    "Jack Black" ,
    "Bruce Willis" ,
    "Eddie Murphy" ,
    "Liam Neeson" ,
    "Pom Klementieff" ,
    "Benedict Wong" ,
    "Sam Worthington" ,
    "Ben Stiller" ,
    "Hugh Jackman" ,
    "Jason Statham" ,
    "Ian McKellen" ,
    "Gwyneth Paltrow" ,
    "Jennifer Lawrence" ,
    "Mark Wahlberg" ,
    "Nicolas Cage" ,
    "Cameron Diaz" ,
    "Ewan McGregor" ,
    "Christian Bale" ,
]

In [49]:
# This function searches for coincidences of a star name in the DataFrame
def search_star(star, irrelevants:list, not_found:list):

    coincidences = movies_encoded[movies_encoded['star'].str.contains(star, case=True)]
    if not coincidences.empty:

        print(f"\nConcidences found for the star {star}: {coincidences.shape[0]}")

        '''
        Printing recent movies of the star for checking validity
        Sources:
        - https://stephenfollows.com/how-long-is-the-typical-film-actors-career/#:~:text=The%20average%20career%20length%20was,between%2020%20and%2040%20years.
        - https://www.cbr.com/long-acting-breaks-that-actors-were-able-to-successfully-return-from/
        - https://brightside.me/articles/10-actors-who-returned-to-the-screen-after-a-long-hiatus-809693/

        Longest period break found: 13 years
        Selected tolerance period: 15 years
        If stars didn't appear in a movie within this period, they're not relevant anymore
        '''


        print("  Recent movies:")
        relevants = coincidences[coincidences['year'] >= (2022 - 15)]
        if relevants.empty:
            irrelevants.append(star)
        else:
          for relevant in relevants.iterrows():
            print(f"\t{relevant[1]['year']}: {relevant[1]['title']}")


        # Printing the way the star is found (whether alone, or with other actors)
        print("  Star name found as:")
        print(f"\t{coincidences['star'].unique()}")
    else:
        print("\n No coincidences found for this star: ", star)
        not_found.append(star)

    return coincidences.shape[0]

instances = 0
irrelevant_stars = list()
not_found_stars = list()
for star in top_stars:
    instances += search_star(star, irrelevant_stars, not_found_stars)
print(f"\nIrrelevant stars: {irrelevant_stars}")
print(f"\nStars not found: {not_found_stars}")
print(f"\nTotal instances: {instances}")


Concidences found for the star Samuel L. Jackson: 59
  Recent movies:
	2012: The Samaritan
	2014: Captain America: The Winter Soldier
	2017: The Hitman's Bodyguard
	2014: Big Game
	2015: The Hateful Eight
	2011: Captain America: The First Avenger
	2019: Captain Marvel
	2008: Jumper
	2008: The Spirit
	2019: Spider-Man: Far from Home
	2013: Turbo
	2016: Miss Peregrine's Home for Peculiar Children
	2007: Resurrecting the Champ
	2019: Glass
	2013: Oldboy
	2014: Kingsman: The Secret Service
	2007: 1408
	2019: Shaft
	2017: Kong: Skull Island
	2008: Lakeview Terrace
	2012: Meeting Evil
  Star name found as:
	['Bruce Willis,Samuel L. Jackson,Robin Wright,Spencer Treat Clark'
 'James Bond III,Kadeem Hardison,Bill Nunn,Samuel L. Jackson'
 'David Caruso,Samuel L. Jackson,Nicolas Cage,Helen Hunt'
 'Samuel L. Jackson,Luke Kirby,Ruth Negga,A.C. Peterson'
 'Chris Evans,Samuel L. Jackson,Scarlett Johansson,Robert Redford'
 'Dustin Hoffman,Sharon Stone,Samuel L. Jackson,Peter Coyote'
 'Ryan Reynolds,S

In [50]:
'''
As all the starts in the list are still relevant, There's no need to remove any from the top_stars list for this particular reason.
However, removing not found stars from top_stars list
'''
try:
  for star in not_found_stars:
    top_stars.remove(star)
  for star in irrelevant_stars:
    top_stars.remove(star)
except:
  print('Stars already removed and/or replaced')

In [51]:
def replace_stars(stars):
  stars_replacements = list()
  for star in stars:
    if star not in top_stars:
      stars_replacements.append('Other Stars')
    else:
      stars_replacements.append(star)

  stars_replacements = set(stars_replacements)
  if ("Other Stars" in stars_replacements) and (len(stars_replacements) > 1):
    stars_replacements.remove("Other Stars")
  return stars_replacements

try:
  movies_encoded['star'] = movies_encoded['star'].apply(lambda x: list(set(x.split(','))))
  movies_encoded['star'] = movies_encoded['star'].apply(replace_stars)

  # Delete duplicates
  movies_encoded['star'] = movies_encoded['star'].apply(lambda x: list(set(x)))
except:
  print('Stars names already filtered and replaced')

In [52]:
try:
  one_hot_encoded_stars = movies_encoded['star'].apply(lambda x: ', '.join(x)).str.get_dummies(sep=', ')
  one_hot_encoded_stars = one_hot_encoded_stars.add_prefix('star_')
  movies_encoded = pd.concat([movies_encoded.drop(['star', 'star_ids'], axis=1), one_hot_encoded_stars], axis=1)
except:
  print('Hot encoding for stars already done')
movies_encoded.loc[:, 'star_Ben Stiller':].head(10)

Unnamed: 0,star_Ben Stiller,star_Benedict Cumberbatch,star_Benedict Wong,star_Brad Pitt,star_Bradley Cooper,star_Bruce Willis,star_Cameron Diaz,star_Chadwick Boseman,star_Chris Evans,star_Chris Hemsworth,star_Chris Pratt,star_Christian Bale,star_Danai Gurira,star_Daniel Radcliffe,star_Dave Bautista,star_Don Cheadle,star_Dwayne Johnson,star_Eddie Murphy,star_Elizabeth Olsen,star_Emma Watson,star_Ewan McGregor,star_Gwyneth Paltrow,star_Harrison Ford,star_Hugh Jackman,star_Ian McKellen,star_Jack Black,star_Jason Statham,star_Jennifer Lawrence,star_Jeremy Renner,star_Johnny Depp,star_Josh Brolin,star_Karen Gillan,star_Leonardo DiCaprio,star_Liam Neeson,star_Mark Ruffalo,star_Mark Wahlberg,star_Matt Damon,star_Nicolas Cage,star_Other Stars,star_Paul Bettany,star_Robert Downey Jr.,star_Rupert Grint,star_Sam Worthington,star_Samuel L. Jackson,star_Scarlett Johansson,star_Sebastian Stan,star_Steve Carell,star_Tom Cruise,star_Tom Hanks,star_Tom Hiddleston,star_Tom Holland,star_Vin Diesel,star_Will Smith,star_Zoe Saldana
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### *Production Companies Hot-Encoding*

Selecting the top 30 most important companies

Source: https://www.the-numbers.com/movies/production-companies/#production_companies_overview=od3

In [53]:
# Original selected list of production companies
top_producers = [
    "Warner Bros",
    "Universal Pictures",
    "Columbia Pictures",
    "Marvel Studios",
    "Walt Disney Pictures",
    "Paramount",
    "20th Century Fox",
    "Legendary Pictures",
    "New Line Cinema",
    "DreamWorks Animation",
    "Dune Entertainment",
    "Amblin Entertainment",
    "Disney-Pixar",
    "Relativity Media",
    "Metro-Goldwyn-Mayer Pictures",
    "Village Roadshow Productions",
    "DreamWorks Pictures",
    "Heyday Films",
    "Regency Enterprises",
    "Lucasfilm",
    "Walt Disney Animation Studios",
    "Lionsgate",
    "TSG Entertainment",
    "RatPac Entertainment",
    "Illumination Entertainment",
    "Original Film",
    "Skydance Productions",
    "Summit Entertainment",
    "Touchstone Pictures",
    "di Bonaventura Pictures"
]

In [54]:
# Simplified version to find all the variants of the production companies names
top_producers = [
    "Warner",
    "Universal",
    "Columbia",
    "Marvel",
    "Disney",
    "Paramount",
    "Fox",
    "Century",
    "Legendary",
    "New Line",
    "Dune",
    "Amblin",
    "Pixar",
    "Relativity",
    "Metro-Goldwyn-Mayer",
    "Village Roadshow",
    "DreamWorks",
    "Heyday",
    "Regency",
    "Lucasfilm",
    "Lionsgate",
    "TSG",
    "RatPac",
    "Illumination",
    "Original Film",
    "Skydance",
    "Summit",
    "Touchstone",
    "Bonaventura"
]

In [55]:
def find_coincidence(production_companies, producer):
  producer = producer.lower()
  coincidence = list()

  for production_company_row in production_companies:
    #print(production_company_row)

    for production_company in production_company_row:
      if producer in production_company.lower():
        coincidence.append(production_company )

    #print(coincidence)
  if coincidence == []:
    return False, 0
  else:
    return set(coincidence), len(coincidence)

# This function searches for coincidences of a star name in the DataFrame
def search_producer(producer, not_found:list):

    coincidences, num_coincidences = find_coincidence(movies_encoded['production_companies'], producer)

    if coincidences:

        print(f"\nConcidences found for the producer {producer}: {num_coincidences}")

        # Printing the way the producer is found (whether alone, or with other actors)
        print("  Producer name found as:")
        print(f"\t{coincidences}")
    else:
        #print("\n No coincidences found for this producer: ", producer,"\n")
        not_found.append(producer)

    return num_coincidences

instances = 0

not_found_producers = list()
for producer in top_producers:
    instances += search_producer(producer, not_found_producers)
    #print(f"\n{find_coincidence(movies_encoded['production_companies'], producer)} \n")

print(f"\nProducers not found: {not_found_producers}")
print(f"\nTotal instances: {instances}")


Concidences found for the producer Warner: 762
  Producer name found as:
	{'warner china film hg corporation', 'warner premiere', 'warner bros. pictures animation', 'warner bros. entertainment germany', 'warner bros. japan', 'warner bros. entertainment', 'warnervision films', 'warner bros. pictures', 'warner bros. family entertainment', 'warner bros. entertainment españa', 'warner bros. digital', 'warner music uk', 'warner-pathé distributors', 'warner bros. entertainment france', 'warner bros. television', 'warner independent pictures (wip)', 'warner bros-seven arts', 'warner bros. korea', 'warner bros. animation'}

Concidences found for the producer Universal: 645
  Producer name found as:
	{'universal production partners', 'universal pictures do brasil', 'universal pictures international entertainment (upie)', 'universal pictures france', 'universal city studios', 'universal pictures japan', 'universal animation studios', 'wr universal group', 'universal studios home entertainment',

In [56]:
# After manually reviewing the found variants of the selected production companies
replacements = {
  "Warner Bros": [
    'warner bros. television',
    'warner bros. korea',
    'warner bros. pictures animation',
    'warner independent pictures (wip)',
    'warner premiere',
    'warner bros-seven arts',
    'warner bros. entertainment españa',
    'warner bros. entertainment germany',
    'warner bros. entertainment france',
    'warner china film hg corporation',
    'warner bros. pictures',
    'warner bros. digital',
    'warner-pathé distributors',
    'warner bros. entertainment',
    'warner bros. family entertainment',
    'warner bros. japan',
    'warner bros. animation'
  ],
  "Universal Pictures": [
    'universal city studios',
    'universal pictures japan',
    'universal pictures international entertainment (upie)',
    'universal pictures international (upi)',
    'universal animation studios',
    'universal international pictures',
    'universal studios home entertainment',
    'universal film manufacturing company',
    'universal 1440 entertainment',
    'universal pictures do brasil',
    'universal productions france s.a.',
    'universal pictures',
    'universal pictures france'
  ],
  "Columbia Pictures": [
    'columbia pictures film production asia limited',
    'columbia pictures producciones mexico',
    'columbia pictures film production asia',
    'columbia pictures'
  ],
  "Marvel Studios": [
    'marvel enterprises',
    'marvel entertainment',
    'marvel studios'
  ],
  "Disney": [
    'walt disney productions',
    'the walt disney company nordic',
    'walt disney animation',
    'walt disney feature animation',
    'walt disney studios home entertainment',
    'disney television animation',
    'disneytoon studios',
    'the walt disney company (japan)',
    'walt disney animation studios',
    'walt disney pictures'
  ],
  "Paramount": [
    'paramount',
    'paramount famous lasky corporation',
    'paramount players',
    'paramount famous productions',
    'paramount pictures canada',
    'paramount animation',
    'paramount vantage',
    'paramount home entertainment'
  ],
  "20th Century Studios": [
    '20th century fox home entertainment',
    '20th century fox',
    '20th century fox argentina',
    '20th century pictures',
    '20th century fox japan',
    '20th century studios',
    '20th century fox animation',
    '20th century fox television',
    '20th century fox korea',
    'twentieth century-fox productions',
    '20th century fox brazil',
    'fox international productions germany',
    '20th century fox home entertainment',
    'fox atomic',
    'fox searchlight pictures',
    'cbs fox video',
    'fox 2000 pictures',
    'fox international productions india',
    'fox family films',
    'fox international productions',
    'fox film corporation',
    'fox animation studios',
    'fox films ltd.',
    'fox international productions japan',
    'fox international productions spain',
    'fox international productions korea'
  ],
  "Legendary": [
    'legendary pictures',
    'legendary east'
  ],
  "New Line Cinema": [
    'new line cinema'
  ],
  "Amblin": [
    'amblin partners',
    'amblin entertainment'
  ],
  "Pixar": [
    'pixar'
  ],
  "Relativity Media": [
    'relativity media',
    'relativity sports'
  ],
  "Metro-Goldwyn-Mayer": [
    'metro-goldwyn-mayer british studios',
    'metro-goldwyn-mayer'
  ],
  "Village Roadshow": [
    'village roadshow entertainment',
    'village roadshow pictures',
    'village roadshow pictures asia'
  ],
  "DreamWorks": [
    'dreamworks pictures',
    'dreamworks animation'
  ],
  "Heyday Films": [
    'heyday films'
  ],
  "Regency Enterprises": [
    'regency international pictures',
    'regency enterprises',
    'new regency productions',
    'new regency pictures'
  ],
  "Lucasfilm": [
    'lucasfilm animation',
    'lucasfilm ltd.'
  ],
  "Lionsgate": [
    'lionsgate',
    'lionsgate home entertainment',
    'lionsgate premiere'
  ],
  "TSG Entertainment": [
    'tsg entertainment'
  ],
  "RatPac-Dune Entertainment": [
    'ratpac entertainment',
    'dune entertainment',
    'dune entertainment iii',
    'dune films'
  ],
  "Illumination": [
    'illumination',
    'illumination films'
  ],
  "Original Film": [
    'original film'
  ],
  "Skydance": [
    'skydance'
  ],
  "Summit Entertainment": [
    'summit entertainment'
  ],
  "Touchstone Pictures": [
    'touchstone pictures'
  ],
  "di Bonaventura Pictures": [
    'di bonaventura pictures'
  ]
}


In [57]:
def replace_values(producers_list):
    for new_value, current_values in replacements.items():
        for value_to_replace in current_values:
            if value_to_replace in producers_list:
              producers_list[producers_list.index(value_to_replace)] = new_value
    return producers_list

movies_encoded['production_companies'] = movies_encoded['production_companies'].apply(replace_values)

In [58]:
top_producers = list(replacements.keys())

def replace_producers(producers):
  producer_replacement = list()

  for producer in producers:
    if producer not in top_producers:
      producer_replacement.append('Other Production Companies')
    else:
      producer_replacement.append(producer)

  producer_replacement = set(producer_replacement)
  if ('Other Production Companies' in producer_replacement) and (len(producer_replacement) > 1):
    producer_replacement.remove('Other Production Companies')
  return producer_replacement

try:
  movies_encoded['production_companies'] = movies_encoded['production_companies'].apply(replace_producers)

  # Delete duplicates
  movies_encoded['production_companies'] = movies_encoded['production_companies'].apply(lambda x: list(set(x)))

except:
  print('Stars names already filtered and replaced')


In [59]:
try:
  one_hot_encoded_producers = movies_encoded['production_companies'].apply(lambda x: ', '.join(x)).str.get_dummies(sep=', ')
  one_hot_encoded_producers = one_hot_encoded_producers.add_prefix('producer_')
  movies_encoded = pd.concat([movies_encoded.drop(['production_companies'], axis=1), one_hot_encoded_producers], axis=1)
except:
  print('Hot encoding for stars already done')
movies_encoded.loc[:, 'producer_20th Century Studios':].head(10)

Unnamed: 0,producer_20th Century Studios,producer_Amblin,producer_Columbia Pictures,producer_Disney,producer_DreamWorks,producer_Heyday Films,producer_Illumination,producer_Legendary,producer_Lionsgate,producer_Lucasfilm,producer_Marvel Studios,producer_Metro-Goldwyn-Mayer,producer_New Line Cinema,producer_Original Film,producer_Other Production Companies,producer_Paramount,producer_Pixar,producer_RatPac-Dune Entertainment,producer_Regency Enterprises,producer_Relativity Media,producer_Skydance,producer_Summit Entertainment,producer_TSG Entertainment,producer_Touchstone Pictures,producer_Universal Pictures,producer_Village Roadshow,producer_Warner Bros,producer_di Bonaventura Pictures
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### *Hot-encoding of seasons*

In [60]:
try:
  one_hot_encoded_seasons = pd.get_dummies(movies_encoded['season'], prefix='season').astype(int)
  movies_encoded = pd.concat([movies_encoded.drop('season', axis=1), one_hot_encoded_seasons], axis=1)
except:
  print('Hot encoding for seasons already done')
movies_encoded.loc[:, 'season_Fall':].head(10)

Unnamed: 0,season_Fall,season_Spring,season_Summer,season_Winter
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,1,0,0
4,0,0,1,0
5,1,0,0,0
6,0,1,0,0
7,0,1,0,0
8,0,0,1,0
9,0,1,0,0


In [61]:
for column_name in movies_encoded.columns:
    print(column_name)

print(movies_encoded.shape)
movies_encoded.describe(include='all')

title
year
runtime
gross_adjusted
budget_adjusted
revenue_adjusted
Weighted_Rating
genre_action
genre_adult
genre_adventure
genre_animation
genre_biography
genre_comedy
genre_crime
genre_documentary
genre_drama
genre_family
genre_fantasy
genre_history
genre_horror
genre_musical
genre_mystery
genre_romance
genre_sci-fi
genre_sport
genre_thriller
genre_tvmovie
genre_war
genre_western
country_Australia
country_Canada
country_China
country_France
country_Germany
country_Hong Kong
country_India
country_Italy
country_Japan
country_New Zealand
country_Other Countries
country_South Korea
country_Spain
country_United Kingdom
country_United States of America
director_Christopher Nolan
director_David Yates
director_J.J. Abrams
director_James Cameron
director_James Wan
director_Jon Favreau
director_Michael Bay
director_Other Directors
director_Peter Jackson
director_Ridley Scott
director_Robert Zemeckis
director_Ron Howard
director_Russo Brothers
director_Sam Raimi
director_Steven Spielberg
direct

Unnamed: 0,title,year,runtime,gross_adjusted,budget_adjusted,revenue_adjusted,Weighted_Rating,genre_action,genre_adult,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,genre_fantasy,genre_history,genre_horror,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_sport,genre_thriller,genre_tvmovie,genre_war,genre_western,country_Australia,country_Canada,country_China,country_France,country_Germany,country_Hong Kong,country_India,country_Italy,country_Japan,country_New Zealand,country_Other Countries,country_South Korea,country_Spain,country_United Kingdom,country_United States of America,director_Christopher Nolan,director_David Yates,director_J.J. Abrams,director_James Cameron,director_James Wan,director_Jon Favreau,director_Michael Bay,director_Other Directors,director_Peter Jackson,director_Ridley Scott,director_Robert Zemeckis,director_Ron Howard,director_Russo Brothers,director_Sam Raimi,director_Steven Spielberg,director_Tim Burton,star_Ben Stiller,star_Benedict Cumberbatch,star_Benedict Wong,star_Brad Pitt,star_Bradley Cooper,star_Bruce Willis,star_Cameron Diaz,star_Chadwick Boseman,star_Chris Evans,star_Chris Hemsworth,star_Chris Pratt,star_Christian Bale,star_Danai Gurira,star_Daniel Radcliffe,star_Dave Bautista,star_Don Cheadle,star_Dwayne Johnson,star_Eddie Murphy,star_Elizabeth Olsen,star_Emma Watson,star_Ewan McGregor,star_Gwyneth Paltrow,star_Harrison Ford,star_Hugh Jackman,star_Ian McKellen,star_Jack Black,star_Jason Statham,star_Jennifer Lawrence,star_Jeremy Renner,star_Johnny Depp,star_Josh Brolin,star_Karen Gillan,star_Leonardo DiCaprio,star_Liam Neeson,star_Mark Ruffalo,star_Mark Wahlberg,star_Matt Damon,star_Nicolas Cage,star_Other Stars,star_Paul Bettany,star_Robert Downey Jr.,star_Rupert Grint,star_Sam Worthington,star_Samuel L. Jackson,star_Scarlett Johansson,star_Sebastian Stan,star_Steve Carell,star_Tom Cruise,star_Tom Hanks,star_Tom Hiddleston,star_Tom Holland,star_Vin Diesel,star_Will Smith,star_Zoe Saldana,producer_20th Century Studios,producer_Amblin,producer_Columbia Pictures,producer_Disney,producer_DreamWorks,producer_Heyday Films,producer_Illumination,producer_Legendary,producer_Lionsgate,producer_Lucasfilm,producer_Marvel Studios,producer_Metro-Goldwyn-Mayer,producer_New Line Cinema,producer_Original Film,producer_Other Production Companies,producer_Paramount,producer_Pixar,producer_RatPac-Dune Entertainment,producer_Regency Enterprises,producer_Relativity Media,producer_Skydance,producer_Summit Entertainment,producer_TSG Entertainment,producer_Touchstone Pictures,producer_Universal Pictures,producer_Village Roadshow,producer_Warner Bros,producer_di Bonaventura Pictures,season_Fall,season_Spring,season_Summer,season_Winter
count,11240,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0,11240.0
unique,10808,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,Little Women,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,1997.196263,107.234431,51649810.0,29771750.0,92643850.0,6.323871,0.271085,0.000267,0.206139,0.04911,0.076335,0.374466,0.238701,0.001335,0.62927,0.111121,0.11806,0.074377,0.121441,0.054804,0.123221,0.319217,0.099644,0.027402,0.295018,0.000356,0.051246,0.017794,0.017972,0.038256,0.017527,0.0629,0.029626,0.007651,0.031139,0.012544,0.017171,0.003114,0.091103,0.010409,0.008452,0.080338,0.571797,0.000979,0.000623,0.000534,0.000623,0.000801,0.000801,0.001157,0.981851,0.000979,0.002313,0.001512,0.001601,0.000534,0.001246,0.002758,0.00169,0.00258,0.000712,0.000356,0.003114,0.00169,0.004626,0.002847,0.000534,0.002135,0.001335,0.001157,0.00258,0.000178,0.001246,0.000534,0.001868,0.002936,0.002936,0.00089,0.001246,0.003559,0.002402,0.003648,0.002491,0.002135,0.00169,0.002758,0.001512,0.001335,0.003737,0.001868,0.000267,0.001868,0.004537,0.002313,0.003114,0.003203,0.004893,0.900801,0.001246,0.003648,0.000801,0.001335,0.005249,0.002669,0.000712,0.002046,0.003114,0.004093,0.00089,0.000712,0.002046,0.002224,0.001423,0.062544,0.006851,0.046708,0.024911,0.012544,0.001868,0.001068,0.004093,0.013968,0.002224,0.004181,0.038167,0.022064,0.00427,0.584431,0.051779,0.001957,0.010053,0.009964,0.010142,0.001601,0.011121,0.004804,0.016637,0.056495,0.009786,0.065125,0.002669,0.297776,0.235587,0.236566,0.230071
std,,19.084275,22.282695,121145100.0,51869590.0,248055700.0,0.99058,0.44454,0.016336,0.404549,0.216108,0.265544,0.484006,0.426309,0.036508,0.483022,0.314296,0.322694,0.262395,0.326654,0.227608,0.328705,0.466194,0.299538,0.163259,0.456071,0.018862,0.220508,0.132206,0.132854,0.191823,0.131229,0.242794,0.169562,0.08714,0.1737,0.111302,0.129913,0.055718,0.287769,0.101498,0.091549,0.271828,0.49484,0.031269,0.024949,0.023099,0.024949,0.028287,0.028287,0.03399,0.133498,0.031269,0.048042,0.038863,0.039988,0.023099,0.035272,0.052447,0.041081,0.050731,0.02667,0.018862,0.055718,0.041081,0.067863,0.053283,0.023099,0.046161,0.036508,0.03399,0.050731,0.013339,0.035272,0.023099,0.043186,0.054107,0.054107,0.029816,0.035272,0.059551,0.048955,0.060289,0.049851,0.046161,0.041081,0.052447,0.038863,0.036508,0.061017,0.043186,0.016336,0.043186,0.06721,0.048042,0.055718,0.056506,0.069783,0.298943,0.035272,0.060289,0.028287,0.036508,0.072264,0.051596,0.02667,0.045191,0.055718,0.063845,0.029816,0.02667,0.045191,0.047111,0.037704,0.242153,0.082488,0.211022,0.155861,0.111302,0.043186,0.032658,0.063845,0.117363,0.047111,0.064532,0.191608,0.146898,0.065212,0.492842,0.221591,0.0442,0.099766,0.099328,0.100202,0.039988,0.104873,0.069149,0.127913,0.230885,0.098446,0.246756,0.051596,0.457301,0.424384,0.424992,0.420897
min,,1913.0,0.0,1.177315,0.0,0.0,1.301903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,1989.0,94.0,613590.1,0.0,0.0,5.719729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,2002.0,103.0,10824100.0,5878249.0,7883950.0,6.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,2011.0,116.0,56138270.0,38635080.0,79137590.0,6.998651,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [62]:
# Delete rows with low budget
threshold = movies_encoded['budget_adjusted'].quantile(0.422)

movies_encoded_budget_filtered = movies_encoded[movies_encoded['budget_adjusted'] > threshold]
movies_encoded_budget_filtered

Unnamed: 0,title,year,runtime,gross_adjusted,budget_adjusted,revenue_adjusted,Weighted_Rating,genre_action,genre_adult,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,genre_family,genre_fantasy,genre_history,genre_horror,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_sport,genre_thriller,genre_tvmovie,genre_war,genre_western,country_Australia,country_Canada,country_China,country_France,country_Germany,country_Hong Kong,country_India,country_Italy,country_Japan,country_New Zealand,country_Other Countries,country_South Korea,country_Spain,country_United Kingdom,country_United States of America,director_Christopher Nolan,director_David Yates,director_J.J. Abrams,director_James Cameron,director_James Wan,director_Jon Favreau,director_Michael Bay,director_Other Directors,director_Peter Jackson,director_Ridley Scott,director_Robert Zemeckis,director_Ron Howard,director_Russo Brothers,director_Sam Raimi,director_Steven Spielberg,director_Tim Burton,star_Ben Stiller,star_Benedict Cumberbatch,star_Benedict Wong,star_Brad Pitt,star_Bradley Cooper,star_Bruce Willis,star_Cameron Diaz,star_Chadwick Boseman,star_Chris Evans,star_Chris Hemsworth,star_Chris Pratt,star_Christian Bale,star_Danai Gurira,star_Daniel Radcliffe,star_Dave Bautista,star_Don Cheadle,star_Dwayne Johnson,star_Eddie Murphy,star_Elizabeth Olsen,star_Emma Watson,star_Ewan McGregor,star_Gwyneth Paltrow,star_Harrison Ford,star_Hugh Jackman,star_Ian McKellen,star_Jack Black,star_Jason Statham,star_Jennifer Lawrence,star_Jeremy Renner,star_Johnny Depp,star_Josh Brolin,star_Karen Gillan,star_Leonardo DiCaprio,star_Liam Neeson,star_Mark Ruffalo,star_Mark Wahlberg,star_Matt Damon,star_Nicolas Cage,star_Other Stars,star_Paul Bettany,star_Robert Downey Jr.,star_Rupert Grint,star_Sam Worthington,star_Samuel L. Jackson,star_Scarlett Johansson,star_Sebastian Stan,star_Steve Carell,star_Tom Cruise,star_Tom Hanks,star_Tom Hiddleston,star_Tom Holland,star_Vin Diesel,star_Will Smith,star_Zoe Saldana,producer_20th Century Studios,producer_Amblin,producer_Columbia Pictures,producer_Disney,producer_DreamWorks,producer_Heyday Films,producer_Illumination,producer_Legendary,producer_Lionsgate,producer_Lucasfilm,producer_Marvel Studios,producer_Metro-Goldwyn-Mayer,producer_New Line Cinema,producer_Original Film,producer_Other Production Companies,producer_Paramount,producer_Pixar,producer_RatPac-Dune Entertainment,producer_Regency Enterprises,producer_Relativity Media,producer_Skydance,producer_Summit Entertainment,producer_TSG Entertainment,producer_Touchstone Pictures,producer_Universal Pictures,producer_Village Roadshow,producer_Warner Bros,producer_di Bonaventura Pictures,season_Fall,season_Spring,season_Summer,season_Winter
1,Say It Isn't So,2001,95,9.491541e+06,4.301270e+07,2.119734e+07,4.998175,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Killers,2010,100,6.575955e+07,1.048017e+08,1.371645e+08,5.409580,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,The Pope of Greenwich Village,1984,121,1.800615e+07,2.346117e+07,2.004816e+07,6.599022,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5,House at the End of the Street,2012,101,4.195325e+07,9.157224e+06,5.877496e+07,5.505548,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
7,The Man from Snowy River,1982,104,6.523282e+07,1.105137e+07,6.523282e+07,7.195770,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11226,Encanto,2021,102,1.080559e+08,5.622431e+07,2.844950e+08,7.200260,0,0,1,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
11227,The Battle at Lake Changjin,2021,178,3.850364e+05,2.248972e+08,1.014895e+09,5.404897,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
11233,Battle of the Brides,2011,101,8.710338e+04,4.063795e+04,0.000000e+00,5.916949,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
11235,Baptists at Our Barbecue,2004,92,2.792258e+05,8.065167e+05,2.795484e+05,5.493840,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [63]:
# Save dataframes as csv files
from pathlib import Path

filepath_df1 = Path('./movies_encoded.csv')
filepath_df1.parent.mkdir(parents=True, exist_ok=True)
movies_encoded.to_csv(filepath_df1, index=True)

filepath_df2 = Path('./movies_encoded_budget_filtered.csv')
filepath_df2.parent.mkdir(parents=True, exist_ok=True)
movies_encoded_budget_filtered.to_csv(filepath_df2, index=True)