Revenue Prediction
==================
In this project the goal is to predict movies revenue using their features. revenue is given by ( box_office - budget ) so the main goal is to predict box office. 

1. Prepare and preprocess the given data. 

2. After exploring data find, select and especially create new features. ignore others.

3. Prepare features to feed the model. 

4. Select and try different models.

5. Document and report each step using relative plots and a brief explanation. finally report the best suited model and justify why did it performed well.



- Keep in mind that in this task accuracy itself only has only part of score.

- Hint: to create new features you can use credit attributes. Think of it this way, what affects box office?  

**Tools** 

importing useful tools and libraries. you may use any other library as well.

In [669]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import datetime
from tabulate import tabulate


# Modelling
from sklearn import preprocessing, svm 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [670]:
# Your project struct must look like this,


# |── Name_SID.zip
# │   ├── data
# │   │       ├── rotten_tomatoes_5000_movies.csv
# │   │       ├── rotten_tomatoes_5000_movies.csv
# │   ├── *.ipynb
# │   ├── document.pdf

df_movies = pd.read_csv(r"data/rotten_tomatoes_5000_movies.csv")
df_credit = pd.read_csv(r"data/rotten_tomatoes_5000_credits.csv")


In [671]:
df_movies.rename(columns={
    'rt_production_budget': 'production_budget',
    'rt_genres': 'movie_genres',
    'rt_website': 'official_website',
    'rt_movie_id': 'movie_id',
    'rt_keywords': 'movie_keywords',
    'rt_original_language': 'original_language',
    'rt_original_title': 'original_title',
    'rt_synopsis': 'synopsis',
    'rt_audience_score': 'audience_score',
    'rt_studios': 'production_studios',
    'rt_production_countries': 'production_countries',
    'rt_release_date': 'release_date',
    'rt_box_office': 'box_office_revenue',
    'rt_runtime': 'movie_runtime',
    'rt_languages': 'languages_available',
    'rt_release_status': 'release_status',
    'rt_tagline': 'movie_tagline',
    'rt_title': 'movie_title',
    'rt_critics_score': 'critics_score',
    'rt_review_count': 'review_count'
}, inplace=True)

df_credit.rename(columns={
    'rt_movie_id': 'movie_id',
    'rt_title': 'movie_title',
    'rt_actors': 'actors',
    'rt_staff': 'staff'
}, inplace=True)

df_movies['release_date'] = pd.to_datetime(df_movies['release_date'])


In [672]:
def create_dict_from_column(df_column, key_field, value_field):
    result_dict = {}
    for data_str in df_column:
        data_list = json.loads(data_str)
        for item in data_list:
            result_dict[item[key_field]] = item[value_field]
            result_dict[item[value_field]] = item[key_field]
    return result_dict

all_genres_dict = create_dict_from_column(df_movies['movie_genres'], 'id', 'name')
all_keywords_dict = create_dict_from_column(df_movies['movie_keywords'], 'id', 'name')
all_studios_dict = create_dict_from_column(df_movies['production_studios'], 'id', 'name')
all_countries_dict = create_dict_from_column(df_movies['production_countries'], 'iso_3166_1', 'name')
all_languages_dict = create_dict_from_column(df_movies['languages_available'], 'iso_639_1', 'name')


In [673]:
def replace_with_ids_or_iso(data_str, key_field):
    data_list = json.loads(data_str)
    return sorted([item[key_field] for item in data_list])

key_field_map = {
    'movie_genres': 'id',
    'movie_keywords': 'id',
    'production_studios': 'id',
    'production_countries': 'iso_3166_1',
    'languages_available': 'iso_639_1'
}

for column, key_field in key_field_map.items():
    df_movies[column] = df_movies[column].apply(replace_with_ids_or_iso, key_field=key_field)


In [674]:
df_movies.head()

Unnamed: 0,production_budget,movie_genres,official_website,movie_id,movie_keywords,original_language,original_title,synopsis,audience_score,production_studios,production_countries,release_date,box_office_revenue,movie_runtime,languages_available,release_status,movie_tagline,movie_title,critics_score,review_count
0,237000000,"[12, 14, 28, 878]",http://www.avatarmovie.com/,19995,"[1463, 2964, 3386, 3388, 3679, 3801, 9685, 984...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[289, 306, 444, 574]","[GB, US]",2009-12-10,2787965087,162.0,"[en, es]",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[12, 14, 28]",http://disney.go.com/disneypictures/pirates/,285,"[270, 726, 911, 1319, 2038, 2052, 2580, 2660, ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[2, 130, 19936]",[US],2007-05-19,961000000,169.0,[en],Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[12, 28, 80]",http://www.sonypictures.com/movies/spectre/,206647,"[470, 818, 4289, 9663, 14555, 156095, 158431]",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[5, 10761, 69434]","[GB, US]",2015-10-26,880674609,148.0,"[de, en, es, fr, it]",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[18, 28, 53, 80]",http://www.thedarkknightrises.com/,49026,"[849, 853, 949, 1308, 1437, 3051, 3562, 6969, ...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[923, 6194, 9993, 9996]",[US],2012-07-16,1084939099,165.0,[en],Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[12, 28, 878]",http://movies.disney.com/john-carter,49529,"[818, 839, 1456, 3801, 7376, 9951, 10028, 1053...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[2],[US],2012-03-07,284139100,132.0,[en],Released,"Lost in our world, found in another.",John Carter,6.1,2124


### Q1. Calculate the average production budget per genre in the dataset.


In [675]:
budget_per_genre = {}

for genre_id, genre_name in all_genres_dict.items():
    if isinstance(genre_id, str):
        continue
    genre_movies = df_movies[df_movies['movie_genres'].apply(lambda genres: genre_id in genres)]
    total_budget = genre_movies['production_budget'].sum()
    count = len(genre_movies)

    budget_per_genre[genre_name] = {'avg': round(total_budget / count, 3), 'budget': total_budget, 'count': count,}

for genre_name, data in budget_per_genre.items():
    print(f"Genre: {genre_name}")
    print(f"    Average Budget:   ${data['avg']:,.2f}")
    print(f"    Total Budget:     ${data['budget']:,.0f}")
    print(f"    Number of Movies: {data['count']}")
    print('-' * 50)

Genre: Action
    Average Budget:   $51,510,750.95
    Total Budget:     $59,443,406,599
    Number of Movies: 1154
--------------------------------------------------
Genre: Adventure
    Average Budget:   $66,326,861.35
    Total Budget:     $52,398,220,463
    Number of Movies: 790
--------------------------------------------------
Genre: Fantasy
    Average Budget:   $63,560,605.18
    Total Budget:     $26,949,696,595
    Number of Movies: 424
--------------------------------------------------
Genre: Science Fiction
    Average Budget:   $51,865,551.15
    Total Budget:     $27,748,069,865
    Number of Movies: 535
--------------------------------------------------
Genre: Crime
    Average Budget:   $27,849,808.15
    Total Budget:     $19,383,466,474
    Number of Movies: 696
--------------------------------------------------
Genre: Drama
    Average Budget:   $20,678,324.84
    Total Budget:     $47,498,112,157
    Number of Movies: 2297
------------------------------------------

### Q2. Top 5 Genres: Country Budget Share Percentages
This section shows the budget share percentages for each country in the top 5 genres, filtering out countries with less than 0.01% share.


In [676]:
top_5_genres = sorted(budget_per_genre.items(), key=lambda x: x[1]['avg'], reverse=True)[:5]

country_budget_share_percent = {}

for genre_name, budget_data in top_5_genres:
    total_genre_budget = budget_data['budget']
    genre_id = all_genres_dict[genre_name]
    genre_movies = df_movies[df_movies['movie_genres'].apply(lambda genres: genre_id in genres)]

    country_budget_shares = {}

    for movie in genre_movies.itertuples():
        movie_budget = movie.production_budget
        countries = movie.production_countries

        for country_iso in countries:
            if country_iso not in country_budget_shares:
                country_budget_shares[country_iso] = 0
            country_budget_shares[country_iso] += movie_budget / len(countries)

    genre_country_percentages = {
        all_countries_dict[country_iso]: round((budget_share / total_genre_budget) * 100, 2)
        for country_iso, budget_share in country_budget_shares.items()
    }

    genre_country_percentages = {country: percent for country, percent in genre_country_percentages.items() if percent >= 0.01}

    genre_country_percentages = dict(sorted(genre_country_percentages.items(), key=lambda item: item[1], reverse=True))

    country_budget_share_percent[genre_name] = genre_country_percentages

country_budget_share_percent = dict(sorted(country_budget_share_percent.items(), key=lambda item: sum(item[1].values()), reverse=True))


for genre_name, budget_data in top_5_genres:
    print(f"Genre: {genre_name}")
    print("-" * (len(genre_name) + 7))

    for country, budget_share in country_budget_share_percent[genre_name].items():
        print(f"\t{country:<25} | Budget Share: {budget_share:>10.2f} %")

    print("\n" + "=" * 70)
    print()


Genre: Animation
----------------
	United States of America  | Budget Share:      86.96 %
	United Kingdom            | Budget Share:       2.58 %
	Australia                 | Budget Share:       1.97 %
	Japan                     | Budget Share:       1.76 %
	France                    | Budget Share:       1.41 %
	Germany                   | Budget Share:       1.35 %
	New Zealand               | Budget Share:       0.68 %
	China                     | Budget Share:       0.55 %
	India                     | Budget Share:       0.47 %
	Russia                    | Budget Share:       0.32 %
	Canada                    | Budget Share:       0.31 %
	South Africa              | Budget Share:       0.26 %
	Finland                   | Budget Share:       0.23 %
	Spain                     | Budget Share:       0.22 %
	Mexico                    | Budget Share:       0.19 %
	Belgium                   | Budget Share:       0.15 %
	Hong Kong                 | Budget Share:       0.14 %
	Argentina    

### Q3. Filtering movies from the past 10 years, counting the number of movies with exactly 3 genres per year, and displaying the results.


In [677]:
current_year = datetime.datetime.now().year
df_movies_past_10_years = df_movies[df_movies['release_date'].dt.year >= (current_year - 10)]
df_movies_exactly_3_genres = df_movies[df_movies['release_date'].dt.year >= (current_year - 10)].copy()
df_movies_exactly_3_genres['year'] = df_movies_exactly_3_genres['release_date'].dt.year
movies_count_per_year = df_movies_exactly_3_genres.groupby('year').size()
for year, count in movies_count_per_year.items():
    print(f"Year:   {year}   -   Movies Count:   {count}")

Year:   2015   -   Movies Count:   216
Year:   2016   -   Movies Count:   104
Year:   2017   -   Movies Count:   1


#### Q4. Calculates and prints the average movie runtime per country, sorted in descending order of runtime. Filters invalid movies, then aggregates and computes the average runtime for each country.


In [678]:
country_avg_runtime = {}

def is_not_valid(movie):
    if pd.isna(movie.movie_runtime):
        return True
    if not isinstance(movie.movie_runtime, float):
        return True
    if len(movie.production_countries) == 0:
        return True
    return False

for movie in df_movies.itertuples():
    if is_not_valid(movie):
        continue

    countries = movie.production_countries
    runtime = movie.movie_runtime

    for country_iso in countries:
        country = all_countries_dict[country_iso]
        if country not in country_avg_runtime:
            country_avg_runtime[country] = {'total_runtime': 0, 'movie_count': 0}
        country_avg_runtime[country]['total_runtime'] += runtime
        country_avg_runtime[country]['movie_count'] += 1

country_avg_runtime = {
    country: round(data['total_runtime'] / data['movie_count'], 2)
    for country, data in country_avg_runtime.items()
}

sorted_country_avg_runtime = dict(sorted(country_avg_runtime.items(), key=lambda item: item[1], reverse=True))
print("Country-wise Average Movie Runtime:")
print("-" * 50)

for country, avg_runtime in sorted_country_avg_runtime.items():
    print(f"{country:<25} | Average Runtime: {avg_runtime:>10} minutes")

print("-" * 50)


Country-wise Average Movie Runtime:
--------------------------------------------------
Libyan Arab Jamahiriya    | Average Runtime:      173.0 minutes
Dominica                  | Average Runtime:      151.0 minutes
Malta                     | Average Runtime:      143.0 minutes
Kyrgyz Republic           | Average Runtime:      135.0 minutes
Portugal                  | Average Runtime:      133.0 minutes
Singapore                 | Average Runtime:      132.5 minutes
Jamaica                   | Average Runtime:      130.5 minutes
Morocco                   | Average Runtime:      130.5 minutes
Slovakia                  | Average Runtime:      130.5 minutes
Philippines               | Average Runtime:      130.0 minutes
Ukraine                   | Average Runtime:      127.0 minutes
Taiwan                    | Average Runtime:      125.0 minutes
India                     | Average Runtime:     124.91 minutes
Italy                     | Average Runtime:     124.54 minutes
New Zealand      

#### Q5. This code counts the occurrences of each non-English language in the movies dataset, excluding English, and displays the results in a formatted table showing the language, its ISO code, and the count of movies in that language.


In [679]:
languages_count = {}
for movie in df_movies.itertuples():
    for language in movie.languages_available:
        language_full = all_languages_dict[language]
        if language_full != 'English':
            languages_count.setdefault(language_full, {'count': 0, 'iso': language})['count'] += 1

sorted_languages = sorted(languages_count.items(), key=lambda item: item[1]['count'], reverse=True)
print(tabulate([[language, data['iso'], data['count']] for language, data in sorted_languages], headers=["Language", "Iso Code", "Movie Count"], tablefmt="grid"))

+------------------+------------+---------------+
| Language         | Iso Code   |   Movie Count |
| Français         | fr         |           437 |
+------------------+------------+---------------+
| Español          | es         |           351 |
+------------------+------------+---------------+
| Deutsch          | de         |           262 |
+------------------+------------+---------------+
| Italiano         | it         |           188 |
+------------------+------------+---------------+
| Pусский          | ru         |           185 |
+------------------+------------+---------------+
| 普通话           | zh         |           107 |
+------------------+------------+---------------+
| 日本語           | ja         |            97 |
+------------------+------------+---------------+
| Português        | pt         |            68 |
+------------------+------------+---------------+
| العربية          | ar         |            67 |
+------------------+------------+---------------+
|     

#### Q6. This code calculates the average production budget per year for movies produced in the USA over the past 10 years, adjusting for shared production between countries and providing a detailed breakdown by year.


In [680]:
current_year = datetime.datetime.now().year

budget_per_year = {}
total_budget = 0

for movie in df_movies.itertuples():
    if pd.isna(movie.release_date):
        continue
    release_year = pd.to_datetime(movie.release_date).year
    if release_year < current_year - 10:
        continue
    if 'US' not in movie.production_countries:
        continue
    
    proportional_budget = movie.production_budget / len(movie.production_countries)
    total_budget += proportional_budget
    
    if release_year not in budget_per_year:
        budget_per_year[release_year] = 0
    budget_per_year[release_year] += proportional_budget

print("Average Budget Per Year (Past 10 Years):")
print("-" * 50)
for year in sorted(budget_per_year.keys()):
    avg_budget = budget_per_year[year]
    print(f"{year}: ${avg_budget:,.2f}")
print("-" * 50)
print(f"Total Budget (Past 10 Years): ${total_budget:,.2f}")


Average Budget Per Year (Past 10 Years):
--------------------------------------------------
2015: $5,221,983,336.83
2016: $3,546,900,000.00
--------------------------------------------------
Total Budget (Past 10 Years): $8,768,883,336.83


#### Q7. This code calculates the average production budget per year for movies produced over the past 10 years, adjusting for shared production between countries and providing a detailed breakdown by year.


In [681]:
current_year = datetime.datetime.now().year

budget_per_year = {}
total_budget = 0

for movie in df_movies.itertuples():
    if pd.isna(movie.release_date):
        continue
    release_year = pd.to_datetime(movie.release_date).year
    if release_year < current_year - 10:
        continue

    total_budget += movie.production_budget

    if release_year not in budget_per_year:
        budget_per_year[release_year] = 0
    budget_per_year[release_year] += movie.production_budget

print("Average Budget Per Year (Past 10 Years):")
print("-" * 50)
for year in sorted(budget_per_year.keys()):
    avg_budget = budget_per_year[year]
    print(f"{year}: ${avg_budget:,.2f}")
print("-" * 50)
print(f"Total Budget (Past 10 Years): ${total_budget:,.2f}")


Average Budget Per Year (Past 10 Years):
--------------------------------------------------
2015: $6,724,547,367.00
2016: $4,753,140,000.00
2017: $0.00
--------------------------------------------------
Total Budget (Past 10 Years): $11,477,687,367.00


In [682]:
# gender 1 == female
# gender 2 == male
df_credit.iloc[4802]['actors']

'[{"cast_id": 3, "character": "Herself", "credit_id": "52fe44e8c3a368484e03da91", "gender": 1, "id": 69597, "name": "Drew Barrymore", "order": 0}, {"cast_id": 5, "character": "Himself", "credit_id": "58ce01169251415a3901648f", "gender": 2, "id": 85563, "name": "Brian Herzlinger", "order": 1}, {"cast_id": 6, "character": "Himself", "credit_id": "58ce01339251415a410167f0", "gender": 2, "id": 3034, "name": "Corey Feldman", "order": 2}, {"cast_id": 8, "character": "Himself", "credit_id": "58ce018c9251415a7d016e36", "gender": 2, "id": 21315, "name": "Eric Roberts", "order": 3}, {"cast_id": 9, "character": "Himself", "credit_id": "58ce01b99251415a7d016e7d", "gender": 0, "id": 2171, "name": "Griffin Dunne", "order": 4}, {"cast_id": 10, "character": "Himself", "credit_id": "58ce01d19251415a8b0168be", "gender": 2, "id": 2231, "name": "Samuel L. Jackson", "order": 5}, {"cast_id": 11, "character": "Himself", "credit_id": "58ce01dd9251415a39016580", "gender": 2, "id": 14407, "name": "Matt LeBlanc"

In [683]:
df_movies.iloc[4802]


production_budget                                                       0
movie_genres                                                         [99]
official_website                                                      NaN
movie_id                                                            25975
movie_keywords                                  [1523, 2249, 9986, 11223]
original_language                                                      en
original_title                                          My Date with Drew
synopsis                Ever since the second grade when he first saw ...
audience_score                                                   1.929883
production_studios                                         [87986, 87987]
production_countries                                                 [US]
release_date                                          2005-08-05 00:00:00
box_office_revenue                                                      0
movie_runtime                         

In [684]:
# x = df_credit['actors'][0]
# st = json.loads(x)
# for y in st:
#     if (y['cast_id'] == 5):
#         print(y)

# x = df_credit['actors'][2341]
# st = json.loads(x)
# for y in st:
#     if (y['cast_id'] == 5):
#         print(y)

In [685]:
df_credit.head(1)


Unnamed: 0,movie_id,movie_title,actors,staff
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [686]:
df_movies.head(1)


Unnamed: 0,production_budget,movie_genres,official_website,movie_id,movie_keywords,original_language,original_title,synopsis,audience_score,production_studios,production_countries,release_date,box_office_revenue,movie_runtime,languages_available,release_status,movie_tagline,movie_title,critics_score,review_count
0,237000000,"[12, 14, 28, 878]",http://www.avatarmovie.com/,19995,"[1463, 2964, 3386, 3388, 3679, 3801, 9685, 984...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[289, 306, 444, 574]","[GB, US]",2009-12-10,2787965087,162.0,"[en, es]",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [687]:
# df_movies = df_movies[df_movies['box_office_revenue']!=0]
y = df_movies['box_office_revenue']
X = df_movies.drop(columns='box_office_revenue')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25) 

In [688]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   production_budget     4803 non-null   int64         
 1   movie_genres          4803 non-null   object        
 2   official_website      1712 non-null   object        
 3   movie_id              4803 non-null   int64         
 4   movie_keywords        4803 non-null   object        
 5   original_language     4803 non-null   object        
 6   original_title        4803 non-null   object        
 7   synopsis              4800 non-null   object        
 8   audience_score        4803 non-null   float64       
 9   production_studios    4803 non-null   object        
 10  production_countries  4803 non-null   object        
 11  release_date          4802 non-null   datetime64[ns]
 12  box_office_revenue    4803 non-null   int64         
 13  movie_runtime     

In [689]:
# df_movies.head()
