In [3]:
import sqlite3
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

conn = sqlite3.connect('zippedData/im.db')

In [4]:
tmdb_movie_df = pd.read_csv('zippedData/tmdb.movies.csv')
tn_movie_df = pd.read_csv('zippedData/tn.movie_budgets.csv')
bom_movie_df = pd.read_csv('zippedData/bom.movie_gross.csv')

In [5]:
%%bash
sqlite3 zippedData/im.db
.tables

directors           movie_basics        principals          us_movies_df_sql  
known_for           movie_ratings       tn_movie_sql        us_movies_filt_sql
movie_akas          persons             tn_sql              writers           


In [6]:
movies_2000s = pd.read_sql("""
select *
from movie_basics
join movie_akas
using(movie_id)
where start_year >= 2000
group by movie_id
""", conn)

movies_2000s

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,ordering,title,region,language,types,attributes,is_original_title
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",1,Sangharsh,IN,hi,,alternative transliteration,0.0
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",1,One Day Before the Rainy Season,XWW,en,,,0.0
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,1,O Outro Lado do Vento,BR,,imdbDisplay,,0.0
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",1,Subse Bada Sukh,IN,bn,,,0.0
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",1,La Telenovela Errante,,,original,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122297,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,130.0,Drama,1,Padmavyuhathile Abhimanyu,,,original,,1.0
122298,tt9913248,Nepal - Homebird,Nepal - Homebird,2019,52.0,Documentary,1,Nepal - Homebird,,,original,,1.0
122299,tt9914254,A Cherry Tale,Kirseb√¶reventyret,2019,85.0,Documentary,1,Kirseb√¶reventyret,,,original,,1.0
122300,tt9915436,Vida em Movimento,Vida em Movimento,2019,70.0,Documentary,1,Life in Movement,US,,alternative,,0.0


In [7]:
us_keep = 'US'
us_movies_tmdb = movies_2000s['region'] == us_keep
us_movies_filt = movies_2000s[us_movies_tmdb]
us_movies_filt

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,ordering,title,region,language,types,attributes,is_original_title
6,tt0112502,Bigfoot,Bigfoot,2017,,"Horror,Thriller",1,Bigfoot,US,,,,0.0
16,tt0192528,Heaven & Hell,Reverse Heaven,2018,104.0,Drama,1,Heaven & Hell,US,,imdbDisplay,,0.0
17,tt0230212,The Final Journey,The Final Journey,2010,120.0,Drama,1,The Final Journey,US,,,,0.0
22,tt0255820,Return to Babylon,Return to Babylon,2013,75.0,"Biography,Comedy,Drama",1,Return to Babylon,US,,,,0.0
33,tt0297400,Snowblind,Snowblind,2015,,"Crime,Drama",1,Snowblind,US,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
122187,tt9814242,Homeless at 17,Homeless at 17,2019,,Drama,1,Homeless at 17,US,,,,0.0
122188,tt9815714,The Hard Way,The Hard Way,2019,92.0,Action,1,The Hard Way,US,,,,0.0
122200,tt9820914,Look Closer: The Rise and Fall of Robert Benfer,Look Closer: The Rise and Fall of Robert Benfer,2018,46.0,Documentary,1,Look Closer: The Rise and Fall of Robert Benfer,US,,,,0.0
122204,tt9824512,L'Odyss√©e du Loup,L'Odyss√©e du Loup,2019,,Documentary,1,A Wolf's Journey,US,,alternative,,0.0


In [8]:
us_movies_filt_sql = us_movies_filt

In [9]:
us_movies_filt_sql.to_sql('us_movies_filt_sql', conn, if_exists='replace', index=False)

36453

In [10]:
us_movie_ratings = pd.read_sql("""
SELECT *
FROM us_movies_filt_sql us
JOIN movie_ratings mr
ON us.movie_id = mr.movie_id
WHERE mr.averagerating >= 7
ORDER BY mr.averagerating DESC;
""", conn)

us_movie_ratings

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,ordering,title,region,language,types,attributes,is_original_title,movie_id.1,averagerating,numvotes
0,tt1770682,Freeing Bernie Baran,Freeing Bernie Baran,2010,100.0,"Crime,Documentary",1,Freeing Bernie Baran,US,,,,0.0,tt1770682,10.0,5
1,tt4960818,Revolution Food,Revolution Food,2015,70.0,Documentary,1,Revolution Food,US,,,,0.0,tt4960818,10.0,8
2,tt6991826,A Dedicated Life: Phoebe Brand Beyond the Group,A Dedicated Life: Phoebe Brand Beyond the Group,2015,93.0,Documentary,1,A Dedicated Life: Phoebe Brand and the Legacy ...,US,,,,0.0,tt6991826,10.0,5
3,tt4944240,Dog Days in the Heartland,Dog Days in the Heartland,2017,,Drama,1,Dog Days in the Heartland,US,,,,0.0,tt4944240,10.0,5
4,tt5089804,Fly High: Story of the Disc Dog,Fly High: Story of the Disc Dog,2019,65.0,Documentary,1,Fly High: Story of the Disc Dog,US,,,,0.0,tt5089804,10.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7310,tt4687782,No Greater Love,No Greater Love,2015,94.0,Documentary,1,No Greater Love,US,,,,0.0,tt4687782,7.0,122
7311,tt5147042,Unseen,Unseen,2016,75.0,"Crime,Documentary",1,Unseen,US,,,,0.0,tt5147042,7.0,351
7312,tt6670584,Mundo: From Altar Boy to Hitman,Mundo: From Altar Boy to Hitman,2018,89.0,Crime,1,Mundo: From Altar Boy to Hitman,US,,,,0.0,tt6670584,7.0,43
7313,tt1975286,The Land of Eb,The Land of Eb,2012,87.0,"Drama,Family",1,The Land of Eb,US,,,,0.0,tt1975286,7.0,32


In [11]:
us_movie_ratings = us_movie_ratings.drop(columns=['language', 'types', 'attributes'])

In [12]:
us_movies_filt = us_movies_filt.drop(columns=['language', 'types', 'attributes'])

In [13]:
us_movie_ratings = us_movie_ratings.rename(columns={'start_year': 'year'})

In [14]:
us_movie_ratings

Unnamed: 0,movie_id,primary_title,original_title,year,runtime_minutes,genres,ordering,title,region,is_original_title,movie_id.1,averagerating,numvotes
0,tt1770682,Freeing Bernie Baran,Freeing Bernie Baran,2010,100.0,"Crime,Documentary",1,Freeing Bernie Baran,US,0.0,tt1770682,10.0,5
1,tt4960818,Revolution Food,Revolution Food,2015,70.0,Documentary,1,Revolution Food,US,0.0,tt4960818,10.0,8
2,tt6991826,A Dedicated Life: Phoebe Brand Beyond the Group,A Dedicated Life: Phoebe Brand Beyond the Group,2015,93.0,Documentary,1,A Dedicated Life: Phoebe Brand and the Legacy ...,US,0.0,tt6991826,10.0,5
3,tt4944240,Dog Days in the Heartland,Dog Days in the Heartland,2017,,Drama,1,Dog Days in the Heartland,US,0.0,tt4944240,10.0,5
4,tt5089804,Fly High: Story of the Disc Dog,Fly High: Story of the Disc Dog,2019,65.0,Documentary,1,Fly High: Story of the Disc Dog,US,0.0,tt5089804,10.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7310,tt4687782,No Greater Love,No Greater Love,2015,94.0,Documentary,1,No Greater Love,US,0.0,tt4687782,7.0,122
7311,tt5147042,Unseen,Unseen,2016,75.0,"Crime,Documentary",1,Unseen,US,0.0,tt5147042,7.0,351
7312,tt6670584,Mundo: From Altar Boy to Hitman,Mundo: From Altar Boy to Hitman,2018,89.0,Crime,1,Mundo: From Altar Boy to Hitman,US,0.0,tt6670584,7.0,43
7313,tt1975286,The Land of Eb,The Land of Eb,2012,87.0,"Drama,Family",1,The Land of Eb,US,0.0,tt1975286,7.0,32


In [19]:
tn_movie_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


In [None]:
all_same = (us_movie_ratings['primary_title'] == us_movie_ratings['original_title']) & \
        (us_movie_ratings['original_title'] == us_movie_ratings['title'])

num_same = all_same.sum()
total_rows = len(us_movie_ratings)

print(f"‚úÖ {num_same} out of {total_rows} rows have identical values in all three columns.")

In [None]:
diff_rows = us_movie_ratings[~all_same]
print(f"‚ö†Ô∏è {len(diff_rows)} rows have different values in at least one column.")

In [None]:
display(diff_rows.head(20))

In [None]:
if num_same == total_rows:
    us_movie_ratings = us_movie_ratings.drop(columns=['title', 'original_title'])
    print("‚úÖ Dropped 'title' and 'original_title' since they are identical to 'primary_title'.")

In [None]:
us_movie_ratings['primary_title'] = us_movie_ratings['primary_title'].fillna(us_movie_ratings['original_title'])

us_movie_ratings = us_movie_ratings.drop(columns=['title', 'original_title'])
print("‚úÖ Kept 'primary_title' and filled missing values where possible.")

In [None]:
us_movie_ratings

In [None]:
us_movie_ratings['title_year'] = us_movie_ratings.apply(lambda row: f"{row['primary_title']}, {row['start_year']}", axis=1)

In [None]:
us_movie_ratings

In [None]:
print(us_movie_ratings.columns)

In [None]:
us_movie_ratings.columns = [f"{col}_{i}" if us_movie_ratings.columns.duplicated()[i] else col
                            for i, col in enumerate(us_movie_ratings.columns)]

In [None]:
us_movie_ratings

In [None]:
same_movie_id = (us_movie_ratings['movie_id'] == us_movie_ratings['movie_id_8'])

num_same_movie_id = same_movie_id.sum()
total_rows_movie_id = len(us_movie_ratings)

print(f"‚úÖ {num_same_movie_id} out of {total_rows_movie_id} rows have identical values in both columns.")

In [None]:
us_movie_ratings

In [None]:
tn_movie_df = tn_movie_df.drop(columns=['release_date', 'domestic_gross'])
tn_movie_df

In [None]:
tn_sql = tn_movie_df.copy()

In [None]:
tn_sql.to_sql('tn_sql', conn, if_exists='replace', index=False)

In [None]:
us_movies_df_sql = us_movie_ratings

In [None]:
us_movies_df_sql.to_sql('us_movies_df_sql', conn, if_exists='replace', index=False)

In [None]:
us_movies_dir_wri = pd.read_sql("""
select umds.*, d.director_names, w.writer_names
from us_movies_df_sql umds
left join(
    select d.movie_id, group_concat(p.primary_name, ', ') as director_names
    from directors d
    join persons p on d.person_id = p.person_id
    group by d.movie_id
) d on umds.movie_id = d.movie_id
left join(
    select w.movie_id, group_concat(p.primary_name, ', ') as writer_names
    from writers w
    join persons p on w.person_id = p.person_id
    group by w.movie_id
) w on umds.movie_id = w.movie_id;
""", conn)

In [None]:
us_movies_dir_wri

In [None]:
 us_movies_df = us_movies_dir_wri.copy()

In [None]:
horror_count = us_movies_df['genres'].str.contains("Horror", case=False, na=False).sum()
comedy_count = us_movies_df['genres'].str.contains("Comedy", case=False, na=False).sum()
drama_count = us_movies_df['genres'].str.contains("Drama", case=False, na=False).sum()

print(f"üé≠ Drama appears in {drama_count} movies")
print(f"üòÇ Comedy appears in {comedy_count} movies")
print(f"üò± Horror appears in {horror_count} movies")

In [None]:
us_horror = us_movies_df[us_movies_df['genres'].str.contains("Horror", case=False, na=False)].copy()
us_comedy = us_movies_df[us_movies_df['genres'].str.contains("Comedy", case=False, na=False)].copy()
us_drama = us_movies_df[us_movies_df['genres'].str.contains("Drama", case=False, na=False)].copy()

print(f"‚úÖ Created DataFrames: Horror ({len(us_horror)}), Comedy ({len(us_comedy)}), Drama ({len(us_drama)})")

In [None]:
dup_check = tn_movie_df.duplicated(subset=['title_year', 'year'], keep=False)  # Keep=False marks ALL duplicates
tn_dupes = tn_movie_df[dup_check]  # Filter only duplicates

print(f"üîπ Found {len(tn_dupes)} duplicate title-year pairs in tn_movie_df.")

In [None]:
tn_movie_df_clean = tn_movie_df.drop_duplicates(subset=['title_year', 'year'], keep='first')

In [None]:
us_movies_df

In [None]:
us_movies_df = us_movies_df.drop(columns=['production_budget', 'worldwide_gross'])

In [None]:
us_movies_df.set_index('title_year', inplace=True)
us_movies_df

In [None]:
tn_movie_df.set_index('title_year', inplace=True)
tn_movie_df

In [None]:
tn_movie_df['production_budget'] = tn_movie_df['production_budget'].str.replace('[\$,]', '', regex=True)
tn_movie_df['worldwide_gross'] = tn_movie_df['worldwide_gross'].str.replace('[\$,]', '', regex=True)

In [None]:
tn_movie_df

In [None]:
usmovies_budgets = us_movies_df.join(tn_movie_df, on='title_year', how='inner', rsuffix='_md')
clean_movies = usmovies_budgets[['movie_id', 'year', 'production_budget', 'worldwide_gross', 'genres', 'director_names', 'writer_names']]
clean_movies

In [None]:
clean_movies

In [None]:
us_movies_df.index = us_movies_df.index.astype(str).str.strip()
tn_movie_df.index = tn_movie_df.index.astype(str).str.strip()

In [None]:
print(f"Unique title_year values in us_movies_df: {len(us_movies_df.index.unique())}")
print(f"Unique title_year values in tn_movie_df: {len(tn_movie_df.index.unique())}")

In [None]:
# Find title_year values in us_movies_df that are NOT in tn_movie_df
missing_in_tn = us_movies_df.index.difference(tn_movie_df.index)

# Find title_year values in tn_movie_df that are NOT in us_movies_df
missing_in_us = tn_movie_df.index.difference(us_movies_df.index)

print(f"üîπ Title_year values in us_movies_df but missing in tn_movie_df: {len(missing_in_tn)}")
print(f"üîπ Title_year values in tn_movie_df but missing in us_movies_df: {len(missing_in_us)}")

In [None]:
print("üîç Sample of title_year values in us_movies_df but not in tn_movie_df:")
print(missing_in_tn[:10])

print("\nüîç Sample of title_year values in tn_movie_df but not in us_movies_df:")
print(missing_in_us[:10])

In [None]:
import re

def clean_title_year(index_series):
    return index_series.to_series().str.lower().apply(lambda x: re.sub(r"[^a-z0-9, ]", "", x))

# Apply cleaning to both DataFrame indexes
us_movies_df.index = clean_title_year(us_movies_df.index).values
tn_movie_df.index = clean_title_year(tn_movie_df.index).values

print("‚úÖ Cleaned title_year in both DataFrames for better matching!")

In [None]:
# Find title_year values in us_movies_df that are NOT in tn_movie_df
missing_in_tn = us_movies_df.index.difference(tn_movie_df.index)

# Find title_year values in tn_movie_df that are NOT in us_movies_df
missing_in_us = tn_movie_df.index.difference(us_movies_df.index)

print(f"üîπ Title_year values in us_movies_df but missing in tn_movie_df: {len(missing_in_tn)}")
print(f"üîπ Title_year values in tn_movie_df but missing in us_movies_df: {len(missing_in_us)}")

In [None]:
# Show a few mismatched title_year values from each DataFrame
print("üîç Sample of unmatched title_years in us_movies_df:")
print(missing_in_tn[:10])

print("\nüîç Sample of unmatched title_years in tn_movie_df:")
print(missing_in_us[:10])

In [None]:
# Ensure all leading/trailing spaces are removed
us_movies_df.index = us_movies_df.index.str.strip()
tn_movie_df.index = tn_movie_df.index.str.strip()

print("‚úÖ Removed hidden leading/trailing spaces in title_year!")

In [None]:
# Find title_year values in us_movies_df that are NOT in tn_movie_df
missing_in_tn = us_movies_df.index.difference(tn_movie_df.index)

# Find title_year values in tn_movie_df that are NOT in us_movies_df
missing_in_us = tn_movie_df.index.difference(us_movies_df.index)

print(f"üîπ Title_year values in us_movies_df but missing in tn_movie_df: {len(missing_in_tn)}")
print(f"üîπ Title_year values in tn_movie_df but missing in us_movies_df: {len(missing_in_us)}")

In [None]:
# Convert sets to sorted lists for easy comparison
missing_in_tn_list = sorted(list(missing_in_tn))[:10]  # Titles in us_movies_df but missing in tn_movie_df
missing_in_us_list = sorted(list(missing_in_us))[:10]  # Titles in tn_movie_df but missing in us_movies_df

# Display the values for manual comparison
print("üîç Titles in us_movies_df but NOT in tn_movie_df:")
for title in missing_in_tn_list:
    print(f"- {title}")

print("\nüîç Titles in tn_movie_df but NOT in us_movies_df:")
for title in missing_in_us_list:
    print(f"- {title}")

In [None]:
import re

def normalize_numbers(index_series):
    return index_series.to_series().apply(lambda x: re.sub(r"(\d),(\d)", r"\1\2", x))  # Removes commas in numbers

# Apply normalization
us_movies_df.index = normalize_numbers(us_movies_df.index).values
tn_movie_df.index = normalize_numbers(tn_movie_df.index).values

print("‚úÖ Normalized number formatting in title_year!")

In [None]:
missing_in_tn = us_movies_df.index.difference(tn_movie_df.index)
missing_in_us = tn_movie_df.index.difference(us_movies_df.index)

print(f"üîπ Title_year values in us_movies_df but missing in tn_movie_df: {len(missing_in_tn)}")
print(f"üîπ Title_year values in tn_movie_df but missing in us_movies_df: {len(missing_in_us)}")

In [None]:
# Create DataFrames to compare title_year values side by side
us_titles_df = pd.DataFrame(us_movies_df.index, columns=['title_year'])
us_titles_df['source'] = 'us_movies_df'

tn_titles_df = pd.DataFrame(tn_movie_df.index, columns=['title_year'])
tn_titles_df['source'] = 'tn_movie_df'

# Combine both DataFrames
title_comparison = pd.concat([us_titles_df, tn_titles_df])

# Show the first 50 rows to analyze mismatches
print(title_comparison.head(50))

In [None]:
# Find how many title_year values exist in both DataFrames
common_title_years = us_movies_df.index.intersection(tn_movie_df.index)

print(f"‚úÖ Shared title_year values: {len(common_title_years)} out of {len(us_movies_df)} in us_movies_df")
print(f"‚úÖ Shared title_year values: {len(common_title_years)} out of {len(tn_movie_df)} in tn_movie_df")

In [None]:
# Convert to a list and show the first 20 matches
print("üîç Movies that exist in BOTH datasets:")
print(list(common_title_years)[:20])  # Show first 20 matches

In [None]:
# Remove documentary movies
us_movies_df = us_movies_df[~us_movies_df['genres'].str.contains('documentary', case=False, na=False)]

print(f"‚úÖ Removed documentaries! New row count: {len(us_movies_df)}")

In [None]:
common_title_years = us_movies_df.index.intersection(tn_movie_df.index)

print(f"‚úÖ Shared title_year values: {len(common_title_years)} out of {len(us_movies_df)} in us_movies_df")
print(f"‚úÖ Shared title_year values: {len(common_title_years)} out of {len(tn_movie_df)} in tn_movie_df")

In [None]:
# Sample of movies in tn_movie_df but missing from us_movies_df
missing_in_us_sample = list(missing_in_us)[:20]  # First 20 missing movies
print("üîç Sample of movies in tn_movie_df but missing in us_movies_df:")
for movie in missing_in_us_sample:
    print(f"- {movie}")

# Sample of movies in us_movies_df but missing from tn_movie_df
missing_in_tn_sample = list(missing_in_tn)[:20]  # First 20 missing movies
print("\nüîç Sample of movies in us_movies_df but missing in tn_movie_df:")
for movie in missing_in_tn_sample:
    print(f"- {movie}")

In [None]:
# Filter by a minimum vote count (adjust if needed)
min_votes = 500  # Adjust this number if the dataset is too small or large
us_movies_df = us_movies_df[us_movies_df['numvotes'] >= min_votes]

print(f"‚úÖ Filtered out low-vote movies. New row count: {len(us_movies_df)}")

In [None]:
conn.close()