# Q: What is the Highest Grossing Movie With The Least Production Cost?

In [None]:
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [None]:
# Collect every gzipped CSV under ./zippedData. glob() returns paths in
# arbitrary, filesystem-dependent order, so sort them to make the listing
# (and the order the files are loaded in) reproducible across runs.
csv_files = sorted(glob("./zippedData/*.csv.gz"))
csv_files

In [None]:
# Load each archive into a DataFrame, keyed by a cleaned-up file name
# (e.g. "imdb.title.basics.csv.gz" -> "imdb_title_basics_gz").
csv_files_dict = {}
for filename in csv_files:
    base_name = os.path.basename(filename)
    # Drop the ".csv" part, then turn the remaining dots into underscores.
    filename_cleaned = base_name.replace(".csv", "").replace(".", "_")
    # The first CSV column is used as the index of the resulting frame.
    filename_df = pd.read_csv(filename, index_col=0)
    csv_files_dict[filename_cleaned] = filename_df


In [None]:
#End of importing files and libraries

In [None]:
# Bind one readable name to each loaded table, grouped by file-name prefix.

# imdb_* tables
title_basics_df = csv_files_dict['imdb_title_basics_gz']
title_akas_df = csv_files_dict['imdb_title_akas_gz']
title_ratings_df = csv_files_dict['imdb_title_ratings_gz']
title_crew_df = csv_files_dict['imdb_title_crew_gz']
title_principals_df = csv_files_dict['imdb_title_principals_gz']
name_basics_df = csv_files_dict['imdb_name_basics_gz']

# tmdb_* table
movies_df = csv_files_dict['tmdb_movies_gz']

# tn_* table
movie_budgets_df = csv_files_dict['tn_movie_budgets_gz']

# bom_* table
movie_gross_df = csv_files_dict['bom_movie_gross_gz']

In [None]:
#bind the movie budgets table loaded above and check its type
#(the result of type() is shown because it is the cell's last expression)
movie_budgets_df = csv_files_dict['tn_movie_budgets_gz']
type(movie_budgets_df)

In [None]:
#find out how many rows and columns the dataframe has: (rows, columns)
movie_budgets_df.shape

In [None]:
#look at the first five rows of the dataframe (head() defaults to 5 rows)
movie_budgets_df.head()

In [None]:
#print column dtypes and non-null counts to decide what needs cleaning
movie_budgets_df.info()

## Data Cleaning

### Dealing with datatypes

In [None]:
# Drop 'worldwide_gross' in place, so the frame held in csv_files_dict (the
# same object) is updated too and no reassignment is needed.
# errors='ignore' keeps this cell re-runnable: without it a second run
# raises KeyError because the column is already gone.
movie_budgets_df.drop(columns='worldwide_gross', inplace=True, errors='ignore')
movie_budgets_df.head()  # check to see if the column is dropped

In [None]:
#confirming no na values
#movie_budgets_df.isna().sum()

In [None]:
#changing dtype of columns
#the money columns are stored as text such as "$1,000", so they need converting

def convert_amt_to_int(df, col):
    """Convert a currency-formatted text column to 64-bit integers, in place.

    Dollar signs and thousands separators are stripped from ``df[col]`` and
    the result is stored back into the same column.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: Label of the column to convert.

    Returns:
        The same ``df`` object, so calls can be chained or reassigned.

    Raises:
        ValueError: If a value still is not an integer literal after
            stripping (for example a missing value).
    """
    values = df[col]
    # A column that is already numeric has no .str accessor; skipping the
    # stripping step makes re-running the conversion harmless.
    if not pd.api.types.is_numeric_dtype(values):
        # regex=False: "$" is the end-of-string anchor when treated as a
        # regular expression, and the default of `regex` differs between
        # pandas versions, so ask for literal replacement explicitly.
        values = (values.str.replace("$", "", regex=False)
                        .str.replace(",", "", regex=False))
    # An explicit 64-bit width avoids the platform-dependent size of 'int'.
    df[col] = values.astype("int64")
    return df

In [None]:
#making a list of all the cols where we want to change the dtype 
money_cols = ['production_budget', 'domestic_gross']

#convert_amt_to_int rewrites the column on the frame it is given and returns
#that frame, so each pass rebinds movie_budgets_df to the helper's result
for col in money_cols:
    movie_budgets_df = convert_amt_to_int(movie_budgets_df, col)

In [None]:
# check that the dtype of the money columns was changed to int
movie_budgets_df.info()

In [None]:
# Open question: how should the data be set up to compare the
# highest-grossing movies among those with the lowest production cost?

In [None]:
# Scatter of production budget against domestic gross for the budgets table.
df = movie_budgets_df
# One scatter layer is enough: calling plt.scatter and sns.scatterplot on the
# same two columns would draw every point twice on the same axes.
ax = sns.scatterplot(x="production_budget", y="domestic_gross", data=df)
ax.set_xlabel("Production budget")
ax.set_ylabel("Domestic gross")
ax.set_title("Domestic gross vs. production budget")
plt.show()

In [None]:
# Keep only the rows whose production budget lies between 180 million and
# 220 million, both ends included.
in_budget_band = df['production_budget'].between(1.8E8, 2.2E8)
narrow_prod_budget_df = df[in_budget_band]
narrow_prod_budget_df.info()
narrow_prod_budget_df


## Split Up Genres

### Bring in the title_basics Data Frame 

In [None]:
#create a dataframe out of cleaned_genre_exploration.csv (read from the
#current working directory)
genre_categories = pd.read_csv("cleaned_genre_exploration.csv")

type(genre_categories) #verify that a DataFrame came back

In [None]:
#print column dtypes and non-null counts of the genre table
genre_categories.info()

In [None]:
#look at the first five rows of the genre table
genre_categories.head()

In [None]:
#Title basics - creating the df
df_title_basics = pd.read_csv('zippedData/imdb.title.basics.csv.gz')
#Creating the split genre columns, one genre per column
#NOTE(review): assumes the split yields exactly three columns (at most three
#comma-separated genres per title); otherwise this assignment raises - confirm
df_title_basics[['G1','G2','G3']] = df_title_basics.genres.str.split(",",expand=True)
#Dropping the old genre column
df_title_basics.drop(['genres'], axis=1, inplace = True)
#cleaned runtime minutes by using mean
#assigned back to the column: inplace fillna on a column selection may act on
#a temporary copy and leave the frame unchanged
mean_runtime = df_title_basics['runtime_minutes'].mean()
df_title_basics['runtime_minutes'] = df_title_basics['runtime_minutes'].fillna(mean_runtime)
#cleaning original title by replacing missing ones with the primary title
#pass the primary_title column itself (aligned row by row), not the literal
#string 'primary_title'
df_title_basics['original_title'] = df_title_basics['original_title'].fillna(
    df_title_basics['primary_title'])

display(df_title_basics.head())
#info() prints its report and returns None, so it is not wrapped in display()
df_title_basics.info()
display(df_title_basics.isna().sum())

In [None]:
# Stack the title-basics rows underneath the genre-table rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; with
# its defaults the result is the same (original index labels kept, union of
# columns, NaN where a frame lacks a column).
# NOTE(review): this stacks rows rather than joining on a shared key - if one
# row per movie with both budget and genre is wanted, a merge may be what is
# intended; confirm.
combo_category = pd.concat([genre_categories, df_title_basics])

In [None]:
# info() prints its report itself, while head() is only rendered when it is
# the last expression of the cell - so head() goes last to show both.
combo_category.info()
combo_category.head()

In [None]:
# Report dtypes / non-null counts, then the number of missing values per
# column. The isna() summary is placed last so that the notebook renders it;
# as a bare expression ahead of info() its result would be discarded.
combo_category.info()
combo_category.isna().sum()

In [None]:
#combo_category.dropna(inplace = True)

In [None]:
#changing dtype of columns
#money values stored as text such as "$1,000" need converting to integers

def convert_amt_to_int(combo_category, col):
    """Convert a currency-formatted text column to 64-bit integers, in place.

    This re-defines the helper of the same name from earlier in the notebook
    and replaces it for every cell run afterwards. The first parameter is
    named ``combo_category`` and therefore shadows the notebook-level frame
    of that name inside the function; any DataFrame can be passed.

    Args:
        combo_category: DataFrame holding the column; modified in place.
        col: Label of the column to convert.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If a value still is not an integer literal after
            stripping (for example a missing value).
    """
    values = combo_category[col]
    # A column that is already numeric has no .str accessor; skipping the
    # stripping step makes re-running the conversion harmless.
    if not pd.api.types.is_numeric_dtype(values):
        # regex=False: "$" is the end-of-string anchor when treated as a
        # regular expression, and the default of `regex` differs between
        # pandas versions, so ask for literal replacement explicitly.
        values = (values.str.replace("$", "", regex=False)
                        .str.replace(",", "", regex=False))
    # An explicit 64-bit width avoids the platform-dependent size of 'int'.
    combo_category[col] = values.astype("int64")
    return combo_category
#the helper above is only defined in this cell, not called, so this report
#shows combo_category unchanged
combo_category.info()

#combo_category = combo_category.astype({"production_budget": int, "domestic_gross": int})

In [None]:
#look at the first five rows of the combined table
combo_category.head()

In [None]:
#print dtypes and non-null counts, then render the combined table itself
combo_category.info()
combo_category

In [None]:
#changing dtype of columns
#we want to change the types of 3 columns. Good idea to write a function for this

# def convert_amt_to_int(df, col):
#     df[col] = df[col].str.replace("$", "").str.replace(",", "").astype('int')
#     return df

In [None]:
#making a list of all the cols where we want to change the dtype 

# money_cols = ['production_budget', 'domestic_gross', 'worldwide_gross']

# for col in money_cols:
#     movie_budgets_df = convert_amt_to_int(movie_budgets_df, col)

In [None]:
# Rows of df whose production budget lies between 180 million and
# 220 million, both ends included; all columns are kept.
budget_in_range = df['production_budget'].between(1.8E8, 2.2E8)
limited_prod_budget = df[budget_in_range]
limited_prod_budget.info()

In [None]:
# Horizontal bars: production budget per primary genre (G1).
# NOTE(review): assumes combo_category has a numeric 'production_budget'
# column at this point - confirm.
df = combo_category
# `order` has to list each category once. Passing the G1 column of the sorted
# frame hands seaborn one entry per row (duplicates and NaN included), so
# build the list of distinct genres instead, sorted by their mean budget
# (the mean is also what barplot draws by default).
genre_order = (
    df.groupby('G1')['production_budget']
    .mean()
    .dropna()          # genres without any budget value have no bar to draw
    .sort_values()
    .index
    .tolist()
)
ax = sns.barplot(x='production_budget', y='G1', color='c', data=df,
                 order=genre_order)
#ax. set(xlim=(1.8, 4))
ax.set_xlabel('Production_Budget')
ax.set_ylabel('Genre')

In [None]:
# G1 genre of every row whose production budget is between 180 million and
# 220 million (inclusive). .loc applies the row mask and picks the column;
# putting the mask and the column name into a plain Python list selects
# nothing. The result gets its own name so that it survives the re-read of
# genre_categories further down.
budget_band_genres = df.loc[(df['production_budget'] >= 1.8E8) &
                            (df['production_budget'] <= 2.2E8), 'G1']

#re-bind the movie budgets table from the loaded files
movie_budgets_df = csv_files_dict['tn_movie_budgets_gz']

#reload the genre table from disk and inspect it
genre_categories = pd.read_csv("cleaned_genre_exploration.csv")
genre_categories.info()
genre_categories

In [None]:
#look at the first five rows of the title basics table
title_basics_df.head()

In [None]:
#number of rows and columns: (rows, columns)
title_basics_df.shape

In [None]:
#print column dtypes and non-null counts to see where data is missing
title_basics_df.info()

In [None]:
#drop every row that has a missing value in any column
#dropna() returns a new frame, so title_basics_df itself is left unchanged
cleanedTB = title_basics_df.dropna()
cleanedTB

In [None]:
#check the non-null counts after the rows with missing values were dropped
cleanedTB.info()

In [None]:
# title_basics_df['genres'] = title_basics_df['genres'].apply(lambda x: 
#                                                             x.split(",") 
#                                                             if x else x)
# title_basics_df.head()

In [None]:
#making a set of all genres we have. set doesn't allow duplicate values

# all_genres = set()
# for genres in cleanedTB['genres']:
#     if genres:
#         all_genres.update(genres)

In [None]:
#show all the genres
# all_genres