# Movies Project
### "An attempt to find data correlations"

#### Import libraries and dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib
from matplotlib import pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure as fig

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 8) #Plots configuration adjustment

import os

import re

In [2]:
# Remember the current working directory so the data file can be located relative to it.
# NOTE(review): this is the directory the kernel was started in, which is presumably
# the notebook's folder — confirm movies.csv lives there.
pwd = os.getcwd()

In [3]:
# Read the movies dataset from the working directory.
# os.path.join picks the right path separator for the host OS; the previous
# hard-coded backslash only resolved correctly on Windows.
df = pd.read_csv(os.path.join(pwd, 'movies.csv'))

In [4]:
# Let pandas print up to 200 rows before truncating the output
pd.options.display.max_rows = 200

#### Fix missing values

In [None]:
# First look at the data: display the first rows of the freshly loaded frame
df.head()
# Alternative quick check — list the column names instead:
# df.columns

In [None]:
# Report the share of missing values in every column as a percentage.
# The mean of the boolean null mask is a fraction in [0, 1], so it is scaled
# by 100 before being printed next to the '%' sign.
for col in df.columns:
    missing_percentage = df[col].isnull().mean() * 100
    print(f'{col} - {missing_percentage:.2f}%')

In [7]:
# Keep only the rows that have at least 13 non-null values (thresh counts
# non-null cells, not nulls); the original note reports 26 movies removed.
df.dropna(thresh=13, inplace=True)
# Renumber the rows 0..n-1 and discard the old index instead of keeping it as a column
df.reset_index(drop=True, inplace=True)

In [8]:
# Fill the remaining gaps by propagating the last non-null value downwards.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases. A null in the very first row has no earlier value
# to copy and would stay null.
df.ffill(inplace=True)

In [None]:
# Count the remaining nulls per column (expected to be zero everywhere)
df.isna().sum()

#### Clean data

In [None]:
# Check the data type pandas assigned to each column before cleaning
df.dtypes

In [11]:
# We use regular expressions, in order to find the year in string and return it
def pick_year(string):
    """Extract a four-digit year from a free-text release string.

    The pattern accepts four consecutive digits whose first digit is 1, 2 or 3.
    Because of the greedy ``.*`` prefix, the *last* such run on the first
    matching line is returned (e.g. ``'June 13, 1980 (United States)'``
    yields ``'1980'``).

    Parameters
    ----------
    string : str
        Text that may contain a year.

    Returns
    -------
    str or None
        The year as a string, or ``None`` when ``string`` is not a ``str``
        (for example a missing value) or contains no year.
    """
    # Missing values are not strings; passing them to `re` raises TypeError.
    if not isinstance(string, str):
        return None

    match = re.search(r'.*([1-3][0-9]{3})', string)
    # Previously an unmatched string raised IndexError and aborted the whole apply().
    if match is None:
        return None
    return match.group(1)

In [12]:
# pick_year works on text, so cast the 'released' column to the pandas string dtype
df = df.astype({'released': 'string'})

In [13]:
# The values in df['year'] do not always agree with the year written in df['released'],
# so derive the year from 'released' and store it, as text, in a temporary column.
df['temp'] = df['released'].apply(pick_year).astype('string')

In [14]:
# Drop the unreliable 'year' column and promote the derived 'temp' column to be the new 'year'
df = df.drop(columns=['year']).rename(columns={'temp': 'year'})


In [15]:
# Budget and gross are stored as whole amounts here, so cast both from float to int64
df = df.astype({'budget': 'int64', 'gross': 'int64'})

In [16]:
# Restore the original column order: move the last column (the rebuilt 'year')
# back to the fourth position and keep everything else in sequence.
cols = list(df.columns)
cols = [*cols[:3], *cols[-1:], *cols[3:-1]]
df = df.loc[:, cols]

In [17]:
# Equivalent of SQL "ORDER BY gross DESC": highest-grossing movies first,
# with a fresh 0..n-1 index and the old one discarded.
df = df.sort_values(by=['gross'], ascending=False).reset_index(drop=True)

In [None]:
# Optional check: list the distinct company names (this only inspects the column; it does not remove duplicates from df)
# df['company'].drop_duplicates().sort_values(ascending=False)

#### Data correlations with gross

In [None]:
# Start looking at correlation between the numeric columns.
# Non-numeric columns are filtered out explicitly: since pandas 2.0, corr() no
# longer drops them silently and raises when it meets text columns.
df.select_dtypes(include='number').corr(method='pearson')
# df.select_dtypes(include='number').corr(method='kendall')
# df.select_dtypes(include='number').corr(method='spearman')

In [20]:
# Switch seaborn to its "darkgrid" style for the plots below
sns.set_theme(style="darkgrid")

In [None]:
# Scatter plot budget vs gross with matplotlib
plt.scatter(x=df['budget'], y=df['gross'], color='purple')
plt.title('Budget vs Gross Earnings', fontweight='bold')
# Labels follow the plotted data: budget is on the x-axis, gross on the y-axis
# (they were previously swapped).
plt.xlabel('Budget For Film')
plt.ylabel('Gross Earnings')
# plt.show

In [None]:
# Scatter plot budget vs gross with seaborn, including a fitted regression line
sns.regplot(
    x='budget',
    y='gross',
    data=df,
    scatter_kws={'color': 'black'},
    line_kws={'color': 'orange'},
).set_title('Budget vs Gross Earnings', fontweight='bold')
# Labels follow the plotted data: budget is on the x-axis, gross on the y-axis
# (they were previously swapped).
plt.xlabel('Budget For Film')
plt.ylabel('Gross Earnings')

In [None]:
# Heatmap of the Pearson correlations between the numeric columns;
# it shows a high correlation between budget and gross.
# Non-numeric columns are excluded explicitly because corr() raises on text
# columns since pandas 2.0 instead of dropping them.
cor_mtrx = df.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(cor_mtrx, annot=True).set_title('Correlation Matrix For Numeric Features', fontweight='bold')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')

In [None]:
# Hypothesis: the production company is strongly correlated with the gross.
# To test it, work on a copy of the frame (the original stays untouched) and
# replace every object-typed column by the integer codes of its categories.
num_df = df.copy()

for col in num_df.columns:
    column = num_df[col]
    if column.dtype != 'object':
        # Only object columns are encoded; everything else is left as is
        continue
    num_df[col] = column.astype('category').cat.codes

num_df

In [None]:
# Heatmap of the correlations after encoding, to inspect company vs gross.
# Columns that were not converted to numbers (e.g. those with the 'string'
# dtype, which the object-dtype check does not encode) are excluded
# explicitly; since pandas 2.0 corr() raises on them instead of dropping them.
cor_mtrx_2 = num_df.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(cor_mtrx_2, annot=True).set_title('Correlation Matrix For Numeric Features', fontweight='bold')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')

In [None]:
# Flatten the correlation matrix into (feature, feature) pairs and sort them
# in ascending order of Pearson correlation.
# Only numeric columns are passed to corr(): since pandas 2.0 it raises on
# text columns instead of dropping them silently.
cor_mtx_2 = num_df.select_dtypes(include='number').corr(method='pearson')
cor_pairs = cor_mtx_2.unstack()
sorted_pairs = cor_pairs.sort_values()
sorted_pairs

In [None]:
# Keep only the pairs whose correlation exceeds 0.5.
# Conclusion drawn by the analysis: company shows no significant correlation
# with gross, while budget and votes correlate with gross earnings the most.
is_high = sorted_pairs > 0.5
high_cor = sorted_pairs.loc[is_high]
high_cor