In [None]:
# Import libraries
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

print("matplotlib: {}".format(matplotlib.__version__))

plt.style.use("ggplot")
from matplotlib.pyplot import figure

# %matplotlib widget
%matplotlib inline
matplotlib.rcParams["figure.figsize"] = (12, 8)  # adjusts the configuration of the plots we will create


: 

In [None]:
# Load the movies dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it points at a local copy of movies.csv.
movies_csv_path = r'C:\Users\Shemeika\Downloads\movies.csv'
df = pd.read_csv(movies_csv_path)


: 

In [None]:
# Preview the first five rows of the raw data
df.head(5)

: 

In [None]:
# Report the share of missing values per column.
# isnull().mean() yields a fraction between 0 and 1; the '%' presentation type
# multiplies it by 100, so the printed number really is a percentage
# (previously the raw fraction was printed with a '%' sign appended).
missing_share = df.isnull().mean()
for col, pct_missing in missing_share.items():
    print('{}-{:.2%}'.format(col, pct_missing))

: 

In [None]:
# Drop every row that contains a missing/NaN value, then recheck the dataframe.
# Rebinding the name (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna()

# Each column should now report 0.00% missing.
# The '%' presentation type converts the 0-1 fraction into a real percentage
# (previously the raw fraction was printed with a '%' sign appended).
for col, pct_missing in df.isnull().mean().items():
    print('{}-{:.2%}'.format(col, pct_missing))

: 

In [None]:
# Show the dtype of every column in df
df.dtypes

: 

In [None]:
# Cast the whole-number columns to int64 to remove the unnecessary decimal point.
# "score" is deliberately left out of the cast: an int64 cast truncates any
# fractional part, which would discard rating precision before the correlation
# analysis. NOTE(review): assumes score holds fractional ratings (e.g. 8.4) — confirm.
df = df.astype({"budget": 'int64', "gross": 'int64', "votes": 'int64',
                "runtime": 'int64'})
df.dtypes


: 

In [None]:
# Check the result of the dtype changes on a small preview
# rather than rendering the whole frame.
df.head()

: 

In [None]:
# Store "released" with pandas' dedicated string dtype, then confirm via dtypes
released_text = df['released'].astype('string')
df['released'] = released_text
df.dtypes

: 

In [None]:
# Extract the release year from "released" and store it as "yearcorrect".
# The pattern captures the first standalone four-digit number. The earlier
# pattern required a separate number (the day of month) before the year, so a
# value with no day — presumably something like "November 1980 (...)"; verify
# against the data — produced NaN instead of a year.
# The string conversion and the extraction could also be combined in one step:
# df['released'].astype('string').str.extract(r'\b(\d{4})\b', expand=False)
import re  # not used in this cell; kept in case other cells rely on it

df['yearcorrect'] = df['released'].str.extract(r'\b(\d{4})\b', expand=False)
df.head()

: 

In [None]:
# Display the movies ordered from highest to lowest gross (df itself is left unchanged)
df.sort_values('gross', ascending=False)

: 

In [None]:
# List each company once, in descending order.
# Only the displayed Series is de-duplicated; df is not modified.
unique_companies = df['company'].drop_duplicates()
unique_companies.sort_values(ascending=False)

: 

In [None]:
# Build a scatterplot with budget (x-axis) vs gross (y-axis).
# The axis labels were previously swapped relative to the plotted data.
plt.scatter(x = df['budget'], y = df['gross'])
plt.title('Budget vs Gross Earning')
plt.xlabel('Budget for Film')
plt.ylabel('Gross Earnings')
plt.show()

: 

In [None]:
# Regression plot of gross against budget, drawn with seaborn:
# green scatter points with a blue fitted line.
point_style = {"color": "green"}
fit_line_style = {"color": "blue"}
sns.regplot(
    data=df,
    x='budget',
    y='gross',
    scatter_kws=point_style,
    line_kws=fit_line_style,
)

# Observation: the fitted line indicates a positive correlation

: 

In [None]:
# Compare three correlation measures over the numeric columns:
# Pearson (the pandas default), Kendall and Spearman.
corr_by_method = {
    method: df.corr(method=method, numeric_only=True)
    for method in ('pearson', 'kendall', 'spearman')
}
p_corr = corr_by_method['pearson']
k_corr = corr_by_method['kendall']
s_corr = corr_by_method['spearman']

# Print each matrix under its own heading
for heading, matrix in (
    ("Pearson\n", p_corr),
    ("\nKendall\n", k_corr),
    ("\nSpearman\n", s_corr),
):
    print(heading, matrix)

: 

In [None]:
# Pearson correlation between the numeric columns, drawn as an annotated heatmap
correlation_matrix = df.corr(method='pearson', numeric_only=True)

heatmap_ax = sns.heatmap(correlation_matrix, annot=True)
heatmap_ax.set_title('Correlation Matrix for Numeric Features')
heatmap_ax.set_xlabel('Movie Features')
heatmap_ax.set_ylabel('Movie Features')
plt.show()

: 

In [None]:
# Build a numerized version of the data: every object-dtype column (e.g. company)
# is replaced by its integer category codes so it can enter the correlation matrix.
# Work on a copy: a plain `df_numerized = df` only creates a second name for the
# same frame, so the loop below would overwrite the original text columns in df.
df_numerized = df.copy()

# Encode each object-dtype column as category codes
for col_name in df_numerized.columns:
    if (df_numerized[col_name].dtype == 'object'):
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

: 

In [None]:
# Show the five highest-grossing rows of the numerized data.
# Sort first, then take the head: calling head() before sort_values() would only
# reorder the first five rows of the frame instead of finding the top five.
df_numerized.sort_values('gross', ascending=False).head()

: 

In [None]:
# Create and view the correlation matrix of the numerized data
correlation_matrix = df_numerized.corr(method='pearson', numeric_only=True)

heatmap_ax = sns.heatmap(correlation_matrix, annot=True)
heatmap_ax.set_title('Correlation Matrix for Numeric Features')
heatmap_ax.set_xlabel('Movie Features')
heatmap_ax.set_ylabel('Movie Features')
plt.show()

: 

In [None]:
# Flatten the correlation matrix into a Series with one entry per
# (feature, feature) pair.
correlation_mat = df_numerized.corr(method='pearson', numeric_only=True)
corr_pairs = correlation_mat.unstack()
corr_pairs

# To view the pairs ranked from strongest to weakest instead, sort them:
# corr_pairs.sort_values(ascending=False)

: 

In [None]:
# View the correlation pairs that register above 0.5, strongest first.
# sorted_pairs is defined here; it was previously only created in a
# commented-out line, so this cell raised NameError on a fresh kernel.
sorted_pairs = corr_pairs.sort_values(ascending=False)
high_corr = sorted_pairs[sorted_pairs > 0.5]
high_corr

: 