## Problem 2 – probabilities (40%)


Next, you are to generate visualizations of various probability distributions (univariate,
multivariate), and interpret them.
You can use any of the visualization methods presented in the lectures (and reading lists).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
# Apply matplotlib's built-in 'ggplot' style sheet to the figures drawn below.
# NOTE(review): `plt` is bound to matplotlib.pylab by the import above; matplotlib's
# documentation discourages pylab in favour of matplotlib.pyplot -- consider switching.
plt.style.use("ggplot")

In [None]:
# Load the movie dataset and drop the two columns this analysis does not use.
df = pd.read_csv('Top_1000_IMDb_movies_New_version.csv')
df = df.drop(columns=['Description', 'Unnamed: 0'])

# Keep the first run of four digits found in 'Year of Release' and store it as an int.
df['Year of Release'] = df['Year of Release'].str.extract(r'(\d{4})').astype(int)

# Remove commas so 'Votes' can be parsed as an int. The comma is a literal
# character, so no regular expression is needed for this replacement.
df['Votes'] = df['Votes'].str.replace(',', '', regex=False).astype(int)

# Strip every character that is not a digit or '.' from 'Gross', then convert to float.
# The pattern must be a raw string: in a plain string literal '\d' is an invalid
# escape sequence, which Python reports as a warning (and plans to make an error).
df['Gross'] = df['Gross'].str.replace(r'[^\d.]', '', regex=True).astype(float)

In [None]:
# Preview the first five rows of the frame after the cleaning steps.
df.head(n=5)

In [None]:
# Columns whose missing values will be filled in (mean imputation).
columns_with_NaN = ['Gross', 'Metascore of movie']

# Mean of each of those columns; these become the fill values.
averages = df.loc[:, columns_with_NaN].mean()

# Replace every NaN in the selected columns with the mean of its own column.
# NOTE(review): the filled rows all share one value per column, so they will
# show up as a spike at the mean in the distribution plots further down.
df[columns_with_NaN] = df[columns_with_NaN].fillna(value=averages)

# Univariate Visualization:
For this part, I show a variety of graphs meant to help visualize the data, but not all of them are elaborated on in p2.pdf because of the sheer number of graphs. Each histogram and its KDE plot can be interpreted similarly, so I use the KDE plots to present my observations.

In [None]:
# Histogram of the 'Metascore of movie' column, split into 20 bins.
ax1 = df['Metascore of movie'].plot.hist(
    bins=20,
    title='Metascore of Movies by Distribution',
)
ax1.set_xlabel('Metascore')

In [None]:
# Kernel density estimate of the 'Metascore of movie' column.
sns.kdeplot(data=df, x='Metascore of movie')

In [None]:
# Histogram of the 'Year of Release' column, split into 25 bins.
ax1 = df['Year of Release'].plot.hist(
    bins=25,
    title='Year of Release by Distribution',
)
ax1.set_xlabel('Year of Release')

In [None]:
# Kernel density estimate of the 'Year of Release' column.
sns.kdeplot(data=df, x="Year of Release")

In [None]:
# Histogram of the 'Movie Rating' column, split into 15 bins.
ax1 = df['Movie Rating'].plot.hist(
    bins=15,
    title='Movie Rating by Distribution',
)
ax1.set_xlabel('Movie Rating')

In [None]:
# Kernel density estimate of the 'Movie Rating' column.
sns.kdeplot(data=df, x="Movie Rating")

In [None]:
# Histogram of the 'Watch Time' column, split into 20 bins.
ax1 = df['Watch Time'].plot.hist(
    bins=20,
    title='Watch Time by Distribution',
)
ax1.set_xlabel('Watch Time')

In [None]:
# Kernel density estimate of the 'Watch Time' column.
sns.kdeplot(data=df, x="Watch Time")

In [None]:
# Histogram of the 'Gross' column (after mean imputation), split into 15 bins.
ax1 = df['Gross'].plot.hist(
    bins=15,
    title='Gross by Distribution',
)
ax1.set_xlabel('Gross')

In [None]:
# Kernel density estimate of the 'Gross' column.
sns.kdeplot(data=df, x="Gross")

In [None]:
# Histogram of the 'Votes' column, split into 20 bins.
ax1 = df['Votes'].plot.hist(
    bins=20,
    title='Votes by Distribution',
)
ax1.set_xlabel('Votes')

In [None]:
# Kernel density estimate of the 'Votes' column.
sns.kdeplot(data=df, x="Votes")

# Multivariate Visualization

In [None]:
# Pair plot: draws the pairwise relationships between the six listed columns in
# one grid, which makes it easier to view the distributions and variance in the
# multivariate visualization.
sns.pairplot(
    data=df,
    vars=[
        'Year of Release', 'Watch Time', 'Movie Rating',
        'Metascore of movie', 'Gross', 'Votes',
    ],
)

In [None]:
# Scatter plot of 'Gross' against 'Year of Release'; the points are also
# coloured by 'Year of Release'.
sns.scatterplot(
    data=df,
    x="Year of Release",
    y="Gross",
    hue="Year of Release",
)

In [None]:
# Bivariate kernel density estimate of 'Year of Release' and 'Gross'.
sns.kdeplot(
    data=df,
    x="Year of Release",
    y="Gross",
)

In [None]:
# Scatter plot of 'Votes' against 'Gross'; the points are also coloured by 'Gross'.
sns.scatterplot(
    data=df,
    x="Gross",
    y="Votes",
    hue="Gross",
)

In [None]:
# Bivariate kernel density estimate of 'Gross' and 'Votes'.
sns.kdeplot(
    data=df,
    x="Gross",
    y="Votes",
)

In [None]:
# Scatter plot of 'Metascore of movie' against 'Year of Release'; the points
# are also coloured by 'Year of Release'.
sns.scatterplot(
    data=df,
    x="Year of Release",
    y="Metascore of movie",
    hue="Year of Release",
)

In [None]:
# Bivariate kernel density estimate of 'Year of Release' and 'Metascore of movie'.
sns.kdeplot(
    data=df,
    x="Year of Release",
    y="Metascore of movie",
)

In [None]:
# Scatter plot of 'Movie Rating' against 'Watch Time'; the points are also
# coloured by 'Watch Time'.
sns.scatterplot(
    data=df,
    x="Watch Time",
    y="Movie Rating",
    hue="Watch Time",
)

In [None]:
# Bivariate kernel density estimate of 'Watch Time' and 'Movie Rating'.
sns.kdeplot(
    data=df,
    x="Watch Time",
    y="Movie Rating",
)

In [None]:
# Scatter plot of 'Movie Rating' against 'Metascore of movie'; the points are
# also coloured by 'Metascore of movie'.
sns.scatterplot(
    data=df,
    x="Metascore of movie",
    y="Movie Rating",
    hue="Metascore of movie",
)

In [None]:
# Bivariate kernel density estimate of 'Metascore of movie' and 'Movie Rating'.
sns.kdeplot(
    data=df,
    x="Metascore of movie",
    y="Movie Rating",
)
# Render the figure explicitly.
plt.show()

In [None]:
# Compute the pairwise correlation coefficients between the six listed columns.
# The resulting matrix is drawn as a heatmap in the next cell so the
# coefficients are easier to read.
df_corr = df.loc[
    :,
    [
        'Year of Release', 'Watch Time', 'Movie Rating',
        'Metascore of movie', 'Gross', 'Votes',
    ],
].corr()

In [None]:
# Heatmap of the correlation matrix; annot=True writes each coefficient in its cell.
sns.heatmap(data=df_corr, annot=True)