# II. Analytic and modelling

## Import libraries and dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [53]:
df = pd.read_csv('https://file.io/Z3Ns20ARTPE2')
df.head()

Unnamed: 0,Movie,Release Date,Critic 1,Critic 2,Critic 3,Critic 4,Critic 5,Audience Rating
0,Iron Man,2-May-08,4.0,4.0,7.0,6.0,5.0,9.1
1,The Incredible Hulk,13-Jun-08,,,6.0,3.0,5.0,7.0
2,Iron Man 2,7-May-10,3.0,6.0,8.0,6.0,5.0,7.1
3,Thor,6-May-11,4.0,6.0,8.0,5.0,,7.6
4,Captain America: The First Avenger,22-Jul-11,5.0,5.0,,7.0,7.0,7.5


## Check dataframe

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Movie            22 non-null     object 
 1   Release Date     22 non-null     object 
 2   Critic 1         16 non-null     float64
 3   Critic 2         18 non-null     float64
 4   Critic 3         19 non-null     float64
 5   Critic 4         17 non-null     float64
 6   Critic 5         15 non-null     float64
 7   Audience Rating  22 non-null     float64
dtypes: float64(6), object(2)
memory usage: 1.5+ KB


In [55]:
df.describe()

Unnamed: 0,Critic 1,Critic 2,Critic 3,Critic 4,Critic 5,Audience Rating
count,16.0,18.0,19.0,17.0,15.0,22.0
mean,4.75,6.666667,7.578947,6.882353,6.866667,8.186364
std,1.612452,2.057983,1.346427,1.964763,2.099887,1.080254
min,3.0,3.0,5.0,3.0,4.0,4.5
25%,3.0,5.25,6.5,5.0,5.0,7.65
50%,4.5,6.5,8.0,7.0,7.0,8.55
75%,6.25,8.0,8.5,9.0,8.5,8.975
max,7.0,10.0,10.0,10.0,10.0,9.2


In [56]:
df.isna().sum()

Movie              0
Release Date       0
Critic 1           6
Critic 2           4
Critic 3           3
Critic 4           5
Critic 5           7
Audience Rating    0
dtype: int64

## Analytics

### 1. Compute average critics rating for each film

In [57]:
df['Mean'] = df.iloc[:, 2:6].mean(axis=1)
df

Unnamed: 0,Movie,Release Date,Critic 1,Critic 2,Critic 3,Critic 4,Critic 5,Audience Rating,Mean
0,Iron Man,2-May-08,4.0,4.0,7.0,6.0,5.0,9.1,5.25
1,The Incredible Hulk,13-Jun-08,,,6.0,3.0,5.0,7.0,4.5
2,Iron Man 2,7-May-10,3.0,6.0,8.0,6.0,5.0,7.1,5.75
3,Thor,6-May-11,4.0,6.0,8.0,5.0,,7.6,5.75
4,Captain America: The First Avenger,22-Jul-11,5.0,5.0,,7.0,7.0,7.5,5.666667
5,Marvel's The Avengers,4-May-12,3.0,8.0,8.0,,9.0,9.1,6.333333
6,Iron Man 3,3-May-13,3.0,,8.0,,5.0,7.8,5.5
7,Thor: The Dark World,8-Nov-13,3.0,3.0,9.0,5.0,5.0,7.5,5.0
8,Captain America: The Winter Soldier,4-Apr-14,7.0,7.0,9.0,9.0,,9.2,8.0
9,Guardians of the Galaxy,1-Aug-14,5.0,6.0,,9.0,8.0,9.2,6.666667


### 2. Based on audience rating, separate critics into appropriate groups, and possible characteristics of those groups. Explain clearly what your method is and why you chose it.

## Modelling

### 1. Fill in the missing values of critics rating by following step

#### 1. Normalize the values for the critics and transform values to z-scores

In [36]:
from scipy.stats import zscore

In [48]:
df['Critic 1'] = (df['Critic 1'] - df['Critic 1'].mean())/df['Critic 1'].std(ddof=0)
df['Critic 2'] = (df['Critic 2'] - df['Critic 2'].mean())/df['Critic 2'].std(ddof=0)
df['Critic 3'] = (df['Critic 3'] - df['Critic 3'].mean())/df['Critic 3'].std(ddof=0)
df['Critic 4'] = (df['Critic 4'] - df['Critic 4'].mean())/df['Critic 4'].std(ddof=0)
df['Critic 5'] = (df['Critic 5'] - df['Critic 5'].mean())/df['Critic 5'].std(ddof=0)
df

Unnamed: 0,Movie,Release Date,Critic 1,Critic 2,Critic 3,Critic 4,Critic 5,Audience Rating,Mean
0,Iron Man,2-May-08,-0.920137,-1.333333,-0.441771,-0.46291,-0.920137,9.1,5.25
1,The Incredible Hulk,13-Jun-08,-0.920137,,-1.204829,-2.036804,-0.920137,7.0,4.5
2,Iron Man 2,7-May-10,-0.920137,-0.333333,0.321288,-0.46291,-0.920137,7.1,5.75
3,Thor,6-May-11,,-0.333333,0.321288,-0.987541,,7.6,5.75
4,Captain America: The First Avenger,22-Jul-11,0.065724,-0.833333,,0.061721,0.065724,7.5,5.666667
5,Marvel's The Avengers,4-May-12,1.051585,0.666667,0.321288,,1.051585,9.1,6.333333
6,Iron Man 3,3-May-13,-0.920137,,0.321288,,-0.920137,7.8,5.5
7,Thor: The Dark World,8-Nov-13,-0.920137,-1.833333,1.084346,-0.987541,-0.920137,7.5,5.0
8,Captain America: The Winter Soldier,4-Apr-14,,0.166667,1.084346,1.110984,,9.2,8.0
9,Guardians of the Galaxy,1-Aug-14,0.558655,-0.333333,,1.110984,0.558655,9.2,6.666667


#### 2. Compute the mean and covariance matrix for the critics using normalized values

In [51]:
cov_df = df[['Critic 1', 'Critic 2', 'Critic 3', 'Critic 4', 'Critic 5']].cov()
cov_df

Unnamed: 0,Critic 1,Critic 2,Critic 3,Critic 4,Critic 5
Critic 1,1.071429,0.881587,0.398931,0.623007,1.071429
Critic 2,0.881587,1.058824,-0.172597,0.406445,0.881587
Critic 3,0.398931,-0.172597,1.055556,0.314541,0.398931
Critic 4,0.623007,0.406445,0.314541,1.0625,0.623007
Critic 5,1.071429,0.881587,0.398931,0.623007,1.071429


In [52]:
mean_df = df[['Critic 1', 'Critic 2', 'Critic 3', 'Critic 4', 'Critic 5']].mean()
mean_df

Critic 1   -7.401487e-17
Critic 2   -2.467162e-16
Critic 3    3.038505e-16
Critic 4   -2.089832e-16
Critic 5    2.368476e-16
dtype: float64

#### 3. Assuming critics normalized values follow a multivariate normal distribution, fill in the missing values by maximizing log likelihood of the joint distribution