# Exploratory Data Analysis (EDA)

This notebook gathers insights for building a regression/classification model that predicts movie scores (```imdb_score```).

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import os
import itertools

In [None]:
# Show the current working directory, so the relative data path used when
# reading the CSV can be checked
os.getcwd()

In [None]:
# Read the movie metadata CSV; the path is relative to the working directory
imdb = pd.read_csv('../data/movie_metadata.csv')

In [None]:
# Inspect the dtype pandas inferred for each column
imdb.dtypes

In [None]:
# Preview the first rows of the data
imdb.head()

In [None]:
# Treat aspect_ratio as a categorical (object) variable instead of a number
imdb['aspect_ratio'] = imdb['aspect_ratio'].astype('object')

# Bucket imdb_score into four ordered classes:
#   (0, 4] -> 1, (4, 6] -> 2, (6, 8] -> 3, (8, 10] -> 4
# labels=False returns 0-based bin codes, hence the + 1
imdb['categorical_imdb_score'] = pd.cut(imdb['imdb_score'], bins=[0, 4, 6, 8, 10], right=True, labels=False) + 1

# Remove uninformative columns. errors='ignore' keeps the cell re-runnable:
# a second execution no longer raises KeyError for the already-dropped columns.
imdb = imdb.drop(columns=['movie_imdb_link', 'movie_title'], errors='ignore')

imdb.head()

In [None]:
# Count missing values per column, most incomplete columns first
imdb.isna().sum().sort_values(ascending=False)

## Handling multi-valued categorical variables

Notice that ```genres``` and ```plot_keywords``` hold multiple categories for the same movie. To make these variables usable for modeling ```categorical_imdb_score```, let's transform each of them into dummy variables.

### ```genres```

In [None]:
# Split each movie's '|'-separated genres string into a list of genres
genres_list = imdb['genres'].str.split('|').tolist()

# Distinct genres across all movies
unique_genres = {genre for movie_genres in genres_list for genre in movie_genres}

len(unique_genres)

The number of unique genres is not too large, so let's turn them into dummy variables and explore their relation with ```categorical_imdb_score```

In [None]:
# One 0/1 indicator column per genre; genres are separated by '|'
genres_dummy = imdb['genres'].str.get_dummies(sep='|')

# The raw genres column is dropped now that it is encoded as dummies
imdb.drop(columns=['genres'], inplace=True)

genres_dummy.head()

In [None]:
# Create DF with dummy variables and categorical_imdb_score
genres_df = pd.concat([imdb['categorical_imdb_score'], genres_dummy], axis=1)

# Variables to plot: every genre dummy (the target column is excluded)
all_variables = genres_df.columns.tolist()
genre_vars = [var for var in all_variables if var != 'categorical_imdb_score']

# Square grid with one panel per genre. add_subplot needs integer grid
# sizes, so the float returned by np.ceil is cast to int.
n_vars = len(genre_vars)
n_row = int(np.ceil(np.sqrt(n_vars)))

fig = plt.figure(figsize=(15, 12))

for i, var in enumerate(genre_vars, start=1):
    ax = fig.add_subplot(n_row, n_row, i)
    # Draw on the axes just created instead of the implicit current axes
    sns.countplot(x='categorical_imdb_score', hue=var, data=genres_df, ax=ax)
    ax.set_xlabel("")
fig.tight_layout(pad=.5)
plt.show()

### ```plot_keywords```

In [None]:
# plot_keywords in list of lists (movies without keywords stay as NaN)
keywords_list = imdb['plot_keywords'].str.split('|').tolist()

# For each movie replace the spaces inside a keyword by '-'. Movies without
# keywords get the single placeholder keyword 'nan'. The placeholder is
# wrapped in a list: a bare string would be flattened character by character
# below, adding 'n' and 'a' as spurious keywords.
clean_keywords = list()
for line in keywords_list:
    if isinstance(line, list):
        clean_line = [word.replace(' ', '-') for word in line]
    else:
        clean_line = ['nan']
    clean_keywords.append(clean_line)

# Unique plot_keywords
unique_keywords = set(itertools.chain.from_iterable(clean_keywords))

len(unique_keywords)

## Analysis of ```imdb_score``` variables

In [None]:
# Histogram of imdb_score
hist = (
    ggplot(imdb, aes(x='imdb_score'))
    + geom_histogram()
)

# Q-Q plot of imdb_score together with its reference line
qq_plot = (
    ggplot(imdb, aes(sample='imdb_score'))
    + geom_qq()
    + geom_qq_line()
)

print(hist)
print(qq_plot)

As we can see above, the distribution of ```imdb_score``` is not even close to normal, so let's apply the Box-Cox transformation to bring it closer to a normal distribution.

In [None]:
# Box-Cox transformation of imdb_score; the result holds the transformed
# values first and the fitted lambda second
score_values = np.asarray(imdb['imdb_score'])
bc = stats.boxcox(score_values)
bc_values, bc_lambda = bc
imdb['bc_imdb_score'] = bc_values

# Lambda value
bc_lambda

In [None]:
# Histogram of the Box-Cox transformed score
hist = (
    ggplot(imdb, aes(x='bc_imdb_score'))
    + geom_histogram()
)

# Q-Q plot of the transformed score together with its reference line
qq_plot = (
    ggplot(imdb, aes(sample='bc_imdb_score'))
    + geom_qq()
    + geom_qq_line()
)

print(hist)
print(qq_plot)


In [None]:
# Number of movies in each categorical_imdb_score class
imdb['categorical_imdb_score'].value_counts()

## Relation between ```categorical_imdb_score``` and other variables

### Categorical variables

In [None]:
# Selecting only categorical variables
categorical = imdb.select_dtypes(include=['object'])

# Number of unique values of each categorical variable.
# nunique(dropna=False) counts all missing values as one single level. A
# Python set can count every NaN object separately (NaN != NaN), which
# inflates the result for columns cast with astype('object').
for var, n_unique in categorical.nunique(dropna=False).items():
    print(var + ':', n_unique)

Let's analyze the influence of each level of the categorical variables on ```categorical_imdb_score```

#### ```aspect_ratio```

In [None]:
# Share of each aspect_ratio level within every score class.
# data and mapping are passed by keyword so the call does not depend on the
# positional order of ggplot's parameters.
(
    ggplot(data=imdb, mapping=aes(x='categorical_imdb_score', fill='aspect_ratio'))
    + geom_bar(position='fill')
)

#### ```content_rating```

In [None]:
# Share of each content_rating level within every score class.
# data and mapping are passed by keyword so the call does not depend on the
# positional order of ggplot's parameters.
(
    ggplot(data=imdb, mapping=aes(x='categorical_imdb_score', fill='content_rating'))
    + geom_bar(position='fill')
)

Let's try to work with four groups in ```content_rating``` variable:
- PG-13
- R
- PG
- Other

In [None]:
# Collapse every rating other than PG-13, R and PG (missing values included)
# into 'Other'. One .loc assignment replaces the row-by-row chained
# assignment imdb[col][i] = ..., which triggers SettingWithCopyWarning and is
# not guaranteed to write back into imdb.
main_ratings = ['PG-13', 'R', 'PG']
imdb.loc[~imdb['content_rating'].isin(main_ratings), 'content_rating'] = 'Other'

In [None]:
# Share of each content_rating level within every score class.
# data and mapping are passed by keyword so the call does not depend on the
# positional order of ggplot's parameters.
(
    ggplot(data=imdb, mapping=aes(x='categorical_imdb_score', fill='content_rating'))
    + geom_bar(position='fill')
)

#### ```language```

In [None]:
# Share of each language within every score class.
# data and mapping are passed by keyword so the call does not depend on the
# positional order of ggplot's parameters.
(
    ggplot(data=imdb, mapping=aes(x='categorical_imdb_score', fill='language'))
    + geom_bar(position='fill')
)

As ```categorical_imdb_score``` grows, languages other than English appear more frequently. So let's work with 2 levels, "English" and "Other"

In [None]:
# Collapse every language other than English (missing values included) into
# 'Other'. One .loc assignment replaces the row-by-row chained assignment
# imdb[col][i] = ..., which triggers SettingWithCopyWarning and is not
# guaranteed to write back into imdb.
imdb.loc[imdb['language'].ne('English'), 'language'] = 'Other'

In [None]:
# Share of each language level within every score class.
# data and mapping are passed by keyword so the call does not depend on the
# positional order of ggplot's parameters.
(
    ggplot(data=imdb, mapping=aes(x='categorical_imdb_score', fill='language'))
    + geom_bar(position='fill')
)

#### ```country```

In [None]:
# Movie counts of the ten most frequent countries
imdb['country'].value_counts().head(10)

In [None]:
# Group the levels on a copy, so imdb itself is not modified by this cell
imdb_country = imdb.copy()

# Keep the six listed countries and collapse all others (missing values
# included) into 'Other'. One .loc assignment replaces the row-by-row chained
# assignment, which triggers SettingWithCopyWarning and is not guaranteed to
# write back into the DataFrame.
main_countries = ['USA', 'UK', 'France', 'Canada', 'Germany', 'Australia']
imdb_country.loc[~imdb_country['country'].isin(main_countries), 'country'] = 'Other'

In [None]:
# Share of each country level within every score class.
# data and mapping are passed by keyword so the call does not depend on the
# positional order of ggplot's parameters.
(
    ggplot(data=imdb_country, mapping=aes(x='categorical_imdb_score', fill='country'))
    + geom_bar(position='fill')
)

In [None]:
# Adopt the regrouped country levels in the main DataFrame
imdb = imdb_country.copy()

#### ```director_name```

In [None]:
# Count the number of movies per director
director_values = imdb['director_name'].value_counts()

# Keep the k directors with the most movies
k = 5
top_directors = list(director_values[:k].index)

# Replace every director outside the top k by NaN in one vectorized step,
# so only the top directors produce dummy columns
director_dummy = imdb['director_name'].where(imdb['director_name'].isin(top_directors))

# Dummy variables of the top directors
director_dummy = director_dummy.str.get_dummies()

director_values.head(k)

In [None]:
# Create DF with dummy variables and categorical_imdb_score
director_df = pd.concat([imdb['categorical_imdb_score'], director_dummy], axis=1)

# Variables to plot: every director dummy (the target column is excluded)
all_variables = director_df.columns.tolist()
director_vars = [var for var in all_variables if var != 'categorical_imdb_score']

# Square grid with one panel per director. add_subplot needs integer grid
# sizes, so the float returned by np.ceil is cast to int.
n_vars = len(director_vars)
n_row = int(np.ceil(np.sqrt(n_vars)))

fig = plt.figure(figsize=(10, 8))

for i, var in enumerate(director_vars, start=1):
    ax = fig.add_subplot(n_row, n_row, i)
    # Draw on the axes just created instead of the implicit current axes
    sns.countplot(x='categorical_imdb_score', hue=var, data=director_df, ax=ax)
    ax.set_xlabel("")
fig.tight_layout(pad=.5)
plt.show()


It seems that the most frequent directors made movies with a ```categorical_imdb_score``` of 3 or above.

#### ```actor_1_name```

In [None]:
# Count the number of movies per actor_1_name
actor1_values = imdb['actor_1_name'].value_counts()

# Keep the k lead actors with the most movies
k = 5
top_actors = list(actor1_values[:k].index)

# Replace every actor outside the top k by NaN in one vectorized step,
# so only the top actors produce dummy columns
actor1_dummy = imdb['actor_1_name'].where(imdb['actor_1_name'].isin(top_actors))

# Dummy variables of the top actors
actor1_dummy = actor1_dummy.str.get_dummies()

actor1_values.head(k)

In [None]:
# Create DF with dummy variables and categorical_imdb_score
actor1_df = pd.concat([imdb['categorical_imdb_score'], actor1_dummy], axis=1)

# Variables to plot: every actor dummy (the target column is excluded)
all_variables = actor1_df.columns.tolist()
actor_vars = [var for var in all_variables if var != 'categorical_imdb_score']

# Square grid with one panel per actor. add_subplot needs integer grid
# sizes, so the float returned by np.ceil is cast to int.
n_vars = len(actor_vars)
n_row = int(np.ceil(np.sqrt(n_vars)))

fig = plt.figure(figsize=(10, 8))

for i, var in enumerate(actor_vars, start=1):
    ax = fig.add_subplot(n_row, n_row, i)
    # Draw on the axes just created instead of the implicit current axes
    sns.countplot(x='categorical_imdb_score', hue=var, data=actor1_df, ax=ax)
    ax.set_xlabel("")
fig.tight_layout(pad=.5)
plt.show()

### Quantitative variables

Before exploring the relation between ```categorical_imdb_score``` and the quantitative variables, let's check their distributions

In [None]:
# Quantitative predictors: every float64/int64 column except the score columns
score_columns = ['imdb_score', 'bc_imdb_score', 'categorical_imdb_score']
numeric_columns = imdb.select_dtypes(include=['float64', 'int64'])
quantitative = numeric_columns.drop(columns=score_columns)

# Names of the variables to plot
quantitative_vars = list(quantitative.columns)

quantitative_vars

#### Distribution of quantitative variables

In [None]:
# Square grid with one panel per variable. add_subplot needs integer grid
# sizes, so the float returned by np.ceil is cast to int.
n_vars = len(quantitative_vars)
n_row = int(np.ceil(np.sqrt(n_vars)))

fig = plt.figure(figsize=(14, 10))

for i, var in enumerate(quantitative_vars, start=1):
    ax = fig.add_subplot(n_row, n_row, i)
    # Histogram only (no KDE), drawn on the axes just created
    sns.distplot(quantitative[var], kde=False, ax=ax)
fig.tight_layout(pad=.5)
plt.show()

The distributions of the variables are heavily skewed (long-tailed), so let's apply a log transformation to bring them closer to a normal distribution

In [None]:
# Log scale for every quantitative variable; the small offset keeps the
# logarithm finite where a value is 0
log_quantitative = np.log(quantitative + 0.0001)

# Names of the variables to plot
quantitative_vars = list(log_quantitative.columns)

quantitative_vars

In [None]:
# Square grid with one panel per variable. The size is computed here, as an
# int (add_subplot needs integers), so the cell does not depend on an n_row
# value left over from an earlier cell.
n_row = int(np.ceil(np.sqrt(len(quantitative_vars))))

fig = plt.figure(figsize=(14, 10))

for i, var in enumerate(quantitative_vars, start=1):
    ax = fig.add_subplot(n_row, n_row, i)
    # Histogram only (no KDE), drawn on the axes just created
    sns.distplot(log_quantitative[var], kde=False, ax=ax)
fig.tight_layout(pad=.5)
plt.show()

#### Relation between ```categorical_imdb_score``` and quantitative variables

In [None]:
# Adding categorical_imdb_score after processing the other variables
log_quantitative_df = pd.concat([imdb['categorical_imdb_score'], log_quantitative], axis=1)

# Variables to plot; the target column is filtered out in case it is listed
boxplot_vars = [var for var in quantitative_vars if var != 'categorical_imdb_score']

# Square grid with one panel per variable. The size is computed here, as an
# int (add_subplot needs integers), so the cell does not depend on an n_row
# value left over from an earlier cell.
n_row = int(np.ceil(np.sqrt(len(quantitative_vars))))

fig = plt.figure(figsize=(15, 12))

for i, var in enumerate(boxplot_vars, start=1):
    ax = fig.add_subplot(n_row, n_row, i)
    # One box per score class, drawn on the axes just created
    sns.boxplot(x='categorical_imdb_score', y=var, data=log_quantitative_df, ax=ax)
    ax.set_xlabel("")
fig.tight_layout(pad=.8)
plt.show()

Notice that a few variables are related to values of ```categorical_imdb_score``` such as:
- ```num_critic_for_reviews```
- ```duration```
- ```director_facebook_likes```
- ```gross```
- ```num_voted_users```
- ```num_users_for_reviews```
- ```title_year```
- ```movie_facebook_likes```

And others may be related:
- ```actor_3_facebook_likes```
- ```actor_1_facebook_likes```
- ```budget```
- ```actor_2_facebook_likes```