# References
- [Handling imbalanced data (kdnuggets)](https://www.kdnuggets.com/2017/06/7-techniques-handle-imbalanced-data.html)
- [Resampling strategies for imbalanced datasets (Kaggle)](https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets)
- [Handling Imbalanced Data (TensorFlow)](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data)

# Import modules

In [1]:
# For data processing
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For data splitting
from sklearn.model_selection import train_test_split

# For handling imbalanced dataset
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Data Splitting

In [2]:
# Select several columns
cols = ['popularity', 'original_language',
        'top_casts_popularity_avg', 'casts_popularity_sum',
        'crews_popularity_sum', 'genres', 'decade_released', 
        'budget', 'revenue', 'vote_count', 'vote_average']

df = pd.read_csv('cleaned-movie-dataset.csv', index_col=0)[cols]

# Create a new column 'in_en', 1 if the original language is in English and 0 otherwise
df['in_en'] = (df['original_language'] == 'en').astype('int64')

df = df.drop(columns=['original_language'])

# Rearrange columns
df = df[['popularity', 'top_casts_popularity_avg', 'casts_popularity_sum',
        'crews_popularity_sum', 'budget', 'decade_released', 'genres',
        'revenue', 'vote_count', 'vote_average']]

df.head()

Unnamed: 0,popularity,top_casts_popularity_avg,casts_popularity_sum,crews_popularity_sum,budget,decade_released,genres,revenue,vote_count,vote_average
0,283.822,30.3952,755.45,473.269,356000000.0,2010s,"['Adventure', 'Science Fiction', 'Action']",2797801000.0,17799,8.3
1,102.112,10.2648,108.804,878.826,237000000.0,2000s,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",2787965000.0,23178,7.5
2,86.808,11.9526,201.581,122.649,200000000.0,1990s,"['Drama', 'Romance']",2187464000.0,18982,7.9
3,53.93,16.0132,391.303,235.947,245000000.0,2010s,"['Action', 'Adventure', 'Science Fiction', 'Fa...",2068224000.0,15675,7.4
4,299.524,30.3952,528.712,501.005,300000000.0,2010s,"['Adventure', 'Action', 'Science Fiction']",2046240000.0,21489,8.3


In [3]:
# Split the DF into predictors and response
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Define good as vote_average >= 7.2 (80% percentile)
y = (y >= 7.2).astype('int64')

# Split the dataset into 70% train and 30% test set
# Set random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=20)

In [4]:
print("Good vs Not Good Movies Train Set Proportion")
print(y_train.value_counts()[1], ':', y_train.value_counts()[0])

Good vs Not Good Movies Train Set Proportion
1018 : 3931


## Random Oversampling

In [5]:
# Define oversampling strategy (creates 1:2 ratio)
over_smp = RandomOverSampler(sampling_strategy=0.5)

# Fit and apply the transform
X_res, y_res = over_smp.fit_resample(X_train, y_train)

In [6]:
print("Good vs Not Good Movies Train Set Proportion (After oversampling)")
print(y_res.value_counts()[1], ':', y_res.value_counts()[0])

Good vs Not Good Movies Train Set Proportion (After oversampling)
1965 : 3931


In [7]:
# Combine the X and y to a single DF and rename column
oversampled_train_df = pd.concat([X_res, y_res], axis=1).rename(columns={'vote_average': 'is_good'})
oversampled_train_df.head()

Unnamed: 0,popularity,top_casts_popularity_avg,casts_popularity_sum,crews_popularity_sum,budget,decade_released,genres,revenue,vote_count,is_good
0,1.68,2.4858,34.059,8.671,3300000.0,2010s,"['Comedy', 'Action', 'Drama']",10000000.0,8,0
1,18.013,5.9504,46.018,9.456,26000000.0,2010s,"['Thriller', 'Adventure', 'Mystery', 'Fantasy'...",34592118.0,200,0
2,17.237,11.428,182.113,198.986,65000000.0,2000s,"['Drama', 'History', 'Thriller']",163247198.0,1122,0
3,11.931,8.5968,110.585,44.527,20000000.0,2000s,"['Action', 'Drama', 'Thriller']",16605763.0,129,0
4,14.717,7.8144,99.835,75.898,51000000.0,2000s,"['Comedy', 'Family']",47231070.0,405,0


In [8]:
# Save as csv
oversampled_train_df.to_csv('oversampled-train.csv')

## SMOTETomek Resampling
This only applies for numerical variables.

In [9]:
# Select numerical variables only
X_train = X_train[['popularity', 'top_casts_popularity_avg', 'casts_popularity_sum',
                   'crews_popularity_sum', 'budget',
                   'revenue', 'vote_count']]
X_train.head()

Unnamed: 0,popularity,top_casts_popularity_avg,casts_popularity_sum,crews_popularity_sum,budget,revenue,vote_count
5527,1.68,2.4858,34.059,8.671,3300000.0,10000000.0,8
3263,18.013,5.9504,46.018,9.456,26000000.0,34592118.0,200
1022,17.237,11.428,182.113,198.986,65000000.0,163247198.0,1122
4603,11.931,8.5968,110.585,44.527,20000000.0,16605763.0,129
2733,14.717,7.8144,99.835,75.898,51000000.0,47231070.0,405


In [10]:
# Set random_state for reproducibility
smt = SMOTETomek(random_state=20)
X_res, y_res = smt.fit_resample(X_train, y_train)

In [11]:
print("Good vs Not Good Movies Train Set Proportion (After SMOTETomek resampling)")
print(y_res.value_counts()[1], ':', y_res.value_counts()[0])

Good vs Not Good Movies Train Set Proportion (After SMOTETomek resampling)
3431 : 3431


In [12]:
# Combine the X and y to a single DF and rename column
smotetomek_train_df = pd.concat([X_res, y_res], axis=1).rename(columns={'vote_average': 'is_good'})
smotetomek_train_df.head()

Unnamed: 0,popularity,top_casts_popularity_avg,casts_popularity_sum,crews_popularity_sum,budget,revenue,vote_count,is_good
0,1.68,2.4858,34.059,8.671,3300000.0,10000000.0,8,0
1,18.013,5.9504,46.018,9.456,26000000.0,34592118.0,200,0
2,11.931,8.5968,110.585,44.527,20000000.0,16605763.0,129,0
3,14.717,7.8144,99.835,75.898,51000000.0,47231070.0,405,0
4,23.357,5.9202,99.896,62.278,32000000.0,83615414.0,897,0


In [13]:
# Save as csv
smotetomek_train_df.to_csv('smotetomek-train.csv')

## SMOTEENN Resampling
Similar with SMOTETomek resampling, this only applies for numerical variables.

In [14]:
X_train = X_train[['popularity', 'top_casts_popularity_avg', 'casts_popularity_sum',
                   'crews_popularity_sum', 'budget',
                   'revenue', 'vote_count']]
X_train.head()

Unnamed: 0,popularity,top_casts_popularity_avg,casts_popularity_sum,crews_popularity_sum,budget,revenue,vote_count
5527,1.68,2.4858,34.059,8.671,3300000.0,10000000.0,8
3263,18.013,5.9504,46.018,9.456,26000000.0,34592118.0,200
1022,17.237,11.428,182.113,198.986,65000000.0,163247198.0,1122
4603,11.931,8.5968,110.585,44.527,20000000.0,16605763.0,129
2733,14.717,7.8144,99.835,75.898,51000000.0,47231070.0,405


In [15]:
# Set random_state for reproducibility
sme = SMOTEENN(random_state=20)
X_res, y_res = sme.fit_resample(X_train, y_train)

In [16]:
print("Good vs Not Good Movies Train Set Proportion (After SMOTEENN resampling)")
print(y_res.value_counts()[1], ':', y_res.value_counts()[0])

Good vs Not Good Movies Train Set Proportion (After SMOTEENN resampling)
1939 : 1756


In [17]:
# Combine the X and y to a single DF and rename column
smoteenn_train_df = pd.concat([X_res, y_res], axis=1).rename(columns={'vote_average': 'is_good'})
smoteenn_train_df.head()

Unnamed: 0,popularity,top_casts_popularity_avg,casts_popularity_sum,crews_popularity_sum,budget,revenue,vote_count,is_good
0,1.68,2.4858,34.059,8.671,3300000.0,10000000.0,8,0
1,11.931,8.5968,110.585,44.527,20000000.0,16605763.0,129,0
2,14.717,7.8144,99.835,75.898,51000000.0,47231070.0,405,0
3,12.05,4.4022,33.433,10.263,17500000.0,25024919.0,380,0
4,7.503,6.0656,36.672,13.52,160000.0,4000000.0,51,0


In [18]:
# Save as csv
smoteenn_train_df.to_csv('smoteenn-train.csv')