In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn.preprocessing as pre
import matplotlib.pyplot as plt
import math

In [2]:
# Load the interim movie dataset and preview its first five rows.
movie_data = pd.read_csv(filepath_or_buffer="../../data/interim/movie-data.csv")
movie_data.head(n=5)

Unnamed: 0,revenue,class,id,budget,starpower,certification,year,month,Action,Adventure,...,Horror,Music,Mystery,Romance,Science Fiction,Thriller,War,Western,rating,ratingClass
0,191502426,8,tt0101272,30000000,11000920558,11,1991,11,0,0,...,1,0,0,0,0,0,0,0,6.8,6
1,152368585,8,tt0101393,75000000,19781169355,6,1991,5,1,0,...,0,0,1,0,0,1,0,0,6.7,6
2,6153939,2,tt0101410,9000000,22026008332,6,1991,8,0,0,...,0,0,0,0,0,0,0,0,7.7,7
3,377350553,9,tt0101414,25000000,27872758981,14,1991,11,0,0,...,0,1,0,1,0,0,0,0,8.0,8
4,57504069,5,tt0101507,6500000,13348646490,6,1991,7,0,0,...,0,0,0,0,0,0,0,0,7.8,7


In [3]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
norm_data = movie_data.copy(deep=True)

In [4]:
# Applying log transformation and standard scaling
#
# Columns named here are log-transformed and then standardized.
log_std_columns = [
    'revenue',
    'starpower',
    'budget',
]

def log(data):
    """Return the natural log of ``1 + x`` for every element of ``data``.

    Uses the vectorized ``np.log1p`` ufunc instead of a per-element Python
    lambda: it runs in compiled code over the whole column at once and stays
    accurate for values close to zero, where ``np.log(x + 1)`` rounds to 0.

    Parameters
    ----------
    data : pandas.Series or array-like of numbers
        Values to transform. A Series keeps its index and name.

    Returns
    -------
    Same container kind as ``data`` (a Series for a Series input) holding
    ``log(1 + x)`` as floats.
    """
    return np.log1p(data)

def std(data):
    """Standardize ``data`` with scikit-learn's ``StandardScaler``.

    The input is copied into a NumPy array and reshaped into a single
    column (``reshape(-1, 1)``) before being handed to the scaler; the
    value returned is whatever ``fit_transform`` produces for that
    one-column input.
    """
    column_vector = np.array(data).reshape(-1, 1)
    scaler = pre.StandardScaler()
    return scaler.fit_transform(column_vector)

# Log-transform each listed column, then standardize the logged values.
for column in log_std_columns:
    norm_data[column] = std(log(norm_data[column]))

In [5]:
# Applying MinMax transform
#
# Columns named here are rescaled with the min-max scaler.
min_max_columns = [
    'year',
    'certification',
]

def minmax(data):
    """Rescale ``data`` into the range [-1, 1] with ``MinMaxScaler``.

    The input is copied into a NumPy array and reshaped into a single
    column (``reshape(-1, 1)``) before scaling; the value returned is
    whatever ``fit_transform`` produces for that one-column input.
    """
    column_vector = np.array(data).reshape(-1, 1)
    scaler = pre.MinMaxScaler(feature_range=(-1, 1))
    return scaler.fit_transform(column_vector)

# Replace each listed column with its min-max scaled version.
for column in min_max_columns:
    scaled_values = minmax(norm_data[column])
    norm_data[column] = scaled_values



In [6]:
# Applying Cyclic Transformation
#
# Map the month onto a circle with a period of 12 so that values 12 apart
# land on the same (sin, cos) point. The angle is computed once for the
# whole column and fed to the vectorized NumPy ufuncs rather than calling
# a Python lambda per row.
# NOTE(review): assumes 'month' holds calendar months 1-12 -- confirm.
month_angle = norm_data['month'] * 2 * math.pi / 12
norm_data['month1'] = np.sin(month_angle)  # sine component
norm_data['month2'] = np.cos(month_angle)  # cosine component

In [7]:
# Normalizing rating
#
# Round each rating to a whole number, then divide by 10.
# NOTE(review): presumably ratings are on a 0-10 scale, giving 0-1 -- confirm.
norm_data['rating'] = norm_data['rating'].round().div(10)

In [8]:
# Removing unused columns
#
# The raw 'month' column is dropped now that 'month1'/'month2' carry it.
norm_data = norm_data.drop(columns=['month'])

In [9]:
# Display the column labels of the transformed frame as a final check.
norm_data.columns

Index(['revenue', 'class', 'id', 'budget', 'starpower', 'certification',
       'year', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Mystery', 'Romance', 'Science Fiction', 'Thriller', 'War',
       'Western', 'rating', 'ratingClass', 'month1', 'month2'],
      dtype='object')

In [10]:
# Persist the normalized frame; index=False keeps the row index out of the file.
norm_data.to_csv(
    path_or_buf='../../data/processed/normalized-movie-data.csv',
    index=False,
)