## Profiles

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import warnings
# Suppress every warning emitted from this point on.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd
import os
import numpy as np
"""
The FeatureExtractor module must also be uploaded to the current working directory so the import below succeeds.
"""
from FeatureExtractor import BORFeatureExtractor
from sklearn.preprocessing import MultiLabelBinarizer
import ast

In [6]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative paths used by later cells are resolved against it.
os.chdir("/content/drive/MyDrive/MovieRSystem")

In [7]:
# Load the dataset from CleanedData/dataset.csv (path relative to the current working directory).
dataset = pd.read_csv("CleanedData/dataset.csv")

In [8]:
# Names of the columns that are parsed with ast.literal_eval in the next cell.
columns = [
    'genres',
    'production_companies',
    'production_countries',
    'cast',
    'crew',
    'keywords',
]

In [9]:
"""
Parse each column listed in `columns` from its raw text form back into
Python data structures (list, dict, ...) using ast.literal_eval.
"""
for column in columns:
    # literal_eval only evaluates Python literals, so it is safe on stored text.
    parsed_values = dataset[column].map(ast.literal_eval)
    dataset[column] = parsed_values

In [10]:
"""
Feature extraction covering crew, cast, production_companies and genres,
followed by one-hot encoding of the genres column.
"""
fe = BORFeatureExtractor()
fe.fit(dataset)
# The return value is not used here; presumably extract() adds the new
# features to `dataset` in place — confirm in FeatureExtractor.
fe.extract(dataset)

# One indicator column per genre label, aligned to the dataset's index.
mlb = MultiLabelBinarizer()
onehot_genre = mlb.fit_transform(dataset['genres'])
genre_indicators = pd.DataFrame(
    onehot_genre,
    index=dataset.index,
    columns=mlb.classes_,
)
dataset = dataset.join(genre_indicators)

After feature extraction, we drop the following columns:

In [11]:
# Remove the listed columns from `dataset` in place.
columns_to_drop = [
    'overview',
    'title',
    'production_companies',
    'production_countries',
    'cast',
    'crew',
    'keywords',
    'original_language',
    'genres',
]
dataset.drop(columns=columns_to_drop, inplace=True)

In [12]:
# Display the feature table after the columns above were dropped.
dataset

Unnamed: 0,budget,id,runtime,revenue,vote_average,vote_count,CastsRank,NumLeadActors,HasTop50Actors,NumCrews,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,60000000,949,170.0,187436818.0,7.7,1886.0,14107,3,0,71,...,0,0,0,0,0,0,0,1,0,0
1,58000000,710,130.0,352194034.0,6.6,1194.0,6030,3,1,46,...,0,0,0,0,0,0,0,1,0,0
2,98000000,1408,119.0,10017322.0,5.7,137.0,3535,0,0,16,...,0,0,0,0,0,0,0,0,0,0
3,52000000,524,178.0,116112375.0,7.8,1343.0,6221,0,0,10,...,0,0,0,0,0,0,0,0,0,0
4,16500000,4584,136.0,135000000.0,7.2,364.0,4812,2,1,8,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2825,0,80831,121.0,0.0,6.5,2.0,1206,0,0,4,...,0,0,0,0,0,0,0,0,0,0
2826,0,3104,92.0,0.0,5.9,33.0,1206,0,0,11,...,0,1,0,0,0,1,0,0,0,0
2827,0,64197,97.0,0.0,6.0,5.0,449,0,0,4,...,0,0,0,0,1,0,0,0,0,0
2828,0,98604,91.0,0.0,4.6,6.0,2180,0,0,4,...,0,0,0,0,1,0,0,0,0,0


In [13]:
# Create the output folder for the extracted features (relative to the
# current working directory). exist_ok=True keeps the cell re-runnable:
# the shell `mkdir` it replaces reports an error when the folder already exists.
os.makedirs("FeatureExtracted", exist_ok=True)

Save the feature-extracted dataset to the `FeatureExtracted` folder.

In [14]:
# Write the feature table to FeatureExtracted/dataset.csv; index=False leaves the row index out of the file.
dataset.to_csv("FeatureExtracted/dataset.csv", index = False)