# Automatic assignment of genres from movie synopsis using supervised machine learning

## 1. Import libraries and load data

In [46]:
# Packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import json
from textblob import TextBlob, Word
import nltk
import re
import csv
import matplotlib.pyplot as plt 
#import seaborn as sns
#from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [64]:
# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
# Performance metric
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score

In [2]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

## 2. DATA PREPARATION 

In [17]:
# PorterStemmer was used without ever being imported, which raised a NameError.
from nltk.stem import PorterStemmer

ps = PorterStemmer()  # Porter stemmer (it stems words; it does not lemmatize them)

# Stop words are removed during cleaning because they may hurt the model's performance.
# Download the NLTK stop-word corpus (reported as a no-op when already up to date).
nltk.download('stopwords')

from nltk.corpus import stopwords

# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Ordered (pattern, replacement) rules that expand English contractions.
# Order matters: "'scuse" must be handled before the generic "'s" rule,
# otherwise "'s" consumes its prefix and the "'scuse" rule can never match.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'\n", " "),  # apostrophe immediately followed by a line break
)


# function for text cleaning
def preprocess_text(text, stopword_set=None):
    """Clean a raw synopsis into lowercase, letters-only, stop-word-free text.

    Steps: lowercase, expand common contractions, drop remaining apostrophes,
    replace every non-letter character with a space, then remove stop words
    and collapse the whitespace.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing CSV cell) is treated as empty.
        stopword_set: Optional collection of words to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single space-separated string ("" for empty or
        non-string input).
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash on .lower().
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Remove the apostrophes left over after contraction expansion.
    text = text.replace("'", "")
    # Keep letters only; everything else becomes a separator.
    text = re.sub("[^a-zA-Z]", " ", text)

    # split() also discards leading, trailing and repeated whitespace.
    return ' '.join(w for w in text.split() if w not in stopword_set)

# Store the cleaned synopsis in a new 'clean_plot' column on both datasets.
train['clean_plot'] = train['synopsis'].apply(preprocess_text)
test['clean_plot'] = test['synopsis'].apply(preprocess_text)

In [40]:
def lemma(text):
    """Lemmatize every word of an already-cleaned text.

    Each word is lemmatized with a part-of-speech code derived from the first
    character of the tag that TextBlob assigns to it; tags whose first
    character is not in the table are treated as nouns.

    Args:
        text: The cleaned text to lemmatize.

    Returns:
        The lemmatized words joined by single spaces.
    """
    # First character of the TextBlob tag -> POS code passed to lemmatize().
    pos_codes = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}

    lemmas = []
    for word, pos_tag in TextBlob(text).tags:
        lemmas.append(word.lemmatize(pos_codes.get(pos_tag[0], 'n')))
    return ' '.join(lemmas)

In [41]:
# Lemmatize the cleaned synopsis of both datasets into a new 'lemma' column.
train['lemma'] = train['clean_plot'].apply(lemma)
test['lemma'] = test['clean_plot'].apply(lemma)
# Notebook display: preview the first rows of the training set.
train.head()

Unnamed: 0,movie_id,year,synopsis,genres,clean_plot,lemma,lemmalist
0,30924,2005,Cruel But Necessary is the story of Betty Muns...,Drama,cruel necessary story betty munson strange jou...,cruel necessary story betty munson strange jou...,cruel necessary story betty munson strange jou...
1,34841,2012,"Yorkshire, 1974, the Maynard family moves into...",Drama Horror Thriller,yorkshire maynard family moves dream house dre...,yorkshire maynard family move dream house drea...,yorkshire maynard family move dream house drea...
2,23408,2017,When a renowned architecture scholar falls sud...,Drama,renowned architecture scholar falls suddenly i...,renowned architecture scholar fall suddenly il...,renowned architecture scholar fall suddenly il...
3,39470,1996,The story dealt with Lord Rama and his retalia...,Children Drama,story dealt lord rama retaliation ravana chara...,story dealt lord rama retaliation ravana chara...,story dealt lord rama retaliation ravana chara...
4,7108,2003,A Thai playboy cons a girl into bed and then l...,Comedy Drama Horror Thriller,thai playboy cons girl bed leaves finding preg...,thai playboy con girl bed leave find pregnant ...,thai playboy con girl bed leave find pregnant ...


## 3. Variables preparation 

In [42]:
# Extract the model input (lemmatized synopsis) and the target (genres) from the training set.
X, y = train['lemma'], train['genres']
    

### 3.1 Target variable one hot encoding

In [49]:
# One-hot encode the genres: one 0/1 column per genre found in the training labels.

one_hot = MultiLabelBinarizer()  # encoder for the genre tags
genre_lists = y.str.split(' ')  # each genres string holds space-separated genre names
y_onehot = one_hot.fit_transform(genre_lists)
# Wrap the encoded matrix in a DataFrame whose columns are the genre names.
y_bin = pd.DataFrame(data=y_onehot, columns=one_hot.classes_)

In [50]:
# Sanity check: print the (rows, columns) shape of the encoded target, then preview it.
print(y_bin.shape)
y_bin.head()


(36518, 19)


Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0


In [53]:
# TF-IDF feature extractor over word unigrams and bigrams.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
    max_features=10000,
    use_idf=True,
    norm=None,
)

In [54]:
# Fit the TF-IDF vocabulary on the lemmatized synopses and vectorize them.
TF_IDF = tfidf_vectorizer.fit_transform(X)
# Densify with .toarray() (a plain ndarray) instead of .todense(): the latter
# returns a numpy.matrix, which NumPy discourages and recent scikit-learn
# versions reject as estimator input.
TF_IDF_dense = TF_IDF.toarray()

In [55]:
# Hold out 20% of the TF-IDF features and one-hot targets for validation (fixed seed).
X_tfidf_train, X_tfidf_val, y_train_bin, y_val_bin = train_test_split(
    TF_IDF_dense,
    y_bin,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

## 4. The Model

In [62]:
# Timer helpers used to measure how long an algorithm takes to run.
# time.perf_counter() is used instead of time.time(): it is monotonic, so the
# measured duration cannot be distorted by system clock adjustments.
_start_time = time.perf_counter()


def process_time_starts():
    """Restart the timer; ``time_elapsed()`` measures from this moment."""
    global _start_time
    _start_time = time.perf_counter()


def time_elapsed():
    """Print the time elapsed since the timer was last started.

    The duration is rounded to whole seconds and printed as
    ``The process took: {h}hour:{m}min:{s}sec``. Nothing is returned.
    """
    total_sec = round(time.perf_counter() - _start_time)
    t_min, t_sec = divmod(total_sec, 60)
    t_hour, t_min = divmod(t_min, 60)
    print('The process took: {}hour:{}min:{}sec'.format(t_hour, t_min, t_sec))

In [70]:
def print_score(y_pred, clf, y_true=None):
    """Print multilabel evaluation metrics for a set of predictions.

    Args:
        y_pred: Predicted binary label-indicator matrix.
        clf: The classifier that produced ``y_pred``; only its class name is
            printed.
        y_true: Ground-truth label-indicator matrix. Defaults to the
            module-level validation targets ``y_val_bin``.

    Prints the classifier name, the support-weighted Jaccard score, the
    Hamming loss and the accuracy score, followed by a separator line.
    """
    if y_true is None:
        y_true = y_val_bin

    print("Clf: ", clf.__class__.__name__)
    # Jaccard similarity per label, averaged with average='weighted'.
    print("Jaccard score: {}".format(jaccard_score(y_true, y_pred, average='weighted')))
    # Fraction of individual labels predicted wrongly; arguments are passed in
    # the documented (y_true, y_pred) order like the other metrics.
    print("Hamming loss: {}".format(hamming_loss(y_true, y_pred)))
    print("Accuracy score: {}".format(accuracy_score(y_true, y_pred)))
    print("---")
    

## 5. The Prediction

In [None]:
# Vectorize the lemmatized test synopses with the vectorizer fitted on the
# training data, then predict their genre indicators.
# NOTE(review): the original passed `X_test`, which is never defined in this
# notebook; test['lemma'] is the test-set counterpart of the training input X.
# `clf` must be the fitted classifier from the modelling step; confirm it is
# defined before running this cell.
y_pred = clf.predict(tfidf_vectorizer.transform(test['lemma']))

In [None]:
# Notebook display: inspect the dimensions of the prediction output.
y_pred.shape

In [None]:
# Map the 0/1 prediction matrix back to tuples of genre names using the
# binarizer fitted on the training labels. It is named `one_hot` in this
# notebook; the original referenced an undefined `multilabel_binarizer`.
pred_gen = one_hot.inverse_transform(y_pred)
print(len(pred_gen))

In [None]:
# Pair each test movie id with its predicted genres.
# The test DataFrame is named `test` in this notebook; the original referenced
# an undefined `x_test`.
submission = pd.DataFrame(data={'movie_id': test.movie_id, 'predicted_genres': pred_gen})

In [None]:
# Turn each tuple of genre names into one comma-separated string.
# Assigning the whole column at once avoids the chained-indexing assignment
# (`submission.predicted_genres[i] = ...`), which pandas warns about and which
# may write to a temporary copy instead of the DataFrame.
submission['predicted_genres'] = submission['predicted_genres'].apply(','.join)

In [None]:
# Replace the comma separators with spaces.
# Assigning the whole column at once avoids the chained-indexing assignment
# (`submission.predicted_genres[i] = ...`), which pandas warns about and which
# may write to a temporary copy instead of the DataFrame.
submission['predicted_genres'] = submission['predicted_genres'].apply(
    lambda genres: genres.replace(",", " ")
)

In [None]:
# Write the submission table to 'submission.csv' without the DataFrame index column.
submission.to_csv('submission.csv',index=False)