<a href="https://colab.research.google.com/github/mrcyme/Gru-classifier-for-movie-genres-based-on-synopsis/blob/orphan_branch/Predict_movie_genres.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# The dataset CSVs read further down live under this Google Drive mount point.
DRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
"""Train and evaluate recurrent (GRU/LSTM) classifiers that predict movie genres from synopsis text."""

import io
import json
import re

import nltk
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, Response
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, Adagrad,RMSprop
from tensorflow.keras.preprocessing import text, sequence

nltk.download('stopwords')
from nltk.corpus import stopwords

# Flask application object (no routes are registered in the visible cells).
app = Flask(__name__)

# --- Text vectorisation ---
MAX_WORDS = 80000       # vocabulary cap for the Tokenizer and input size of the Embedding layer
MAX_VECTOR_LEN = 200    # `maxlen` used when padding the token sequences
EMB_DIM = 100           # embedding vector width

# --- Labels ---
NUMBER_OF_GENRES = 19   # width of the softmax output layer
MULTILABEL_BINARIZER = MultiLabelBinarizer()  # shared genre <-> indicator-column encoder

# --- Network / optimiser hyper-parameters ---
HIDDEN_DIM = 128
DROUPOUT_RATE = 0.5     # (sic) spelling kept: the name is referenced by other cells
LEARNING_RATE = 0.002

# --- Training loop ---
BATCH_SIZE = 128
N_EPOCHS = 10
# No explicit `patience` is given, so the Keras default applies; the best
# weights seen on `val_loss` are restored when training stops.
EARLY_STOPPING = EarlyStopping(
    monitor='val_loss',
    mode='min',
    restore_best_weights=True,
)


def clean_text(text):
    """Normalise a raw synopsis string before tokenisation.

    Processing steps, in order:
      1. remove apostrophes (so "cat's" becomes "cats"),
      2. replace every character that is not an ASCII letter with a space,
      3. lowercase,
      4. drop NLTK English stop words,
      5. re-join the remaining words with single spaces.

    The stop-word set is loaded from NLTK once and cached on the function
    object, because this function is applied to every row of the dataset and
    rebuilding the set per call is pure overhead.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    stop_words = getattr(clean_text, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        clean_text._stop_words = stop_words

    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    return ' '.join(w for w in text.split() if w not in stop_words)


def get_embedded_synopsis(df,tokenizer=None):
    """Convert the ``synopsis`` column of ``df`` into padded integer sequences.

    Each synopsis is cleaned with ``clean_text``. When no ``tokenizer`` is
    supplied, a new Keras ``Tokenizer`` (capped at ``MAX_WORDS``) is fitted on
    the cleaned texts and written to ``tokenizer.json`` in the working
    directory. Note that the tokenizer's own JSON string is passed through
    ``json.dumps`` once more, so the file contains a JSON-encoded string.

    Args:
        df: frame with a ``synopsis`` column.
        tokenizer: an already fitted tokenizer to reuse; if falsy, one is
            fitted on ``df`` and saved.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=MAX_VECTOR_LEN``.
    """
    cleaned_texts = df['synopsis'].apply(clean_text).to_numpy()

    if not tokenizer:
        tokenizer = text.Tokenizer(num_words=MAX_WORDS)
        tokenizer.fit_on_texts(cleaned_texts)
        with open('tokenizer.json', 'w', encoding='utf-8') as handle:
            handle.write(json.dumps(tokenizer.to_json(), ensure_ascii=False))

    token_ids = tokenizer.texts_to_sequences(cleaned_texts)
    return sequence.pad_sequences(token_ids, maxlen=MAX_VECTOR_LEN)


def get_one_hot_genres(df, fit=True):
    """Encode the ``genres`` column as row-normalised multi-hot vectors.

    Each entry of ``df['genres']`` is split on single spaces into a list of
    genre labels, encoded with the shared ``MULTILABEL_BINARIZER`` and divided
    by the number of active labels in the row, so every non-empty row sums
    to 1 (matching the softmax / categorical cross-entropy setup of the models).

    Args:
        df: frame with a space-separated ``genres`` string column.
        fit: when True (the default, same as the historical behaviour) the
            shared binarizer is refitted on ``df``. Pass False to encode with
            the already fitted binarizer, e.g. for held-out data, so the
            column order stays the one the model was trained with.

    Returns:
        A float array with one row per input row and one column per genre
        class. Rows without any known label are returned as all zeros.
    """
    genres = df['genres'].apply(lambda x: x.split(" ")).to_numpy()
    if fit:
        indicators = MULTILABEL_BINARIZER.fit_transform(genres)
    else:
        indicators = MULTILABEL_BINARIZER.transform(genres)

    # Vectorised replacement for the per-element Python loop; `where` guards
    # against 0/0 for rows whose labels were all unknown to the binarizer.
    label_counts = indicators.sum(axis=1, keepdims=True)
    return np.divide(
        indicators,
        label_counts,
        out=np.zeros(indicators.shape, dtype=float),
        where=label_counts > 0,
    )


def probas_to_top_five(y_prob, k=5):
    """Return the genre labels with the highest predicted probability.

    The previous implementation built a hard-coded 19-wide one-hot vector per
    index and sent it through ``inverse_transform``; looking the index up in
    the fitted binarizer's ``classes_`` gives the same label without the
    magic number or the round trip.

    Args:
        y_prob: 1-D sequence of per-genre scores, ordered like
            ``MULTILABEL_BINARIZER.classes_``.
        k: how many labels to return (defaults to five, as the name implies).

    Returns:
        A list of at most ``k`` genre labels, most probable first.
    """
    # Negating the scores makes argsort yield indices in descending order.
    ranked_indices = np.argsort(-np.asarray(y_prob))[:k]
    return [MULTILABEL_BINARIZER.classes_[idx] for idx in ranked_indices]


def generate_lstm(input_length=MAX_VECTOR_LEN, dropout_rate=DROUPOUT_RATE, hidden_dim=HIDDEN_DIM):
    """Build and compile a single-layer LSTM genre classifier.

    Architecture: Embedding(MAX_WORDS, EMB_DIM) -> LSTM(hidden_dim) ->
    Dense(NUMBER_OF_GENRES, softmax). Compiled with RMSprop at
    ``LEARNING_RATE``, categorical cross-entropy loss and categorical accuracy.

    Args:
        input_length: value forwarded to the Embedding layer's ``input_length``.
        dropout_rate: ``dropout`` argument of the LSTM layer.
        hidden_dim: number of LSTM units.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Embedding(MAX_WORDS, EMB_DIM, input_length=input_length),
        LSTM(hidden_dim, dropout=dropout_rate, return_sequences=False),
        Dense(NUMBER_OF_GENRES, activation='softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer=RMSprop(learning_rate=LEARNING_RATE),
        metrics=['categorical_accuracy'],
    )
    return network

def generate_gru(input_length=MAX_VECTOR_LEN,dropout_rate=DROUPOUT_RATE,hidden_dim=HIDDEN_DIM):
    """Build and compile a single-layer GRU genre classifier.

    Architecture: Embedding(MAX_WORDS, EMB_DIM) -> GRU(hidden_dim) ->
    Dense(NUMBER_OF_GENRES, softmax). Compiled with Adam at ``LEARNING_RATE``,
    categorical cross-entropy loss and categorical accuracy.

    Args:
        input_length: value forwarded to the Embedding layer's ``input_length``.
        dropout_rate: ``dropout`` argument of the GRU layer.
        hidden_dim: number of GRU units.

    Returns:
        The compiled ``Sequential`` model.
    """
    stack = (
        Embedding(MAX_WORDS, EMB_DIM, input_length=input_length),
        GRU(hidden_dim, dropout=dropout_rate, return_sequences=False),
        Dense(NUMBER_OF_GENRES, activation='softmax'),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=LEARNING_RATE),
        metrics=['categorical_accuracy'],
    )
    return network

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data preprocessing and model training

In [None]:
# Fixed seed so the train/held-out partition is identical after a kernel
# restart. (Only the split is seeded here; Keras weight initialisation is not.)
SPLIT_SEED = 42

df_movies = pd.read_csv('/content/gdrive/MyDrive/Radix data/train.csv')

# Row-wise random split: roughly 80% of the rows go to training, the rest are held out.
msk = np.random.RandomState(SPLIT_SEED).rand(len(df_movies)) < 0.8
df_movies_train = df_movies[msk]
df_movies_test = df_movies[~msk]

# Fits the tokenizer and the genre binarizer on the training rows only.
x_train = get_embedded_synopsis(df_movies_train)
y_train = get_one_hot_genres(df_movies_train)

model = generate_gru()
# Keep the History object in a variable instead of leaking its repr as cell output.
history = model.fit(x_train,
                    y_train,
                    batch_size=BATCH_SIZE,
                    validation_split=0.1,
                    epochs=N_EPOCHS,
                    callbacks=[EARLY_STOPPING])



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<tensorflow.python.keras.callbacks.History at 0x7f9f9cbb4d10>

In [None]:
# tokenizer.json was written as json.dumps(tokenizer.to_json()), i.e. a JSON
# string wrapping the tokenizer's own JSON, hence json.load before
# tokenizer_from_json. It was written as UTF-8, so read it back the same way.
with open('tokenizer.json', encoding='utf-8') as f:
    tokenizer = text.tokenizer_from_json(json.load(f))
x_test = get_embedded_synopsis(df_movies_test,tokenizer=tokenizer)

# Encode the held-out genres with the binarizer as fitted on the training
# split. Refitting it here (what get_one_hot_genres does by default) could
# change the class/column order that `model` and probas_to_top_five rely on
# whenever the held-out rows do not contain exactly the same genre set.
genres_test = df_movies_test['genres'].apply(lambda x: x.split(" ")).to_numpy()
y_test_indicator = MULTILABEL_BINARIZER.transform(genres_test)
label_counts = y_test_indicator.sum(axis=1, keepdims=True)
# Same row normalisation as the training targets; rows with no known genre stay all-zero.
y_test = np.divide(y_test_indicator,
                   label_counts,
                   out=np.zeros(y_test_indicator.shape, dtype=float),
                   where=label_counts > 0)

prediction = model.predict(x_test)
pred_to_five = [probas_to_top_five(y) for y in prediction]

In [None]:
def average_precision_score(one_hot_actual, predicted_proba, k=5):
    """Average precision at ``k`` for one sample, from encoded targets.

    The true genres are recovered from ``one_hot_actual`` (any entry > 0 counts
    as an active label) via ``MULTILABEL_BINARIZER.inverse_transform``; the
    predicted genres come from ``probas_to_top_five``.

    Unlike the previous version, the ranked predictions are truncated to ``k``
    before scoring. Without that, a ``k`` below five could produce scores
    greater than 1, because hits beyond rank ``k`` were still counted while
    the denominator was capped at ``k``.

    Args:
        one_hot_actual: 1-D target vector for the sample (binary or normalised).
        predicted_proba: 1-D vector of predicted per-genre scores.
        k: number of top predictions to evaluate.

    Returns:
        1.0 when the sample has no true genre, 0.0 when ``k`` is not positive,
        otherwise the sum of precision-at-hit values over the first ``k``
        predictions divided by ``min(number of true genres, k)``.
    """
    indicator = (np.asarray(one_hot_actual) > 0).astype(int)
    actual_list = list(MULTILABEL_BINARIZER.inverse_transform(np.expand_dims(indicator, axis=0))[0])
    if not actual_list:
        return 1.0
    if k <= 0:
        return 0.0

    predicted_list = probas_to_top_five(predicted_proba)[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted_list):
        # A label repeated in the ranking is only credited once.
        if p in actual_list and p not in predicted_list[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual_list), k)

# Mean of the per-sample average precision over the held-out predictions.
per_sample_ap = [
    average_precision_score(y_test[idx], prediction[idx])
    for idx in range(len(prediction))
]
score = np.mean(per_sample_ap)
print(score)

0.6564793593814717


In [None]:
def apk(actual, predicted, k=5):
    """Average precision at ``k`` between a set of true labels and a ranking.

    Only the first ``k`` entries of ``predicted`` are scored. The previous
    version iterated over the whole ranking while dividing by
    ``min(len(actual), k)``, so a ranking longer than ``k`` could yield a
    score above 1.

    Args:
        actual: collection of true labels (order irrelevant).
        predicted: ranked sequence of predicted labels, best first.
        k: number of top predictions to evaluate.

    Returns:
        1.0 when ``actual`` is empty, 0.0 when ``k`` is not positive, otherwise
        the sum of precision-at-hit values over the first ``k`` predictions
        divided by ``min(len(actual), k)``.
    """
    # Explicit length check: `not actual` is ambiguous for numpy arrays.
    if actual is None or len(actual) == 0:
        return 1.0
    if k <= 0:
        return 0.0

    top_k = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(top_k):
        # A label repeated in the ranking is only credited once.
        if p in actual and p not in top_k[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)
        
# True genre lists for the held-out rows, taken straight from the raw column.
y_test_array = df_movies_test['genres'].apply(lambda x: x.split(" ")).to_numpy()
apk_per_movie = [
    apk(y_test_array[idx], pred_to_five[idx])
    for idx in range(len(pred_to_five))
]
score = np.mean(apk_per_movie)
print(score)

0.38333737662298406


In [None]:
df_movies = pd.read_csv('/content/gdrive/MyDrive/Radix data/test.csv')

# Reuse the vocabulary saved during training. Calling get_embedded_synopsis
# without a tokenizer would fit a brand-new one on the test synopses (word
# indices would no longer match the trained Embedding layer) and overwrite
# tokenizer.json on disk.
with open('tokenizer.json', encoding='utf-8') as f:
    tokenizer = text.tokenizer_from_json(json.load(f))
x_test = get_embedded_synopsis(df_movies, tokenizer=tokenizer)


In [None]:
# Build a genre -> integer index lookup from every distinct genre found in the
# `clean_genres` column (each cell is iterated, so it is expected to hold an
# iterable of genres). Indices follow set iteration order, which is arbitrary.
# NOTE(review): no visible cell creates a `clean_genres` column, and
# `df_movies` was last loaded from test.csv -- confirm this cell still runs on
# a fresh kernel.
genre_mapping = {g:i for i,g in enumerate(set(x for g in df_movies['clean_genres'] for x in g))}
# Translate each row's genres into their integer indices.
# NOTE(review): `ytrain` is not defined in any visible cell -- presumably a
# Series of genre lists from an earlier notebook version; verify or remove.
to_int = [[genre_mapping[x] for x in y] for y in  ytrain.to_numpy()]

In [None]:
def generate_gru(input_length=MAX_VECTOR_LEN,dropout_rate=DROUPOUT_RATE,hidden_dim=HIDDEN_DIM):
    """Build and compile a two-layer (stacked) GRU genre classifier.

    NOTE(review): this redefines ``generate_gru`` from the earlier helper cell;
    once this cell has run, the single-layer variant is shadowed.

    Architecture: Embedding(MAX_WORDS, EMB_DIM) -> GRU(hidden_dim, returning
    the full sequence) -> GRU(hidden_dim) -> Dense(NUMBER_OF_GENRES, softmax).
    Compiled with Adam at ``LEARNING_RATE``, categorical cross-entropy loss
    and categorical accuracy.

    Args:
        input_length: value forwarded to the Embedding layer's ``input_length``.
        dropout_rate: ``dropout`` argument of both GRU layers.
        hidden_dim: number of units in each GRU layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Embedding(MAX_WORDS, EMB_DIM , input_length=input_length),
        # The first GRU must emit the whole sequence so the second one can consume it.
        GRU(hidden_dim, dropout=dropout_rate, return_sequences=True),
        GRU(hidden_dim, dropout=dropout_rate, return_sequences=False),
        Dense(NUMBER_OF_GENRES, activation='softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=LEARNING_RATE),
        metrics=['categorical_accuracy'],
    )
    return network

In [None]:
# Experiment: stacked GRU with separate Dropout layers and a higher dropout rate.
#
# The original cell referenced `max_words`, `emb_dim`, `xtrain_pad` and
# `ytrain_one_hot`, none of which are defined in this notebook, so it failed on
# a fresh kernel. They are mapped to the names the notebook does define
# (MAX_WORDS, EMB_DIM, x_train, y_train).
#
# These assignments rebind the module-level constants. Default arguments of the
# generate_* helpers were captured when those functions were defined and are
# not affected by the new DROUPOUT_RATE.
DROUPOUT_RATE = 0.8
HIDDEN_DIM = 128
LEARNING_RATE = 0.002

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=1)

model_gru = Sequential()
model_gru.add(Embedding(MAX_WORDS, EMB_DIM, input_length=x_train.shape[1]))
model_gru.add(GRU(HIDDEN_DIM, return_sequences=True))
model_gru.add(Dropout(DROUPOUT_RATE))
model_gru.add(GRU(HIDDEN_DIM, return_sequences=False))
model_gru.add(Dropout(DROUPOUT_RATE))
model_gru.add(Dense(NUMBER_OF_GENRES, activation='softmax'))
optimizer = Adam(learning_rate=LEARNING_RATE)
model_gru.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])

batch_size = 256
epochs = 10
history_gru = model_gru.fit(x_train,
                            y_train,
                            validation_split=0.1,
                            batch_size=batch_size,
                            epochs=epochs,
                            callbacks=[es])