In [0]:
from google.colab import drive
import os

# Mount Google Drive and make the project folder the working directory so
# every later relative path (dataset.csv, dataset/...) resolves inside it.
# NOTE(review): both paths are relative, so re-running this cell after the
# chdir will look for them under the project folder -- confirm that is OK.
MOUNT_POINT = 'gdrive'
PROJECT_DIR = 'gdrive/My Drive/movie plot'

drive.mount(MOUNT_POINT)
os.chdir(PROJECT_DIR)

In [0]:
from gensim import models

# Pre-trained word2vec vectors in binary format, read from the working directory.
WORD2VEC_FILE = 'GoogleNews-vectors-negative300.bin.gz'

model = models.KeyedVectors.load_word2vec_format(WORD2VEC_FILE, binary=True)

In [0]:
import pandas as pd
import numpy as np

# Read the raw table and pull the text column and the target column out as
# NumPy arrays.
dataset = pd.read_csv("dataset.csv")

plots, labels = (dataset[column].to_numpy() for column in ("Plot", "Genre"))
print(plots.shape, labels.shape)

In [0]:
from sklearn.preprocessing import LabelEncoder

# Show the first ten labels before and after encoding, then the class list
# whose positions define the integer codes.
# NOTE: `labels` is overwritten in place, so re-running this cell alone would
# refit the encoder on already-encoded integers.
encoder = LabelEncoder()
print(labels[:10])
labels = encoder.fit_transform(labels)
print(labels[:10])
print(encoder.classes_)

In [0]:
from sklearn.model_selection import train_test_split

# Fixed seed so the splits written to disk later can be regenerated exactly.
SPLIT_SEED = 42

# First split: 80% train, 20% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    plots, labels, test_size=0.2, random_state=SPLIT_SEED)

# Second split: the held-out 20% is divided 80/20 into validation and test,
# i.e. roughly 16% validation and 4% test of the full data.
# NOTE(review): if an even validation/test split was intended, use test_size=0.5.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.2, random_state=SPLIT_SEED)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape,X_val.shape,y_val.shape)

In [0]:
def remove_stopwords(plots, stop_words=None):
  """Lower-case every plot and drop whitespace-delimited stopwords.

  Each plot is lower-cased, split on whitespace, filtered against the
  stopword set and re-joined with single spaces. Filtering whole tokens
  (instead of regex substitution) guarantees that:
    * repeated stopwords ("the the the") are all removed,
    * a stopword that is merely a suffix of the final word (e.g. "s" in
      "dogs") no longer truncates that word,
    * the work per plot is a single pass rather than several regex passes
      per stopword.

  Args:
    plots: iterable of plot strings.
    stop_words: optional iterable of stopwords. When omitted, the NLTK
      English stopword list is downloaded and used.

  Returns:
    A list with one cleaned string per input plot, in the same order.
  """
  if stop_words is None:
    # Imported lazily so callers that pass their own list do not need NLTK.
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop_words=stopwords.words('english')

  # Plots are lower-cased below, so compare against lower-cased stopwords.
  stop_words=frozenset(word.lower() for word in stop_words)
  print(len(stop_words))

  result=[]
  for plot in plots:
    tokens=plot.lower().split()
    result.append(" ".join(token for token in tokens if token not in stop_words))
  return result

# Clean every split with the same routine, in train -> val -> test order.
X_train, X_val, X_test = [
    remove_stopwords(split) for split in (X_train, X_val, X_test)
]


In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the vocabulary on the training split only; unseen or out-of-range
# words are mapped to the '<OOV>' token when texts are converted later.
tokenizer=Tokenizer(num_words=20001, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# word_index holds every word seen during fitting, so show only a short
# preview instead of dumping the whole dictionary into the cell output.
print(list(tokenizer.word_index.items())[:20])

In [0]:
# Number of distinct entries in the fitted word index.
print(len(tokenizer.word_index))

In [0]:
# Encode each training plot as a list of integer word indices.
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [0]:
# Distinct word indices that actually occur in the encoded training sequences.
vocabulary = {token_index for sequence in X_train_seq for token_index in sequence}


In [0]:
# How many distinct indices the training sequences use.
vocabulary_size = len(vocabulary)
print(vocabulary_size)

In [0]:
# Reverse lookup: integer index -> word, built by inverting the tokenizer's word_index.
index2word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

In [0]:
import json
import os

# Make sure the output folder exists; open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
os.makedirs('dataset', exist_ok=True)

# Persist the word -> index mapping as JSON.
with open('dataset/word2index','w') as f:
  json.dump(tokenizer.word_index,f)


In [0]:
# Persist the index -> word mapping as JSON.
# NOTE: JSON object keys are always strings, so the integer indices come
# back as str keys when this file is loaded again.
with open('dataset/index2word', 'w') as out_file:
  out_file.write(json.dumps(index2word))

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Force every training sequence to length 100: short ones are padded at the
# end ('post'), long ones lose tokens from the start ('pre').
X_train_padded = pad_sequences(
    X_train_seq,
    maxlen=100,
    padding='post',
    truncating='pre',
)
print(X_train_padded.shape)

In [0]:
# Write the padded training inputs and their labels to disk.
for target, array in (
    ("dataset/X_train.npy", X_train_padded),
    ("dataset/y_train.npy", y_train),
):
  np.save(target, array)

In [0]:
# Validation split: encode with the tokenizer fitted on the training data,
# pad/truncate to length 100 exactly like the training set, then save.
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(
    X_val_seq,
    maxlen=100,
    padding='post',
    truncating='pre',
)
print(X_val_padded.shape)
for target, array in (
    ("dataset/X_val.npy", X_val_padded),
    ("dataset/y_val.npy", y_val),
):
  np.save(target, array)

In [0]:
# Test split: encode with the tokenizer fitted on the training data,
# pad/truncate to length 100 exactly like the training set, then save.
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(
    X_test_seq,
    maxlen=100,
    padding='post',
    truncating='pre',
)
print(X_test_padded.shape)
for target, array in (
    ("dataset/X_test.npy", X_test_padded),
    ("dataset/y_test.npy", y_test),
):
  np.save(target, array)

In [0]:
# One 300-dimensional row per token index 0..20000, initialised to zero.
embedding_matrix = np.zeros((20001, 300), dtype=float)

In [0]:
# Sanity check: (rows, embedding dimension).
matrix_shape = embedding_matrix.shape
print(matrix_shape)

In [0]:
from gensim import models

# The same vectors are already loaded at the top of the notebook; reading
# the archive again is slow and memory-hungry, so only load it here when
# `model` is not defined (e.g. this cell is run on a fresh kernel).
if 'model' not in globals():
  model = models.KeyedVectors.load_word2vec_format(
      'GoogleNews-vectors-negative300.bin.gz', binary=True)

In [0]:
# Fill embedding_matrix row by row for every token index used in training:
# copy the pre-trained vector when the word is known to the word2vec model,
# otherwise draw a random vector uniformly from [-bound, bound).
# Rows whose index is not in `vocabulary` are left untouched.
have_embedding=0
no_embedding=0
bound = 1
# Dedicated seeded generator + sorted iteration => the random fallback rows
# (and therefore the saved matrix) are identical on every run.
rng = np.random.RandomState(42)
for index in sorted(vocabulary):
  word=index2word[index]
  # Explicit membership test instead of a bare `except:`; the bare except
  # would also hide unrelated failures (e.g. an API change in gensim) and
  # silently turn every row into random noise.
  if word in model:
    embedding_matrix[index,:]=np.asarray(model[word])
    have_embedding+=1
  else:
    embedding_matrix[index,:]=rng.uniform(-bound, bound, 300)
    no_embedding+=1

print("{}words have embedding".format(have_embedding))
print("{}words have no embedding".format(no_embedding))
np.save("dataset/embedding_matrix.npy",embedding_matrix)

In [0]:
# Shape of the first five rows of the embedding matrix.
first_rows = embedding_matrix[:5]
print(first_rows.shape)