In [11]:
import json # to open json files which contain the movie infos
import os # to find the files that are going to be opened
import pandas as pd # to do dataframe operations

import nltk # to do word preprocessing
from nltk.corpus import stopwords # to remove stopwords
from nltk.tokenize import word_tokenize # to tokenize words
from nltk.stem import WordNetLemmatizer #to lemmatize words
import string # to do all other string operations

import numpy as np # import numpy to do matrix operations with integers faster

from tensorflow.keras.preprocessing.text import Tokenizer # to tokenize text to later use as neural network inputs
from tensorflow.keras.preprocessing.sequence import pad_sequences #used for padding squences rapidly
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from time import time
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, Convolution1D, LSTM, Conv2D, MaxPooling1D 
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import layers
#used for creating and training neural networks

from sklearn.metrics import f1_score,precision_score,recall_score # to calculate different success rates

# Fetch every NLTK resource the preprocessing functions below rely on:
#   'stopwords' -> stopwords.words('english')
#   'punkt'     -> word_tokenize
#   'wordnet'   -> WordNetLemmatizer
#   'words'     -> nltk.corpus.words (the English vocabulary filter)
# Without 'wordnet' and 'words' a fresh environment fails with LookupError.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'words'):
    nltk.download(nltk_resource)

def find_content(line, which_quote): #used for finding movie genres and synopsis in files
    """Return the text between the ``which_quote``-th double quote and the last one.

    Args:
        line: String to scan.
        which_quote: 1-based ordinal of the double quote that opens the content.

    Returns:
        The slice of ``line`` strictly between the opening quote and the last
        double quote in the line.

    Raises:
        IndexError: If ``line`` holds fewer than ``which_quote`` double quotes.
    """
    start = 0
    while True:
        if line[start] == '"':
            if which_quote == 1:
                break # reached the requested opening quote
            which_quote -= 1 # an earlier quote: count it and keep scanning
        start += 1

    # Walk backwards from the end of the line to the closing quote.
    end = -1
    while line[end] != '"':
        end -= 1
    return line[start + 1:end]
        

def add_to_dataframe(json_path, data_dir="data"): # used for adding data in each file to a dataframe
    """Load one movie JSON file and return its cleaned synopses with their genres.

    The file must hold a top-level ``'data'`` list of movie dicts. Movies that
    lack an ``'overview'`` or a ``'genres'`` key are skipped.

    Args:
        json_path: File name of the JSON file, relative to ``data_dir``.
        data_dir: Directory containing the file (defaults to ``"data"``).

    Returns:
        A DataFrame with columns ``'descs'`` (list of cleaned tokens per movie)
        and ``'genres'`` (list of genres per movie).
    """
    descriptions = [] #array of descriptions
    genres = [] #array of genres
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(os.path.join(data_dir, json_path), "r", encoding="utf-8") as movie_infos:
        data = json.load(movie_infos) # loads the data in the file
        for p in data['data']: # for each movie info in the file
            if "overview" in p and "genres" in p: # only movies that carry both keys
                genres.append(list(p.get("genres"))) # genres of this movie as their own list
                descriptions.append(p.get("overview")) # synopsis of this movie
        print("reading " + json_path)

    lemmatizer = WordNetLemmatizer() # new lemmatizer object
    stop_words = set(stopwords.words('english')) # English stop words to drop
    eng_words = set(nltk.corpus.words.words()) # English vocabulary used as a whitelist
    table = str.maketrans("", "", string.punctuation) # deletes punctuation characters

    for i, description in enumerate(descriptions):
        kept = []
        for token in word_tokenize(str(description)):
            if not token.isalpha():
                continue
            lemma = lemmatizer.lemmatize(token.lower()) # lemmatize once per token
            if lemma not in stop_words and lemma in eng_words:
                kept.append(lemma)
        # The kept lemmas are rendered to text, stripped of punctuation and
        # tokenized again, exactly as before, so the stored tokens are unchanged.
        descriptions[i] = word_tokenize(str(kept).translate(table))

    df2 = pd.DataFrame(zip(descriptions, genres), columns = ['descs', 'genres'])
    return df2 #returns the new dataframe

def pad_text(encoded_syn, maxlen): # function used for padding a tokenized words array
    """Append zeros to ``encoded_syn`` in place until it holds ``maxlen`` items.

    Lists that already have ``maxlen`` or more items are left untouched.
    The same list object that was passed in is returned.
    """
    shortfall = maxlen - len(encoded_syn)
    if shortfall > 0:
        encoded_syn.extend([0] * shortfall) # fill the tail with zeros
    return encoded_syn

def remove_spaces(x): # strips leading whitespace from each item and joins them with commas
    """Return the items of ``x`` joined by ``","`` after left-stripping each one.

    Only leading whitespace is removed; trailing and inner whitespace is kept.
    """
    stripped_items = [entry.lstrip() for entry in x]
    return ",".join(stripped_items)

def genre_vectorize(genre_to_be_vectorized, all_genres=None): # vectorizes genres of each movie with 0s and 1s
    """Turn comma-joined genre strings into a multi-hot label matrix.

    Args:
        genre_to_be_vectorized: Iterable (e.g. a pandas Series) of strings, each
            holding one movie's genres separated by ``","``.
        all_genres: Ordered genre labels; column ``j`` of the result corresponds
            to ``all_genres[j]``. Defaults to the notebook-level ``genres_array``.

    Returns:
        ``numpy.ndarray`` of dtype ``int64`` and shape ``(n_movies, n_genres)``
        with 1 where the movie has the genre and 0 otherwise. Empty input yields
        shape ``(0, n_genres)``.
    """
    if all_genres is None:
        all_genres = genres_array # fall back to the label list built by the notebook
    labels = list(all_genres)

    rows = []
    for genre_string in genre_to_be_vectorized: # one comma-joined string per movie
        present = set(genre_string.split(",")) # genres of this movie
        rows.append([1 if genre in present else 0 for genre in labels])

    # reshape keeps the result two-dimensional even when there are no rows
    return np.array(rows, dtype=np.int64).reshape(len(rows), len(labels))

def f1micro(y_true, y_pred, threshold=0.3): # used for calculating f1 score
    """Micro-averaged F1 score wrapped as a TensorFlow op.

    Args:
        y_true: Tensor of 0/1 ground-truth labels.
        y_pred: Tensor of predicted scores; values ``>= threshold`` count as 1.
        threshold: Cut-off used to binarize ``y_pred`` (0.3 is the value used
            elsewhere in this notebook).

    Returns:
        A scalar ``tf.double`` tensor holding the micro-averaged F1 score.
    """
    import tensorflow as tf # local import: the notebook never binds the name `tf`

    def _micro_f1(true_values, predicted_scores):
        # f1_score needs hard 0/1 labels, not continuous scores.
        hard_predictions = (predicted_scores >= threshold).astype(np.int64)
        score = f1_score(true_values.astype(np.int64), hard_predictions, average='micro')
        return np.float64(score)

    # Pass the callable and its inputs; the score is computed when the op runs.
    return tf.numpy_function(_micro_f1, [y_true, y_pred], tf.double)

def preprocess(syn): # preprocesses given text to predict outcome
    """Clean a synopsis into the token list used as model input.

    Keeps only alphabetic tokens whose lower-cased lemma is not an English stop
    word and appears in NLTK's English word list.

    Args:
        syn: The synopsis; it is converted with ``str()`` before tokenizing.

    Returns:
        A list of lower-cased, lemmatized tokens.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) # English stop words to drop
    eng_words = set(nltk.corpus.words.words()) # English vocabulary used as a whitelist
    table = str.maketrans("", "", string.punctuation) # deletes punctuation characters

    kept = []
    for token in word_tokenize(str(syn)):
        if not token.isalpha():
            continue
        lemma = lemmatizer.lemmatize(token.lower()) # lemmatize once per token
        if lemma not in stop_words and lemma in eng_words:
            kept.append(lemma)

    # Same final step as before: render the list to text, strip punctuation
    # and tokenize again, so the output tokens are unchanged.
    return word_tokenize(str(kept).translate(table))

def predict_genre(synopsis, vect, threshold=0.3, maxlen=100): # used for predicting genres
    """Print the genres the trained model predicts for a free-text synopsis.

    Relies on the notebook-level ``model`` and ``genres_array``.

    Args:
        synopsis: Raw synopsis text.
        vect: Fitted Keras ``Tokenizer`` used to encode the cleaned tokens.
        threshold: Minimum model score for a genre to be reported.
        maxlen: Sequence length the input is padded (or truncated) to; it must
            match the length the model was trained with.
    """
    tokens = preprocess(synopsis) # clean the text the same way as the training data
    encoded_syn = vect.texts_to_sequences([tokens]) # one sequence of word indices
    # Keras pad_sequences with the same settings as used for training; no
    # manual pre-padding of encoded_syn is needed.
    padded_syn = pad_sequences(encoded_syn, maxlen=maxlen, padding='post')
    scores = model.predict(padded_syn)[0] # one score per genre for the single input

    labels = list(genres_array)
    final = [labels[i] for i, score in enumerate(scores) if score >= threshold]

    print("Predicted genres: ")
    print(final)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ozral\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ozral\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
vect = None # placeholder; a later cell assigns the fitted Tokenizer to this name

# sorted() makes the load order, and therefore the train/test split, independent
# of whatever order the file system returns the names in.
files = sorted(os.listdir("data")) # list all files in the data directory

# Build every per-file frame first and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
frames = []
for file in files: # for each file in the directory
    frames.append(add_to_dataframe(file))
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['descs', 'genres']) # no input files: empty frame

df['genres'] = df['genres'].apply(remove_spaces) # genre list -> comma-joined string

# Unique genres in order of first appearance; the position of a genre in this
# list is the index of its output unit, so the order must be stable.
genres_array = list(dict.fromkeys(
    genre for joined in df['genres'] for genre in joined.split(",")
))

# 85/15 split. Both slices use the same boundary so no row is dropped
# (the previous "+1" silently discarded the row at the boundary).
split_at = int(len(df) * 0.85)
train = df.iloc[:split_at]
test = df.iloc[split_at:]

reading output-0-10000.json
reading output-10000-15000.json
reading output-15000-20000.json
reading output-20000-30000.json
reading output-30000-40000.json
reading output-40000-50000.json
reading output-50000-60000.json
reading output-60000-70000.json
reading output-70000-80000.json
reading output-80000-90000.json
reading output-90000-100000.json


In [3]:
MAX_SEQUENCE_LENGTH = 100 # every synopsis is padded/truncated to this many tokens

y_train = genre_vectorize(train['genres']) # multi-hot label matrices
y_test = genre_vectorize(test['genres'])

vect = Tokenizer()
vect.fit_on_texts(train['descs']) # the vocabulary comes from the training split only
vocab_size = len(vect.word_index) + 1 # number of distinct words, plus one

# NOTE(review): this is the vocabulary size, not a sequence length, and no
# visible cell reads it; kept only in case another cell depends on the name.
max_length = vocab_size

encoded_docs_train = vect.texts_to_sequences(train['descs']) # tokens -> word indices
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

encoded_docs_test = vect.texts_to_sequences(test['descs']) # same process for the test split
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

threshold = 0.3 # minimum score for a genre to count as predicted

In [4]:
# Embedding -> 1D convolution -> dense classifier with one sigmoid unit per genre.
# Input length and output width are read from the prepared arrays so the network
# always matches the data instead of relying on the literals 100 and 19.
model = Sequential() # create a sequential neural network
model.add(Embedding(vocab_size, output_dim=256, input_length=padded_docs_train.shape[1]))
model.add(Conv1D(128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(.1))
model.add(Flatten())
model.add(Dense(128, activation='sigmoid'))
model.add(Dropout(.15))
model.add(Dense(y_train.shape[1], activation='sigmoid')) # one independent score per genre

model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

model.fit(padded_docs_train, y_train, epochs=3, verbose=1, validation_data=(padded_docs_test, y_test), batch_size=32) # fit the model


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 256)          6277888   
_________________________________________________________________
conv1d (Conv1D)              (None, 96, 128)           163968    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 48, 128)           0         
_________________________________________________________________
dropout (Dropout)            (None, 48, 128)           0         
_________________________________________________________________
flatten (Flatten)            (None, 6144)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               786560    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0

<tensorflow.python.keras.callbacks.History at 0x1bb693a8f48>

In [5]:
# Score the test split and report micro-averaged metrics at the chosen threshold.
predictions = model.predict([padded_docs_test]) # one row of genre scores per test movie

print("For threshold: ", threshold)
# 1 where the score reaches the threshold, 0 otherwise (same dtype as the scores)
pred = (predictions >= threshold).astype(predictions.dtype)

precision, recall, f1 = (
    scorer(y_test, pred, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

For threshold:  0.3
Micro-average quality numbers
Precision: 0.4918, Recall: 0.5904, F1-measure: 0.5366


In [29]:
# Read a free-text synopsis from the user and print the genres predicted for it.
print("Enter movie synopsis: ") 
synopsis = input() # blocks until the user submits a line of text
predict_genre(synopsis, vect) # vect is the Tokenizer fitted on the training descriptions

Enter movie synopsis: 
A family heads to an isolated hotel for the winter sees horrific forebodings from both past and future.where a sinister presence influences the father into violence, while his psychic son 
Predicted genres: 
['Thriller', 'Horror']
