In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd

column_names = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

train_data=pd.read_csv('train_data.txt',sep=' ::: ', engine='python', header=None, names=column_names)
test_data=pd.read_csv('test_data.txt',sep=' ::: ', engine='python', header=None, names=column_names)

print(train_data.head())
print(test_data.head())


   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
   ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                               GENRE  DESCRIPTION  
0  L.R. Brane loves his life - his car, his apart...  

In [10]:
# Data Preprocessing
train_data['DESCRIPTION'] = train_data['DESCRIPTION'].astype(str).str.lower()
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].astype(str).str.lower()

# Encode Labels
label_encoder = LabelEncoder()
train_data['GENRE'] = label_encoder.fit_transform(train_data['GENRE'])

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['DESCRIPTION'])
X_train = tokenizer.texts_to_sequences(train_data['DESCRIPTION'])
X_test = tokenizer.texts_to_sequences(test_data['DESCRIPTION'])

# Padding
max_len = 200
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
y_train = train_data['GENRE']

In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model_cnn.add(Conv1D(64, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(len(label_encoder.classes_), activation='softmax'))

model_cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training
model_cnn.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Evaluation
accuracy_cnn = model_cnn.evaluate(X_train, y_train)[1] * 100
print(f"CNN Model Training Accuracy: {accuracy_cnn:.2f}%")

Epoch 1/10




[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.3686 - loss: 2.2513 - val_accuracy: 0.5096 - val_loss: 1.7257
Epoch 2/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.5569 - loss: 1.5517 - val_accuracy: 0.5463 - val_loss: 1.5839
Epoch 3/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.6444 - loss: 1.2370 - val_accuracy: 0.5469 - val_loss: 1.5922
Epoch 4/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.7277 - loss: 0.9542 - val_accuracy: 0.5405 - val_loss: 1.7205
Epoch 5/10
[1m635/678[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 21ms/step - accuracy: 0.8182 - loss: 0.6582