In [4]:
pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.19.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting keras>=3.5.0 (from tensorflow)
  Downloading keras-3.9.0-py3-none-any.whl.metadata (6.1 kB)
Using cached tensorflow-2.19.0-cp312-cp312-macosx_12_0_arm64.whl (252.7 MB)
Downloading keras-3.9.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: keras, tensorflow
  Attempting uninstall: keras
    Found existing installation: keras 2.14.0
    Uninstalling keras-2.14.0:
      Successfully uninstalled keras-2.14.0
Successfully installed keras-3.9.0 tensorflow-2.19.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
import pandas as pd

column_names = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

train_data=pd.read_csv('train_data.txt',sep=' ::: ', engine='python', header=None, names=column_names)
test_data=pd.read_csv('test_data.txt',sep=' ::: ', engine='python', header=None, names=column_names)

print(train_data.head())
print(test_data.head())

   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
   ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                               GENRE  DESCRIPTION  
0  L.R. Brane loves his life - his car, his apart...  

In [12]:
# Data Preprocessing
train_data['DESCRIPTION'] = train_data['DESCRIPTION'].astype(str).str.lower()
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].astype(str).str.lower()

# Encode Labels
label_encoder = LabelEncoder()
train_data['GENRE'] = label_encoder.fit_transform(train_data['GENRE'])


In [14]:
# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['DESCRIPTION'])
X_train = tokenizer.texts_to_sequences(train_data['DESCRIPTION'])
X_test = tokenizer.texts_to_sequences(test_data['DESCRIPTION'])

# Padding
max_len = 200
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
y_train = train_data['GENRE']

In [None]:
# Model Definition
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Evaluation
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Accuracy Calculation
accuracy = model.evaluate(X_train, y_train)[1] * 100
print(f"Training Accuracy: {accuracy:.2f}%")

Epoch 1/10




[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 149ms/step - accuracy: 0.3075 - loss: 2.3666 - val_accuracy: 0.4542 - val_loss: 1.9078
Epoch 2/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 151ms/step - accuracy: 0.4711 - loss: 1.8391 - val_accuracy: 0.5064 - val_loss: 1.7098
Epoch 3/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 154ms/step - accuracy: 0.5302 - loss: 1.6242 - val_accuracy: 0.5318 - val_loss: 1.6140
Epoch 4/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 149ms/step - accuracy: 0.5661 - loss: 1.4800 - val_accuracy: 0.5429 - val_loss: 1.5861
Epoch 5/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 157ms/step - accuracy: 0.5923 - loss: 1.3821 - val_accuracy: 0.5506 - val_loss: 1.5576
Epoch 6/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 160ms/step - accuracy: 0.6253 - loss: 1.2779 - val_accuracy: 0.5599 - val_loss: 1.5415
Epoch 7/10
[1m