In [2]:
# Import necessary libraries for data handling, preprocessing, and deep learning
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [4]:
# Column names based on the dataset description
column_names = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

# Load training and test datasets
train_data = pd.read_csv('train_data.txt', sep=' ::: ', engine='python', header=None, names=column_names)
test_data = pd.read_csv('test_data.txt', sep=' ::: ', engine='python', header=None, names=column_names)

# Display the first few rows of data for validation
print("Training Data Preview:")
print(train_data.head())

print("\nTest Data Preview:")
print(test_data.head())


Training Data Preview:
   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  

Test Data Preview:
   ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                               GENRE  DESCRIPTION  
0  L.R. Bra

In [6]:
# Convert descriptions to lowercase for uniformity and normalization
train_data['DESCRIPTION'] = train_data['DESCRIPTION'].astype(str).str.lower()
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].astype(str).str.lower()

# Encode the genre labels into numerical values using LabelEncoder
label_encoder = LabelEncoder()
train_data['GENRE'] = label_encoder.fit_transform(train_data['GENRE'])

# Print the unique genres and their encoded values
print("Encoded Genres:", dict(zip(label_encoder.classes_, range(len(label_encoder.classes_)))))


Encoded Genres: {'action': 0, 'adult': 1, 'adventure': 2, 'animation': 3, 'biography': 4, 'comedy': 5, 'crime': 6, 'documentary': 7, 'drama': 8, 'family': 9, 'fantasy': 10, 'game-show': 11, 'history': 12, 'horror': 13, 'music': 14, 'musical': 15, 'mystery': 16, 'news': 17, 'reality-tv': 18, 'romance': 19, 'sci-fi': 20, 'short': 21, 'sport': 22, 'talk-show': 23, 'thriller': 24, 'war': 25, 'western': 26}


In [8]:
# Tokenize the text data to convert sentences into sequences of integers
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['DESCRIPTION'])

# Convert descriptions to sequences
X_train = tokenizer.texts_to_sequences(train_data['DESCRIPTION'])
X_test = tokenizer.texts_to_sequences(test_data['DESCRIPTION'])

# Pad sequences to ensure uniform input size (max_len = 200)
max_len = 200
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
y_train = train_data['GENRE']

print(f"Training data shape: {X_train.shape}, Labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")


Training data shape: (54214, 200), Labels shape: (54214,)
Test data shape: (54200, 200)


In [10]:
# Build a Convolutional Neural Network (CNN) model for text classification
model_cnn = Sequential()

# Embedding Layer: Convert words into dense vectors
model_cnn.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))

# Convolutional Layer: Extract key features using a kernel size of 5
model_cnn.add(Conv1D(64, 5, activation='relu'))

# Global Max Pooling Layer: Reduce dimensionality by taking the max feature value
model_cnn.add(GlobalMaxPooling1D())

# Fully Connected Layers
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model using sparse categorical crossentropy for classification
model_cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("CNN Model Summary:")
model_cnn.summary()


CNN Model Summary:




In [12]:
# Train the CNN model with training data using validation split
history = model_cnn.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Save the trained model for later use
model_cnn.save('genre_classification_cnn.h5')
print("Model saved successfully.")


Epoch 1/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 20ms/step - accuracy: 0.3655 - loss: 2.2833 - val_accuracy: 0.5030 - val_loss: 1.7609
Epoch 2/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - accuracy: 0.5436 - loss: 1.6041 - val_accuracy: 0.5445 - val_loss: 1.5950
Epoch 3/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 20ms/step - accuracy: 0.6253 - loss: 1.3045 - val_accuracy: 0.5537 - val_loss: 1.5709
Epoch 4/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - accuracy: 0.7163 - loss: 0.9995 - val_accuracy: 0.5462 - val_loss: 1.6777
Epoch 5/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - accuracy: 0.8032 - loss: 0.7066 - val_accuracy: 0.5308 - val_loss: 1.8901
Epoch 6/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - accuracy: 0.8770 - loss: 0.4558 - val_accuracy: 0.5203 - val_loss: 2.2518
Epoch 7/10
[1m6



Model saved successfully.


In [None]:
# Evaluate the model performance on training data
train_loss, train_accuracy = model_cnn.evaluate(X_train, y_train)
print(f"CNN Model Training Accuracy: {train_accuracy * 100:.2f}%")