In [148]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [149]:
# Read the test data with the correct encoding
with open('./Genre Classification Dataset/test_data.txt', 'r', encoding='utf-8') as test_file:
    test_content = test_file.readlines()


In [175]:
test_content

["1 ::: Edgar's Lunch (1998) ::: L.R. Brane loves his life - his car, his apartment, his job, but especially his girlfriend, Vespa. One day while showering, Vespa runs out of shampoo. L.R. runs across the street to a convenience store to buy some more, a quick trip of no more than a few minutes. When he returns, Vespa is gone and every trace of her existence has been wiped out. L.R.'s life becomes a tortured existence as one strange event after another occurs to confirm in his mind that a conspiracy is working against his finding Vespa.\n",
 '2 ::: La guerra de papá (1977) ::: Spain, March 1964: Quico is a very naughty child of three belonging to a wealthy middle-class family. Since Cristina\'s birth, he feels he has lost the privileged position of "prince" of the house for his eight months old sister. So, with his brother Juan, who is eight years old and is quite disobedient, spend their time committing prank after prank, causing the resulting anger of his mother, the nanny and the ol

In [150]:
# Load the training dataset
training_file_path = "./Genre Classification Dataset/train_data.txt"
training_dataset = pd.read_csv(
    training_file_path, 
    sep=':::', 
    names=['Title', 'Category', 'Summary'], 
    engine='python'
)
print(training_dataset.head(10))

                                 Title       Category  \
1        Oscar et la dame rose (2009)          drama    
2                        Cupid (1997)       thriller    
3    Young, Wild and Wonderful (1980)          adult    
4               The Secret Sin (1915)          drama    
5              The Unrecovered (2007)          drama    
6              Quality Control (2011)    documentary    
7                  "Pink Slip" (2009)         comedy    
8                One Step Away (1985)          crime    
9            "Desperate Hours" (2016)     reality-tv    
10                   Spirits (2014/I)         horror    

                                              Summary  
1    Listening in to a conversation between his do...  
2    A brother and sister with a past incestuous r...  
3    As the bus empties the students for their fie...  
4    To help their unemployed father make ends mee...  
5    The film's title refers not only to the un-re...  
6    Quality Control consists of a s

In [151]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

In [152]:
# Fit TF-IDF on the training set descriptions
training_features = vectorizer.fit_transform(training_dataset['Summary'])

In [153]:
# Apply the TF-IDF transformation to the test set descriptions
test_features = vectorizer.transform(test_content)

In [154]:
# Prepare data for training and validation
features = training_features
labels = training_dataset['Category']
train_features, validation_features, train_labels, validation_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [155]:
# Initialize and train the Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(train_features, train_labels)

In [156]:
# Generate predictions on the validation set
validation_predictions = nb_model.predict(validation_features)

In [157]:
# Assess the model's performance
val_accuracy = accuracy_score(validation_labels, validation_predictions)
print("Validation Accuracy:", val_accuracy)
print(classification_report(validation_labels, validation_predictions))

Validation Accuracy: 0.44507977497002676


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.00      0.00      0.00       263
       adult        0.00      0.00      0.00       112
   adventure        0.00      0.00      0.00       139
   animation        0.00      0.00      0.00       104
   biography        0.00      0.00      0.00        61
      comedy        0.57      0.03      0.06      1443
       crime        0.00      0.00      0.00       107
 documentary        0.54      0.90      0.67      2659
       drama        0.38      0.89      0.53      2697
      family        0.00      0.00      0.00       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.00      0.00      0.00        40
     history        0.00      0.00      0.00        45
      horror        0.00      0.00      0.00       431
       music        0.00      0.00      0.00       144
     musical        0.00      0.00      0.00        50
     mystery        0.00      0.00      0.00        56
        n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [158]:
# Predict genres for the test dataset
test_predictions = nb_model.predict(test_features)
test_results = pd.DataFrame({'Description': test_content, 'Predicted_Category': test_predictions})

In [159]:
# Save the predictions to a CSV file
test_results.to_csv('genre_predictions.csv', index=False)

In [160]:
# Display a preview of the results
print(test_results)

                                             Description Predicted_Category
0      1 ::: Edgar's Lunch (1998) ::: L.R. Brane love...             drama 
1      2 ::: La guerra de papá (1977) ::: Spain, Marc...             drama 
2      3 ::: Off the Beaten Track (2010) ::: One year...       documentary 
3      4 ::: Meu Amigo Hindu (2015) ::: His father ha...             drama 
4      5 ::: Er nu zhai (1955) ::: Before he was know...             drama 
...                                                  ...                ...
54195  54196 ::: "Tales of Light & Dark" (2013) ::: C...             drama 
54196  54197 ::: Der letzte Mohikaner (1965) ::: As A...             drama 
54197  54198 ::: Oliver Twink (2007) ::: A movie 169 ...             drama 
54198  54199 ::: Slipstream (1973) ::: Popular, but m...             drama 
54199  54200 ::: Curitiba Zero Grau (2010) ::: Curiti...       documentary 

[54200 rows x 2 columns]
