## Read the dataset
dataset link : https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb

In [16]:
import pandas as pd

train_file_path = "./Dataset/train_data.txt"
test_file_path = "./Dataset/test_data_solution.txt"

columns = ["ID", "Title", "Genre", "Description"]

train_df = pd.read_csv(train_file_path, delimiter=":::", header=None, names=columns, engine="python")
test_df = pd.read_csv(test_file_path, delimiter=":::", header=None, names=columns, engine="python")


## Feature selection to remove unnecssary features (id and title)

In [17]:
train_df = train_df.drop(columns=["ID", "Title"], axis=1)
test_df = test_df.drop(columns=["ID", "Title"], axis=1)

## Split data into Train test splits

In [18]:

X_train, Y_train = train_df["Description"], train_df["Genre"]
X_test, Y_test = test_df["Description"], test_df["Genre"]


## Feature Extraction Using TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

#converting the text into a matrix of tf-idf features, i removed the stop words beceause they'll add confusion to the model later as they're repeated and won't add any sense
verctorizer = TfidfVectorizer(stop_words="english")

#now we'll use the fit transform function which does 2 things, learning the vocab of the data with fit function and transforming it into a matrix of tf-idf features where each row represents a document and each column represents a term from the vocab
x_train_tfidf = verctorizer.fit_transform(X_train)

#now transforming the test data into the same tfidf space learned from the x_train, ensuring the test data is represented in the same way as train data
x_test_tfidf = verctorizer.transform(X_test)


## Model selection (Naive Bayes)

In [20]:
from sklearn.naive_bayes import MultinomialNB

naive_model = MultinomialNB()
naive_model.fit(x_train_tfidf, Y_train)
predictions = naive_model.predict(x_test_tfidf)

## Model selection (Logistic Regression)

In [21]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(x_train_tfidf, Y_train)


## Model selection (SVM)
Please note that this model takes too long to train based on your pc spacification (+1 hour).

In [125]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(x_train_tfidf, Y_train)

## Predctions

In [23]:
nb_predictions = naive_model.predict(x_test_tfidf)
lr_predictions = logistic_model.predict(x_test_tfidf)
svm_predictions = svm_model.predict(x_test_tfidf)

## Evaluation

In [24]:
from sklearn.metrics import classification_report
print("Naive Bayes Classifier Report:\n", classification_report(Y_test, nb_predictions))
print("Logistic Regression Classifier Report:\n", classification_report(Y_test, lr_predictions))
print("SVM Classifier Report:\n", classification_report(Y_test, svm_predictions))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes Classifier Report:
                precision    recall  f1-score   support

      action        0.00      0.00      0.00      1314
       adult        0.00      0.00      0.00       590
   adventure        0.00      0.00      0.00       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.72      0.05      0.09      7446
       crime        0.00      0.00      0.00       505
 documentary        0.53      0.90      0.66     13096
       drama        0.38      0.88      0.53     13612
      family        0.00      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.00      0.00      0.00       193
     history        0.00      0.00      0.00       243
      horror        0.00      0.00      0.00      2204
       music        0.00      0.00      0.00       731
     musical        0.00      0.00      0.00       276
     mystery        0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression Classifier Report:
                precision    recall  f1-score   support

      action        0.52      0.26      0.35      1314
       adult        0.62      0.21      0.31       590
   adventure        0.69      0.14      0.24       775
   animation        0.60      0.02      0.05       498
   biography        0.00      0.00      0.00       264
      comedy        0.55      0.59      0.57      7446
       crime        0.42      0.02      0.04       505
 documentary        0.66      0.87      0.75     13096
       drama        0.54      0.80      0.64     13612
      family        0.58      0.08      0.14       783
     fantasy        0.71      0.02      0.03       322
   game-show        0.93      0.48      0.63       193
     history        0.00      0.00      0.00       243
      horror        0.66      0.57      0.61      2204
       music        0.70      0.39      0.50       731
     musical        1.00      0.01      0.01       276
     mystery        1.00

  _warn_prf(average, modifier, msg_start, len(result))


## Test

In [27]:
description = ["A scientist invents a time machine and travels to the past."]
description_tfidf = verctorizer.transform(description)
print("Predicted Genre (Naive Bayes):", naive_model.predict(description_tfidf)[0])
print("Predicted Genre (Logistic Regression):", logistic_model.predict(description_tfidf)[0])
print("Predicted Genre (SVM):", svm_model.predict(description_tfidf))

Predicted Genre (Naive Bayes):  documentary 
Predicted Genre (Logistic Regression):  sci-fi 
