## Read the dataset
Dataset link: https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb

In [None]:
import pandas as pd

# Locations of the train and test text files of the IMDb genre dataset.
train_file_path = "./Dataset/train_data.txt"
test_file_path = "./Dataset/test_data_solution.txt"

# The files are read with header=None, so the column names are supplied here.
columns = ["ID", "Title", "Genre", "Description"]

# Fields are delimited by ":::". Optional whitespace around the marker is made
# part of the separator so it does not end up inside the parsed values (for
# example a genre read as " drama " instead of "drama").
# NOTE(review): assumes the files pad ":::" with spaces, as the Kaggle release
# appears to do - confirm against your copy. If there is no padding, the
# pattern behaves exactly like the plain ":::" separator.
# A multi-character separator is interpreted by pandas as a regular expression
# and requires the Python parsing engine.
field_separator = r"\s*:::\s*"

train_df = pd.read_csv(
    train_file_path,
    sep=field_separator,
    header=None,
    names=columns,
    engine="python",
)
test_df = pd.read_csv(
    test_file_path,
    sep=field_separator,
    header=None,
    names=columns,
    engine="python",
)


## Feature selection to remove unnecessary features (ID and title)

In [None]:
# Only the description text and the genre label are used from here on, so the
# identifier and title columns are removed from both frames.
unused_columns = ["ID", "Title"]
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

## Split the train and test data into features (X) and labels (Y)

In [None]:

# Model inputs are the plot descriptions; targets are the genre labels.
# The train and test frames come from separate files, so no random split is
# performed here.
X_train = train_df["Description"]
Y_train = train_df["Genre"]

X_test = test_df["Description"]
Y_test = test_df["Genre"]


## Feature Extraction Using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw descriptions into TF-IDF features. English stop words are
# dropped because they occur in almost every description and carry little
# signal about the genre.
verctorizer = TfidfVectorizer(stop_words="english")

# fit_transform does two things on the training descriptions: it learns the
# vocabulary (and term weights), then returns the TF-IDF matrix in which each
# row is a document and each column is a vocabulary term.
x_train_tfidf = verctorizer.fit_transform(raw_documents=X_train)

# The test descriptions are only transformed, never fitted, so they are
# projected into the same feature space that was learned from the training
# data.
x_test_tfidf = verctorizer.transform(raw_documents=X_test)


## Model selection (Naive Bayes)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training are chained.
naive_model = MultinomialNB().fit(x_train_tfidf, Y_train)

# Genre predictions for the test descriptions.
# NOTE(review): `predictions` does not appear to be read again in the visible
# cells; a later cell computes the same values as `nb_predictions`.
predictions = naive_model.predict(x_test_tfidf)

## Model selection (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression trained on the same TF-IDF features. The solver is
# allowed up to 1000 iterations (presumably so it can converge on this
# high-dimensional input - confirm if convergence warnings appear).
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X=x_train_tfidf, y=Y_train)


## Model selection (SVM)
Please note that this model can take a long time to train (over an hour), depending on your PC's specifications.

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings, trained on
# the same TF-IDF features as the other models.
# NOTE(review): if training time is a problem, sklearn.svm.LinearSVC is worth
# evaluating as an alternative; it is a different model, so results will differ.
svm_model = SVC()
svm_model.fit(X=x_train_tfidf, y=Y_train)

## Predictions

In [None]:
# Predict the genre of every test description with each trained model.
# The models are evaluated in the order listed, and the results are unpacked
# into one variable per model.
nb_predictions, lr_predictions, svm_predictions = (
    fitted_model.predict(x_test_tfidf)
    for fitted_model in (naive_model, logistic_model, svm_model)
)

## Evaluation

In [None]:
from sklearn.metrics import classification_report
# Print a per-genre precision / recall / F1 report for each model, scored
# against the test labels.
#
# * zero_division=0 (scikit-learn >= 0.22): a genre that a model never
#   predicts gets a score of 0 - the same value the default setting reports -
#   but without raising an UndefinedMetricWarning for every such genre.
# * The title and report are joined in a single string. Passing them to
#   print() as two arguments would insert a space after the newline and shift
#   the report's header row one column out of line with its data rows.
for report_title, model_predictions in (
    ("Naive Bayes", nb_predictions),
    ("Logistic Regression", lr_predictions),
    ("SVM", svm_predictions),
):
    report = classification_report(Y_test, model_predictions, zero_division=0)
    print(f"{report_title} Classifier Report:\n{report}")

## Test

In [None]:
# Smoke test: classify one hand-written plot description with every model.
description = ["A scientist invents a time machine and travels to the past."]

# The text must go through the already-fitted vectorizer so that it is encoded
# with the same vocabulary the models were trained on.
description_tfidf = verctorizer.transform(description)

# predict() returns one label per input row; [0] selects the label for the
# single description so that all three lines print a plain genre string
# (the SVM line previously printed the whole array).
print("Predicted Genre (Naive Bayes):", naive_model.predict(description_tfidf)[0])
print("Predicted Genre (Logistic Regression):", logistic_model.predict(description_tfidf)[0])
print("Predicted Genre (SVM):", svm_model.predict(description_tfidf)[0])