In [None]:
!pip install nltk scikit-learn




Each NLTK resource downloaded in the next cell has a specific job:

- `punkt` → tokenizer (used to split text into words).
- `wordnet` → used for lemmatization (reducing words to their base form).
- `stopwords` → a list of common words such as "the", "is", and "and", which will be removed.
- `punkt_tab` → a newer, additional resource for tokenization.

In [None]:
import nltk

# NLTK data packages this notebook relies on:
#   punkt, punkt_tab -> tokenizer resources for nltk.word_tokenize
#   wordnet          -> data behind WordNetLemmatizer
#   stopwords        -> the English stopword list
NLTK_RESOURCES = ("punkt", "punkt_tab", "wordnet", "stopwords")

# nltk.download() signals failure through its boolean return value instead of
# raising, so collect the packages that did not download and report them here
# rather than letting a later cell fail with a harder-to-read LookupError.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    print("WARNING: could not download NLTK resources:", failed_downloads)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Imports & Load the Dataset

In [None]:
import pandas as pd
import numpy as np

from ast import literal_eval   # safer than eval

# ML & evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Text preprocessing / feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re



# Basic Cleaning & Convert genres → Primary Genre

In [None]:
# Read the raw book table from the CSV file.
raw_books = pd.read_csv("book_details.csv")

# Keep only rows that have both a description and a genres value
# (other columns are not checked), then renumber the index from 0.
required_columns = ["description", "genres"]
df = raw_books.dropna(subset=required_columns).reset_index(drop=True)

# Convert "['Classics','Fiction',...]" → ['Classics','Fiction',...]
def parse_genres(g):
    """Parse a stringified genre list such as "['Classics', 'Fiction']".

    Parameters
    ----------
    g : object
        Raw cell value from the ``genres`` column (normally a ``str``).

    Returns
    -------
    list
        The parsed genres. An empty list is returned when the value cannot be
        parsed as a Python literal, or when the literal is not a list/tuple
        (for example a bare string or a number), so callers can always rely
        on ``len(result)`` and ``result[0]`` referring to whole genre names.
    """
    try:
        parsed = literal_eval(g)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval is documented to raise on
        # malformed input; treat all of them as "no genres available".
        return []
    # A bare string would otherwise be indexed character-by-character and a
    # number would break len(); only sequence literals count as genre lists.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Turn each stringified genre list into a real Python list.
df["genres_parsed"] = df["genres"].apply(parse_genres)

# Rows whose parsed list is empty have no usable label: find their index
# labels, drop them, and renumber the index from 0.
genre_counts = df["genres_parsed"].apply(len)
indices_to_drop = df.index[genre_counts == 0].tolist()
df = df.drop(index=indices_to_drop).reset_index(drop=True)

# Single-label target: the first listed genre of every remaining book.
primary_genres = [genres[0] for genres in df["genres_parsed"]]
df["primary_genre"] = primary_genres

print(df[["description", "genres", "primary_genre"]].head())
print("Unique primary genres:", df["primary_genre"].nunique())


                                         description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Alternate cover edition of ISBN 9780679783268S...   
3  Discovered in the attic in which she spent the...   
4  Librarian's note: There is an Alternate Cover ...   

                                              genres primary_genre  
0  ['Classics', 'Fiction', 'Historical Fiction', ...      Classics  
1  ['Fantasy', 'Fiction', 'Young Adult', 'Magic',...       Fantasy  
2  ['Classics', 'Fiction', 'Romance', 'Historical...      Classics  
3  ['Classics', 'Nonfiction', 'History', 'Biograp...      Classics  
4  ['Classics', 'Fiction', 'Dystopia', 'Fantasy',...      Classics  
Unique primary genres: 151


# Create Binary Target (Fiction vs Not Fiction)

In [None]:
# Genre treated as the positive class; edit this to target another genre.
TARGET_GENRE = "Fiction"

# 1 where the primary genre equals TARGET_GENRE, 0 everywhere else.
is_target = df["primary_genre"] == TARGET_GENRE
df["label"] = is_target.astype("int64")

# Class balance, followed by a quick look at the genre -> label mapping.
print(df["label"].value_counts())
print(df[["primary_genre", "label"]].head())


label
0    4811
1    1308
Name: count, dtype: int64
  primary_genre  label
0      Classics      0
1       Fantasy      0
2      Classics      0
3      Classics      0
4      Classics      0


In [None]:
import nltk

# Make sure the NLTK data used by the preprocessing step is present.
# Packages that are already installed are simply reported as up-to-date.
# Tokenizer resources first ...
for tokenizer_pkg in ("punkt", "punkt_tab"):
    nltk.download(tokenizer_pkg)
# ... then the corpora used for lemmatization and stopword filtering.
nltk.download("wordnet")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text Preprocessing with NLTK

We’ll create a clean text column by applying these steps:

- lowercasing
- removing URLs, numbers, and punctuation
- stopword removal
- lemmatization

In [None]:
# English stopword set and a shared lemmatizer, built once and reused by
# every clean_text() call.
stop_words = set(stopwords.words("english"))
lemm = WordNetLemmatizer()

def clean_text(text):
    """Normalise one description into a space-separated string of lemmas.

    Steps, in order: lowercase; replace URLs (``http...`` / ``www...``) with a
    space; replace every character that is not a-z or whitespace with a space;
    tokenize with NLTK; drop stopwords and tokens of length <= 2; lemmatize
    the remaining tokens.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+", " ", lowered)
    letters_only = re.sub(r"[^a-z\s]", " ", without_urls)

    kept = []
    for word in nltk.word_tokenize(letters_only):
        # Skip very short tokens and stopwords before lemmatizing.
        if len(word) <= 2 or word in stop_words:
            continue
        kept.append(lemm.lemmatize(word))
    return " ".join(kept)

# Clean every description and show raw vs. cleaned text side by side.
df["clean_description"] = df["description"].map(clean_text)

print(df[["description", "clean_description"]].head())


                                         description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Alternate cover edition of ISBN 9780679783268S...   
3  Discovered in the attic in which she spent the...   
4  Librarian's note: There is an Alternate Cover ...   

                                   clean_description  
0  unforgettable novel childhood sleepy southern ...  
1  harry potter think ordinary boy rescued owl ta...  
2  alternate cover edition isbn since immediate s...  
3  discovered attic spent last year life anne fra...  
4  librarian note alternate cover edition edition...  


# Train–Test Split

In [None]:
# Features are the cleaned descriptions; the target is the binary label.
X, y = df["clean_description"], df["label"]

# 80/20 split with a fixed seed; stratifying on y keeps the class ratio
# the same in both partitions.
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])


Train size: 4895
Test size: 1224


# Baseline Models with TF-IDF (No Scaling Needed)
## Naive Bayes

In [None]:
# TF-IDF features (unigrams + bigrams, vocabulary limited to 5000 features)
# feeding a multinomial Naive Bayes classifier.
nb_steps = [
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ("clf", MultinomialNB()),
]
nb_pipeline = Pipeline(steps=nb_steps)

# Fit on the training split, then predict the held-out test split.
y_pred_nb = nb_pipeline.fit(X_train, y_train).predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.7843137254901961
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       962
           1       0.33      0.01      0.01       262

    accuracy                           0.78      1224
   macro avg       0.56      0.50      0.45      1224
weighted avg       0.69      0.78      0.69      1224



## Logistic Regression

In [None]:
# Same TF-IDF front end as the other baselines, with a logistic-regression
# classifier (iteration limit raised to 1000).
lr_steps = [
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ("clf", LogisticRegression(max_iter=1000)),
]
lr_pipeline = Pipeline(steps=lr_steps)

# Fit on the training split, then predict the held-out test split.
y_pred_lr = lr_pipeline.fit(X_train, y_train).predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.8120915032679739
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       962
           1       0.82      0.16      0.26       262

    accuracy                           0.81      1224
   macro avg       0.82      0.57      0.58      1224
weighted avg       0.81      0.81      0.76      1224



## Linear SVM

In [None]:
# TF-IDF features (unigrams + bigrams, vocabulary limited to 5000 features)
# feeding a linear SVM.
# random_state is pinned to 42 -- the same seed used for the train/test split
# and the cross-validation folds in this notebook -- because LinearSVC's
# solver can shuffle the data randomly (see the scikit-learn LinearSVC docs);
# without a seed the reported scores may differ between runs.
svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ("clf", LinearSVC(random_state=42))
])

# Fit on the training split, then predict the held-out test split.
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)

print("Linear SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


Linear SVM Accuracy: 0.8055555555555556
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       962
           1       0.57      0.40      0.47       262

    accuracy                           0.81      1224
   macro avg       0.71      0.66      0.67      1224
weighted avg       0.79      0.81      0.79      1224



# Cross-Validation (e.g., Logistic Regression)

We’ll do k-fold cross-validation on the training data.

In [None]:
from sklearn.model_selection import StratifiedKFold

# Five stratified folds, shuffled with a fixed seed so the fold assignment
# is the same on every run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of lr_pipeline on each fold of the training split only; the test
# split is not touched. n_jobs=-1 runs the folds in parallel on all cores.
cv_scores = cross_val_score(lr_pipeline, X_train, y_train, scoring="accuracy", cv=kf, n_jobs=-1)

print("CV accuracies:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())


CV accuracies: [0.80183861 0.79570991 0.79673136 0.79877426 0.79570991]
Mean CV accuracy: 0.7977528089887641


# Hyperparameter Tuning with GridSearchCV
## 1. Hyperparameter Tuning for Logistic Regression

We tune:

- `C` (inverse of regularization strength — smaller values mean stronger regularization)
- `ngram_range`
- `max_features`

In [None]:
# NOTE: this rebinds `lr_pipeline`. The vectorizer is left at its defaults
# here because the grid below supplies its settings.
lr_pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=1000)),
])

# 3 vocabulary sizes x 2 n-gram ranges x 3 values of C = 18 candidates.
# Penalty and solver are single-valued, so they are fixed, not searched.
param_grid_lr = dict(
    tfidf__max_features=[3000, 5000, 8000],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__C=[0.1, 1, 5],
    clf__penalty=["l2"],
    clf__solver=["lbfgs"],
)

# Exhaustive search with 5-fold CV on the training split, scored by accuracy.
# fit() returns the fitted search object itself.
grid_lr = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=param_grid_lr,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print("Best LR params:", grid_lr.best_params_)
print("Best LR CV score:", grid_lr.best_score_)

# The best parameter combination, as exposed by the fitted search.
best_lr_model = grid_lr.best_estimator_

# Score the tuned model on the held-out test split.
y_pred_lr_best = best_lr_model.predict(X_test)
tuned_lr_accuracy = accuracy_score(y_test, y_pred_lr_best)
print("Tuned LR Test Accuracy:", tuned_lr_accuracy)
print(classification_report(y_test, y_pred_lr_best))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best LR params: {'clf__C': 5, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Best LR CV score: 0.819203268641471
Tuned LR Test Accuracy: 0.8186274509803921
              precision    recall  f1-score   support

           0       0.84      0.95      0.89       962
           1       0.65      0.33      0.44       262

    accuracy                           0.82      1224
   macro avg       0.75      0.64      0.66      1224
weighted avg       0.80      0.82      0.79      1224



## 2. Hyperparameter Tuning for Linear SVM

In [None]:
# NOTE: this rebinds `svm_pipeline`, replacing the baseline SVM pipeline
# defined earlier in the notebook. The vectorizer is left at its defaults
# because the grid below supplies its settings.
# random_state is pinned to 42 (the seed used elsewhere in this notebook) so
# that every candidate, and the final refit, is reproducible; LinearSVC's
# solver can shuffle the data randomly (see the scikit-learn LinearSVC docs).
svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LinearSVC(random_state=42))
])

# 3 vocabulary sizes x 2 n-gram ranges x 3 values of C = 18 candidates.
param_grid_svm = {
    "tfidf__max_features": [3000, 5000, 8000],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.1, 1, 5]
}

# Exhaustive search with 5-fold CV on the training split, scored by accuracy.
grid_svm = GridSearchCV(
    svm_pipeline,
    param_grid=param_grid_svm,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid_svm.fit(X_train, y_train)

print("Best SVM params:", grid_svm.best_params_)
print("Best SVM CV score:", grid_svm.best_score_)

# The best parameter combination, as exposed by the fitted search.
best_svm_model = grid_svm.best_estimator_

# Score the tuned model on the held-out test split.
y_pred_svm_best = best_svm_model.predict(X_test)
print("Tuned SVM Test Accuracy:", accuracy_score(y_test, y_pred_svm_best))
print(classification_report(y_test, y_pred_svm_best))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best SVM params: {'clf__C': 1, 'tfidf__max_features': 8000, 'tfidf__ngram_range': (1, 2)}
Best SVM CV score: 0.8202247191011235
Tuned SVM Test Accuracy: 0.8186274509803921
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       962
           1       0.61      0.41      0.49       262

    accuracy                           0.82      1224
   macro avg       0.73      0.67      0.69      1224
weighted avg       0.80      0.82      0.80      1224



# Using the Best Model for Prediction

In [None]:
def predict_genre_label(text, model=best_svm_model):
    """Classify one raw description as the target genre or not.

    Parameters
    ----------
    text : str
        Raw, uncleaned description; it is passed through ``clean_text`` so it
        receives the same preprocessing as the training data.
    model : fitted estimator with ``predict``, optional
        Defaults to ``best_svm_model``. The default is bound when this
        function is defined, so re-run this cell after re-tuning the model.

    Returns
    -------
    tuple
        ``(pred, name)`` where ``pred`` is the predicted label (1 = target
        genre, otherwise 0) and ``name`` is ``TARGET_GENRE`` or
        ``"Not <TARGET_GENRE>"``.
    """
    clean = clean_text(text)
    pred = model.predict([clean])[0]
    # Build the readable name from TARGET_GENRE instead of a hard-coded
    # "Fiction", so it stays correct if the target genre is changed.
    return pred, (TARGET_GENRE if pred == 1 else f"Not {TARGET_GENRE}")

sample_text = "A magical story of a young boy who discovers a hidden world of wizards."
# Alternative sample to try:
# sample_text= "A young detective follows mysterious clues that lead to a shocking secret."
label_num, label_name = predict_genre_label(sample_text)

print("Predicted label:", label_num, "=>", label_name)


Predicted label: 1 => Fiction
