In [None]:
!pip install nltk scikit-learn




In [None]:
import nltk
# Tokenizer data ('punkt_tab' is needed in newer NLTK) and the WordNet package.
for resource in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Load the CSV and keep only rows where both description and genres exist,
# renumbering the index from 0 afterwards.
df = (
    pd.read_csv("book_details.csv")
    .dropna(subset=["description", "genres"])
    .reset_index(drop=True)
)

# Convert "['Fiction','Drama']" -> ['Fiction','Drama']
def parse_genres(g):
    """Parse a stringified genre list into a real Python list.

    Parameters
    ----------
    g : str
        Text holding a Python list literal, e.g. "['Fiction', 'Drama']".

    Returns
    -------
    list
        The parsed genres. An empty list is returned when ``g`` cannot be
        parsed, or when it parses to something that is not a list/tuple
        (a bare string, a number, a dict, ...).
    """
    try:
        parsed = literal_eval(g)
    except Exception:
        # Best effort: an unparseable cell is treated as "no genres".
        return []

    # literal_eval happily returns scalars and strings too; callers take
    # len() and [0] of the result, so anything that is not a sequence of
    # genres is normalised to "no genres" instead of leaking through.
    if not isinstance(parsed, (list, tuple)):
        return []
    return list(parsed)

df["genres_parsed"] = df["genres"].apply(parse_genres)

# Keep only the rows whose parsed genre list is non-empty
has_genre = df["genres_parsed"].map(len) > 0
df = df[has_genre].reset_index(drop=True)

# The first listed genre is used as the book's primary genre
df["primary_genre"] = [genres[0] for genres in df["genres_parsed"]]

print(df[["description", "genres", "primary_genre"]].head())
print("Number of unique primary genres:", df["primary_genre"].nunique())


                                         description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Alternate cover edition of ISBN 9780679783268S...   
3  Discovered in the attic in which she spent the...   
4  Librarian's note: There is an Alternate Cover ...   

                                              genres primary_genre  
0  ['Classics', 'Fiction', 'Historical Fiction', ...      Classics  
1  ['Fantasy', 'Fiction', 'Young Adult', 'Magic',...       Fantasy  
2  ['Classics', 'Fiction', 'Romance', 'Historical...      Classics  
3  ['Classics', 'Nonfiction', 'History', 'Biograp...      Classics  
4  ['Classics', 'Fiction', 'Dystopia', 'Fantasy',...      Classics  
Number of unique primary genres: 151


In [None]:
TARGET_GENRE = "Fiction"

# Option 1: strict equality (only exact "Fiction")
# df["label"] = df["primary_genre"].apply(lambda g: 1 if g == TARGET_GENRE else 0)

# Option 2: any genre containing TARGET_GENRE (Historical Fiction, etc.)
# The match is a case-sensitive substring test, so e.g. "Nonfiction" is NOT
# labelled positive. TARGET_GENRE is used here (rather than a repeated
# literal) so that changing the constant actually changes the labels.
df["label"] = df["primary_genre"].apply(lambda g: 1 if TARGET_GENRE in g else 0)

print(df["label"].value_counts())
print(df[["primary_genre", "label"]].head())


label
0    4249
1    1870
Name: count, dtype: int64
  primary_genre  label
0      Classics      0
1       Fantasy      0
2      Classics      0
3      Classics      0
4      Classics      0


In [None]:
# Shared lemmatizer instance used by clean_text
lemm = WordNetLemmatizer()
# English stopwords held in a set for fast membership checks
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise a raw description into a space-separated string of lemmas.

    Steps: lowercase, strip URLs, replace everything except a-z and
    whitespace with a space, tokenize, drop stopwords and tokens of length
    <= 2, lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text. Non-string input (``None``, NaN, numbers) is treated as
        empty text instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # Missing values (None / float NaN) and other non-strings carry no text;
    # return early rather than failing on .lower().
    if not isinstance(text, str):
        return ""

    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)
    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # remove stopwords + short tokens, lemmatize
    # (the stopword test is applied to the token before lemmatization)
    tokens = [
        lemm.lemmatize(tok)
        for tok in tokens
        if tok not in stop_words and len(tok) > 2
    ]
    return " ".join(tokens)

# Run the cleaner over every description and store the result alongside it
df["clean_description"] = df["description"].map(clean_text)

print(df[["description", "clean_description"]].head())


                                         description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Alternate cover edition of ISBN 9780679783268S...   
3  Discovered in the attic in which she spent the...   
4  Librarian's note: There is an Alternate Cover ...   

                                   clean_description  
0  unforgettable novel childhood sleepy southern ...  
1  harry potter think ordinary boy rescued owl ta...  
2  alternate cover edition isbn since immediate s...  
3  discovered attic spent last year life anne fra...  
4  librarian note alternate cover edition edition...  


In [None]:
# Features are the cleaned descriptions; the target is the binary label
X, y = df["clean_description"], df["label"]

# Hold out 20% for testing, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 4895
Test size: 1224


In [None]:
# Bag-of-words (unigrams + bigrams, capped at 5000 features) -> Multinomial NB
nb_pipeline = Pipeline(
    steps=[
        ("bow", CountVectorizer(ngram_range=(1, 2), max_features=5000)),
        ("clf", MultinomialNB()),
    ]
)

# Pipeline.fit returns the fitted pipeline, so predict can be chained
y_pred_nb = nb_pipeline.fit(X_train, y_train).predict(X_test)

print("Naive Bayes (BoW) Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes (BoW) Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.86      0.77      0.81       850
           1       0.57      0.71      0.64       374

    accuracy                           0.75      1224
   macro avg       0.72      0.74      0.72      1224
weighted avg       0.77      0.75      0.76      1224



In [None]:
# Bag-of-words (unigrams + bigrams, capped at 5000 features) -> Logistic Regression
lr_pipeline = Pipeline(
    steps=[
        ("bow", CountVectorizer(ngram_range=(1, 2), max_features=5000)),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
)

# Pipeline.fit returns the fitted pipeline, so predict can be chained
y_pred_lr = lr_pipeline.fit(X_train, y_train).predict(X_test)

print("Logistic Regression (BoW) Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression (BoW) Accuracy: 0.7589869281045751
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       850
           1       0.62      0.55      0.58       374

    accuracy                           0.76      1224
   macro avg       0.72      0.70      0.71      1224
weighted avg       0.75      0.76      0.75      1224



In [None]:
# Bag-of-words (unigrams + bigrams, capped at 5000 features) -> linear SVM.
# random_state is fixed because LinearSVC shuffles the data when it uses the
# dual coordinate-descent solver; without a seed the fitted model (and the
# scores printed below) may differ from run to run.
svm_pipeline = Pipeline([
    ("bow", CountVectorizer(max_features=5000, ngram_range=(1,2))),
    ("clf", LinearSVC(random_state=42))
])

svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)

print("Linear SVM (BoW) Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))




Linear SVM (BoW) Accuracy: 0.7271241830065359
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       850
           1       0.55      0.56      0.55       374

    accuracy                           0.73      1224
   macro avg       0.68      0.68      0.68      1224
weighted avg       0.73      0.73      0.73      1224



In [None]:
# 5-fold stratified CV with shuffling; the seed keeps the folds reproducible
kf = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

# Accuracy of the LR pipeline on each fold of the training data
cv_scores_lr = cross_val_score(
    lr_pipeline, X_train, y_train, scoring="accuracy", cv=kf, n_jobs=-1
)

print("LR BoW CV scores:", cv_scores_lr)
print("Mean CV accuracy:", cv_scores_lr.mean())


LR BoW CV scores: [0.76404494 0.73953013 0.75995914 0.76506639 0.75689479]
Mean CV accuracy: 0.7570990806945862


In [None]:
# Untuned template pipeline; the grid below fills in the hyperparameters
lr_bow_pipeline = Pipeline(
    steps=[
        ("bow", CountVectorizer()),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
)

# "<step>__<param>" keys address parameters of the named pipeline steps
param_grid_lr = dict(
    bow__max_features=[3000, 5000, 8000],
    bow__ngram_range=[(1, 1), (1, 2)],
    clf__C=[0.1, 1, 5],
    clf__penalty=["l2"],
    clf__solver=["lbfgs"],
)

# 5-fold grid search scored on accuracy, run on all available cores
grid_lr = GridSearchCV(
    estimator=lr_bow_pipeline,
    param_grid=param_grid_lr,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_lr.fit(X_train, y_train)

print("Best LR BoW params:", grid_lr.best_params_)
print("Best LR BoW CV score:", grid_lr.best_score_)

# Evaluate the best configuration on the held-out test set
best_lr_bow = grid_lr.best_estimator_
y_pred_lr_best = best_lr_bow.predict(X_test)

print("Tuned LR (BoW) Test Accuracy:", accuracy_score(y_test, y_pred_lr_best))
print(classification_report(y_test, y_pred_lr_best))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best LR BoW params: {'bow__max_features': 8000, 'bow__ngram_range': (1, 1), 'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
Best LR BoW CV score: 0.7730337078651686
Tuned LR (BoW) Test Accuracy: 0.7769607843137255
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       850
           1       0.68      0.52      0.59       374

    accuracy                           0.78      1224
   macro avg       0.74      0.70      0.72      1224
weighted avg       0.77      0.78      0.77      1224



In [None]:
# Untuned template pipeline; the grid below fills in the hyperparameters.
# random_state is fixed because LinearSVC shuffles the data when it uses the
# dual coordinate-descent solver; seeding it keeps every grid candidate (and
# therefore the selected best model) reproducible across runs.
svm_bow_pipeline = Pipeline([
    ("bow", CountVectorizer()),
    ("clf", LinearSVC(random_state=42))
])

# "<step>__<param>" keys address parameters of the named pipeline steps
param_grid_svm = {
    "bow__max_features": [3000, 5000, 8000],
    "bow__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.1, 1, 5]
}

# 5-fold grid search scored on accuracy, run on all available cores
grid_svm = GridSearchCV(
    svm_bow_pipeline,
    param_grid=param_grid_svm,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid_svm.fit(X_train, y_train)

print("Best SVM BoW params:", grid_svm.best_params_)
print("Best SVM BoW CV score:", grid_svm.best_score_)

# Evaluate the best configuration on the held-out test set
best_svm_bow = grid_svm.best_estimator_

y_pred_svm_best = best_svm_bow.predict(X_test)
print("Tuned SVM (BoW) Test Accuracy:", accuracy_score(y_test, y_pred_svm_best))
print(classification_report(y_test, y_pred_svm_best))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best SVM BoW params: {'bow__max_features': 8000, 'bow__ngram_range': (1, 2), 'clf__C': 0.1}
Best SVM BoW CV score: 0.7481103166496424
Tuned SVM (BoW) Test Accuracy: 0.7508169934640523
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       850
           1       0.60      0.56      0.58       374

    accuracy                           0.75      1224
   macro avg       0.71      0.70      0.70      1224
weighted avg       0.75      0.75      0.75      1224



In [None]:
def predict_fiction(text, model=best_svm_bow):
    """Classify a raw description with a fitted pipeline.

    The text is passed through ``clean_text`` and then to ``model.predict``.

    Parameters
    ----------
    text : str
        Raw, uncleaned description.
    model : fitted estimator, optional
        Object exposing ``predict``; defaults to the tuned SVM pipeline that
        exists when this function is defined.

    Returns
    -------
    tuple
        ``(pred, label)`` where ``pred`` is the model's prediction for the
        text (0 or 1) and ``label`` is "Fiction" for 1, otherwise
        "Not Fiction".
    """
    # predict expects a collection of documents, so wrap the single text
    cleaned_batch = [clean_text(text)]
    pred = model.predict(cleaned_batch)[0]

    if pred == 1:
        label = "Fiction"
    else:
        label = "Not Fiction"
    return pred, label

# Hand-written blurb used as a quick smoke test of the classifier
sample_text = """
A young boy discovers a magical world of wizards, hidden secrets, and dangerous adventures.
"""

# Uses the function's default model
pred_num, pred_label = predict_fiction(text=sample_text)
print("Predicted:", pred_num, "=>", pred_label)


Predicted: 0 => Not Fiction
