In [2]:
! pip install nltk gensim scikit-learn
! pip install gensim  




In [3]:
# Fetch the NLTK resources this notebook relies on. Each call returns a
# status flag; the value of the last call is what the cell displays.
import nltk
# Tokenizer data -- presumably what nltk.word_tokenize needs (TODO confirm
# which of punkt / punkt_tab the installed NLTK version actually loads).
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists, read later through stopwords.words("english").
nltk.download('stopwords')
# WordNet data -- presumably backing WordNetLemmatizer (TODO confirm).
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC


In [5]:
# Load the book metadata and keep only the rows that carry both a description
# and a genre string; the index is rebuilt so it is contiguous afterwards.
df = (
    pd.read_csv("book_details.csv")
    .dropna(subset=["description", "genres"])
    .reset_index(drop=True)
)

def parse_genres(g):
    """Parse the string form of a genre list into a real Python list.

    Parameters
    ----------
    g : str
        Text such as ``"['Fiction', 'Classics']"``.

    Returns
    -------
    list
        The parsed genres. An empty list is returned when ``g`` cannot be
        parsed as a Python literal or when the literal is not a list/tuple
        (e.g. a bare number), so callers can always use ``len()`` and
        indexing on the result.
    """
    try:
        parsed = literal_eval(g)
    # These are the exceptions literal_eval documents for malformed input.
    # Catching them explicitly (instead of a bare ``except``) lets
    # KeyboardInterrupt / SystemExit propagate normally.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # Any other literal (number, string, dict, ...) is not a genre list.
    return []

# Convert every raw genre string into a Python list.
df["genres_parsed"] = df["genres"].apply(parse_genres)

# The first listed genre is treated as the primary one; rows whose genre list
# is empty fall back to the placeholder "Unknown".
df["primary_genre"] = [
    "Unknown" if len(genre_list) == 0 else genre_list[0]
    for genre_list in df["genres_parsed"]
]

# Binary label: 1 when the primary genre contains the text "Fiction"
# (case-sensitive substring test), otherwise 0.
df["label"] = df["primary_genre"].map(lambda name: int("Fiction" in name))


In [6]:
# Shared by clean_text: the lemmatizer instance and the English stop-word set
# (a set, so membership tests per token are cheap).
lemm = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise a raw description and return its lemmatized tokens.

    The text is lower-cased, URL-like fragments and every character other
    than a-z / whitespace are replaced by spaces, and the result is
    tokenized. Stop words and tokens of two characters or fewer are
    dropped; the remaining tokens are lemmatized.

    Returns a list of tokens (not a joined string).
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+", " ", lowered)
    letters_only = re.sub(r"[^a-z\s]", " ", without_urls)

    kept = []
    for word in nltk.word_tokenize(letters_only):
        # Skip short tokens and stop words before lemmatizing.
        if len(word) <= 2 or word in stop_words:
            continue
        kept.append(lemm.lemmatize(word))
    return kept


In [7]:
# Run the cleaning pipeline over every description; each cell of the new
# column holds that document's list of tokens.
df["tokens"] = df["description"].map(clean_text)


In [10]:
# Every document's token list is fed to Word2Vec as one training sentence.
sentences = list(df["tokens"])

# NOTE(review): the embeddings are fitted on all rows, i.e. before the
# train/test split that happens further down -- confirm this is intended.
# NOTE(review): gensim documents that runs with workers > 1 are not exactly
# repeatable -- confirm whether run-to-run variation matters here.
w2v_model = Word2Vec(
    sentences=sentences,
    sg=0,  # CBOW
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the trained model, then report how many words it learned.
w2v_model.save("cbow_w2v.model")
print("Vocabulary Size:", len(w2v_model.wv))


Vocabulary Size: 41120


In [11]:

def document_vector(tokens, model=None):
    """Average the word embeddings of ``tokens`` into one document vector.

    Parameters
    ----------
    tokens : iterable of str
        Pre-processed tokens of a single document.
    model : optional
        Object exposing a ``wv`` attribute that supports ``word in wv``,
        ``wv[word]`` and ``wv.vector_size``. When omitted, the
        notebook-level ``w2v_model`` is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of all tokens found in the
        vocabulary. If no token is in the vocabulary (including an empty
        token list) a zero vector of length ``wv.vector_size`` is returned.
    """
    if model is None:
        model = w2v_model
    keyed_vectors = model.wv

    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vectors:
        # Take the length from the model instead of hard-coding 100, so the
        # fallback stays correct if the embedding size is ever changed.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

# One averaged embedding per document.
X_vectors = df["tokens"].map(document_vector)

# Stack the per-document vectors row-wise into a 2-D feature matrix and pull
# the labels out as a plain array.
X = np.vstack(X_vectors.to_list())
y = df["label"].to_numpy()


In [12]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Logistic-regression baseline on the averaged embeddings (fit returns the
# estimator itself, so construction and training can be chained).
# NOTE(review): the classes are imbalanced; class_weight="balanced" may be
# worth trying -- confirm against the recall of class 1.
lr = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Score on the held-out rows: overall accuracy plus per-class metrics.
pred_lr = lr.predict(X_test)
print("LR Accuracy:", accuracy_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))


LR Accuracy: 0.7107569721115538
              precision    recall  f1-score   support

           0       0.72      0.98      0.83       881
           1       0.61      0.08      0.15       374

    accuracy                           0.71      1255
   macro avg       0.66      0.53      0.49      1255
weighted avg       0.68      0.71      0.62      1255



# Class 0 Performance (Majority Class)
Interpretation:

Recall = 0.98 → The model correctly finds 98% of class 0 examples.

Precision = 0.72 → When it predicts class 0, it is right 72% of the time.

F1 = 0.83 → Strong performance.

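# Class 1 Performance (Minority Class)
Interpretation:

Recall = 0.08 → It correctly identifies only 8% of class 1 cases.

Precision = 0.61 → When it predicts class 1, it's correct 61% of the time.

F1 = 0.15 → Very weak performance.
The model is failing badly on class 1: it predicts class 0 for almost every book, so the 0.71 accuracy is barely above the 0.70 share of class 0 in the test set (881 of 1255).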

In [14]:
# Linear SVM on the same averaged-embedding features.
# random_state is fixed (same seed as the train/test split) because the dual
# solver shuffles the data while fitting; without a seed, repeated runs can
# give slightly different models.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

# Score on the held-out rows: overall accuracy plus per-class metrics.
pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))




SVM Accuracy: 0.7067729083665338
              precision    recall  f1-score   support

           0       0.71      0.97      0.82       881
           1       0.56      0.08      0.14       374

    accuracy                           0.71      1255
   macro avg       0.64      0.53      0.48      1255
weighted avg       0.67      0.71      0.62      1255



In [None]:
# 5-fold cross-validated accuracy of the logistic-regression estimator over
# the full feature matrix; the mean over the folds is reported.
scores = cross_val_score(estimator=lr, X=X, y=y, scoring="accuracy", cv=5)
print("LR Cross-validation accuracy:", np.mean(scores))


LR Cross-validation accuracy: 0.7048132827541509


In [None]:
# Candidate values for the regularisation parameter C.
param_grid = {"C": [0.1, 1, 5, 10]}

# Search the grid with 5-fold cross-validation on the training portion only.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=param_grid,
    cv=5,
)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


Best params: {'C': 10}
Best score: 0.7132900104467474


In [None]:
def predict_genre(text, model):
    """Classify a raw description as "Fiction" or "Not Fiction".

    ``text`` goes through the same cleaning and embedding-averaging steps as
    the training data, and the resulting vector is passed to ``model`` as a
    one-sample batch. A predicted label of 1 maps to "Fiction"; any other
    label maps to "Not Fiction".
    """
    features = document_vector(clean_text(text))
    label = model.predict([features])[0]
    if label == 1:
        return "Fiction"
    return "Not Fiction"

# Quick check of the helper on a hand-written blurb, using the linear SVM.
sample = "A young boy discovers a magical world full of adventures."
print(predict_genre(text=sample, model=svm))


Fiction
