In [1]:
! pip install nltk gensim scikit-learn




In [2]:
import nltk
# Fetch the NLTK data packages used further down. Re-running is harmless:
# the recorded output shows each package reported as already up-to-date.
nltk.download('punkt')      # tokenizer data (nltk.word_tokenize is called in clean_text)
nltk.download('punkt_tab')  # presumably the tabular punkt data newer NLTK releases look up — TODO confirm
nltk.download('stopwords')  # backs stopwords.words("english")
nltk.download('wordnet')    # backs WordNetLemmatizer; this call's return value is the cell's displayed output


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


In [4]:
# Read the raw book table, discard rows lacking a description or a genre
# string (both are required downstream), and renumber the index from 0.
df = (
    pd.read_csv("book_details.csv")
    .dropna(subset=["description", "genres"])
    .reset_index(drop=True)
)

# Convert string list → python list
def parse_genres(g):
    """Convert the string form of a genre list into a real Python list.

    Parameters
    ----------
    g : str
        Text such as ``"['Fiction', 'Classics']"``.

    Returns
    -------
    list
        The parsed genres. An empty list is returned when ``g`` cannot be
        parsed as a Python literal, or when it parses to something that is
        not a list/tuple (e.g. a bare string or number), so callers can
        always use ``len(...)`` and ``[0]`` on the result safely.
    """
    try:
        parsed = literal_eval(g)
    # Only the errors literal_eval raises for bad input are swallowed; a bare
    # `except:` would also hide KeyboardInterrupt / SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # A bare string literal would otherwise be indexed character by character
    # by the caller; a number would break len().
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

df["genres_parsed"] = df["genres"].map(parse_genres)

# Single-label setup: keep only the first listed genre of each book and fall
# back to the placeholder "Unknown" when the parsed list is empty.
primary_genres = [
    genre_list[0] if len(genre_list) > 0 else "Unknown"
    for genre_list in df["genres_parsed"]
]

df["primary_genre"] = primary_genres

# Ten most frequent primary genres.
print(df["primary_genre"].value_counts().head(10))


primary_genre
Fiction               1308
Nonfiction             677
Fantasy                610
Classics               453
Historical Fiction     328
Young Adult            287
Mystery                235
Science Fiction        219
Romance                172
Unknown                155
Name: count, dtype: int64


In [5]:
# Binary target: 1 when the primary genre's name contains the substring
# "Fiction" (case-sensitive) — e.g. "Fiction", "Historical Fiction",
# "Science Fiction" — and 0 otherwise. Note that "Nonfiction", "Fantasy" and
# the "Unknown" placeholder therefore all fall into class 0.
df["label"] = df["primary_genre"].map(lambda genre: int("Fiction" in genre))

print(df["label"].value_counts())


label
0    4404
1    1870
Name: count, dtype: int64


In [6]:
# English stop words held in a set; clean_text tests every token against it.
stop_words = set(stopwords.words("english"))
# Single lemmatizer instance shared by all clean_text calls.
lemm = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw description and return its lemmatized tokens.

    Steps, in order: lowercase the text; blank out URLs (``http...`` or
    ``www...``); blank out every character that is not ``a``-``z`` or
    whitespace; tokenize with NLTK; discard stop words and tokens of two
    characters or fewer; lemmatize the survivors.

    Returns
    -------
    list of str
        A token list (not a joined string), which is the form Word2Vec
        consumes.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+", " ", lowered)
    letters_only = re.sub(r"[^a-z\s]", " ", without_urls)

    kept = []
    for word in nltk.word_tokenize(letters_only):
        # Skip stop words and very short tokens before lemmatizing.
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(lemm.lemmatize(word))
    return kept


In [7]:
# Tokenize every description; each entry of the new column is a token list.
df["tokens"] = df["description"].map(clean_text)
# Preview the first few token lists.
df["tokens"].head()


0    [unforgettable, novel, childhood, sleepy, sout...
1    [harry, potter, think, ordinary, boy, rescued,...
2    [alternate, cover, edition, isbn, since, immed...
3    [discovered, attic, spent, last, year, life, a...
4    [librarian, note, alternate, cover, edition, e...
Name: tokens, dtype: object

In [8]:
# One training "sentence" per book: the token list of its description.
sentences = list(df["tokens"])

# Skip-gram (sg=1) embeddings with 100 dimensions and a context window of 5.
# min_count=1 keeps every token that occurs at least once in the vocabulary.
# NOTE(review): training uses 4 worker threads; gensim documents that
# multi-worker runs are not bit-for-bit repeatable — confirm whether that
# matters for this analysis.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

# Persist the trained model next to the notebook, then report vocabulary size.
w2v_model.save("skipgram_w2v.model")
print("Vocabulary Size:", len(w2v_model.wv))


Vocabulary Size: 41120


In [9]:
def document_vector(tokens):
    """Embed a tokenized document as the mean of its word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document. Tokens absent from the Word2Vec vocabulary
        are ignored.

    Returns
    -------
    numpy.ndarray
        1-D array whose length equals the embedding size of ``w2v_model``.
        When no token is in the vocabulary (including an empty document) a
        zero vector of that same length is returned.
    """
    keyed_vectors = w2v_model.wv
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vectors:
        # Take the width from the model instead of hard-coding 100, so the
        # fallback stays stackable with real vectors if vector_size changes.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

# One averaged embedding per book.
X_vectors = df["tokens"].map(document_vector)

# Stack the per-document vectors row-wise into the feature matrix and pull
# the binary labels out as a plain array.
X = np.vstack(X_vectors.to_numpy())
y = df["label"].to_numpy()


In [10]:
# Hold out 20% of the rows as a test set. stratify=y asks for the class
# proportions of y to be kept in both parts, and random_state=42 fixes the
# shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [11]:
# Logistic-regression baseline on the averaged skip-gram features; the
# iteration cap is raised to 3000.
lr = LogisticRegression(max_iter=3000).fit(X_train, y_train)
pred_lr = lr.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 table.
lr_accuracy = accuracy_score(y_test, pred_lr)
print("\nLogistic Regression (Skip-Gram) Accuracy:", lr_accuracy)
print(classification_report(y_test, pred_lr))



Logistic Regression (Skip-Gram) Accuracy: 0.7195219123505976
              precision    recall  f1-score   support

           0       0.73      0.95      0.83       881
           1       0.60      0.17      0.27       374

    accuracy                           0.72      1255
   macro avg       0.67      0.56      0.55      1255
weighted avg       0.69      0.72      0.66      1255



In [12]:
# Linear SVM with default settings, trained on the same split as the
# logistic-regression model so the two reports are directly comparable.
svm = LinearSVC().fit(X_train, y_train)
pred_svm = svm.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 table.
svm_accuracy = accuracy_score(y_test, pred_svm)
print("\nLinear SVM (Skip-Gram) Accuracy:", svm_accuracy)
print(classification_report(y_test, pred_svm))



Linear SVM (Skip-Gram) Accuracy: 0.7282868525896414
              precision    recall  f1-score   support

           0       0.74      0.95      0.83       881
           1       0.63      0.21      0.31       374

    accuracy                           0.73      1255
   macro avg       0.69      0.58      0.57      1255
weighted avg       0.71      0.73      0.68      1255





In [13]:
# 5-fold cross-validated accuracy of the logistic-regression model over the
# full feature matrix; the mean across folds is reported.
scores = cross_val_score(estimator=lr, X=X, y=y, scoring="accuracy", cv=5)
mean_cv_accuracy = scores.mean()
print("Cross-Validation Accuracy (LR + SkipGram):", mean_cv_accuracy)


Cross-Validation Accuracy (LR + SkipGram): 0.7231432801489418


In [15]:
def predict_fiction(text, model=svm):
    """Classify a raw piece of text as "Fiction" or "Not Fiction".

    The text is run through ``clean_text`` and ``document_vector`` and the
    resulting single feature vector is handed to ``model.predict``.

    Parameters
    ----------
    text : str
        Raw, uncleaned text such as a book description.
    model : fitted classifier, optional
        Any object exposing ``predict``; defaults to the ``svm`` object that
        exists when this function is defined.

    Returns
    -------
    str
        "Fiction" when the predicted label equals 1, otherwise "Not Fiction".
    """
    features = document_vector(clean_text(text))
    # predict expects a batch, so wrap the single vector in a list.
    predicted_label = model.predict([features])[0]
    if predicted_label == 1:
        return "Fiction"
    return "Not Fiction"

# Smoke test on a fantasy-style sentence. The recorded output is
# "Not Fiction": the label is 1 only for genres whose name contains
# "Fiction", so books whose primary genre is e.g. "Fantasy" are in class 0.
sample = "A boy discovers a magical kingdom and goes on an epic adventure."
print(predict_fiction(sample))


Not Fiction
