In [None]:
!pip install nltk gensim scikit-learn


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import nltk

# Fetch the NLTK resources requested by this notebook: the punkt tokenizer
# data (two packages), the stop-word lists and WordNet.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Kept as the final bare expression so the cell still displays its result.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import FastText

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Load the book metadata, keep only rows that have both a description and
# a genres value, and renumber the index after the drop.
df = (
    pd.read_csv("book_details.csv")
    .dropna(subset=["description", "genres"])
    .reset_index(drop=True)
)

def parse_genres(g):
    """Parse the string form of a genre list into a Python list.

    Parameters
    ----------
    g : str
        Raw cell value, expected to hold a Python literal such as
        "['Fiction', 'Classics']".

    Returns
    -------
    list
        The parsed genres. A list or tuple literal is returned as a list,
        a lone string literal is wrapped in a one-element list, and any
        value that cannot be parsed (or parses to another type) gives [].
    """
    try:
        parsed = literal_eval(g)
    except (ValueError, SyntaxError, TypeError):
        # Swallow only the errors a malformed literal raises; a bare
        # `except` would also hide KeyboardInterrupt and unrelated bugs.
        return []

    if isinstance(parsed, str):
        # Indexing a bare string downstream would yield its first character.
        return [parsed]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # Numbers, dicts, sets, ... are not usable as an ordered genre list.
    return []

df["genres_parsed"] = df["genres"].apply(parse_genres)

# The first listed genre is treated as the primary one; rows whose parsed
# list is empty fall back to the "Unknown" placeholder.
primary = [
    genres[0] if len(genres) > 0 else "Unknown"
    for genres in df["genres_parsed"]
]
df["primary_genre"] = primary

# Show the ten most frequent primary genres.
print(df["primary_genre"].value_counts().head(10))


primary_genre
Fiction               1308
Nonfiction             677
Fantasy                610
Classics               453
Historical Fiction     328
Young Adult            287
Mystery                235
Science Fiction        219
Romance                172
Unknown                155
Name: count, dtype: int64


In [None]:
# Binary target: 1 when the primary genre contains the substring "Fiction"
# (case-sensitive), otherwise 0.
# NOTE(review): genres such as "Fantasy" or "Mystery", and the "Unknown"
# placeholder, therefore land in class 0 - confirm this is the intended
# definition of the negative class.
df["label"] = [int("Fiction" in genre) for genre in df["primary_genre"]]
print(df["label"].value_counts())


label
0    4404
1    1870
Name: count, dtype: int64


In [None]:
stop_words = set(stopwords.words("english"))
lemm = WordNetLemmatizer()


def clean_text(text):
    """Turn raw text into a list of lemmatized tokens.

    The text is lower-cased, spans starting with "http" or "www" are
    replaced by a space, and every character other than a-z or whitespace
    is replaced by a space. The result is tokenized with
    ``nltk.word_tokenize``; stop words and tokens of two characters or
    fewer are dropped, and the remaining tokens are lemmatized.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+", " ", lowered)
    letters_only = re.sub(r"[^a-z\s]", " ", without_urls)

    cleaned = []
    for word in nltk.word_tokenize(letters_only):
        # Skip stop words and very short tokens before lemmatizing.
        if word in stop_words or len(word) <= 2:
            continue
        cleaned.append(lemm.lemmatize(word))
    return cleaned


In [None]:
# Tokenize every description with clean_text, then preview the first rows.
df["tokens"] = df["description"].map(clean_text)
df["tokens"].head()


Unnamed: 0,tokens
0,"[unforgettable, novel, childhood, sleepy, sout..."
1,"[harry, potter, think, ordinary, boy, rescued,..."
2,"[alternate, cover, edition, isbn, since, immed..."
3,"[discovered, attic, spent, last, year, life, a..."
4,"[librarian, note, alternate, cover, edition, e..."


In [None]:
# One token list per book description forms the training corpus.
sentences = list(df["tokens"])

# NOTE(review): gensim documents that a fully reproducible run needs a
# single worker thread; with workers=4 the learned vectors (and every
# downstream metric) may differ slightly between runs.
ft_model = FastText(
    sentences=sentences,
    sg=1,  # 1 = Skip-gram FastText
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the trained embeddings next to the notebook.
ft_model.save("fasttext_skipgram.model")

print("Vocabulary Size:", len(ft_model.wv))


Vocabulary Size: 41120


In [None]:
def document_vector(tokens, wv=None):
    """Average the word vectors of ``tokens`` into one document vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    wv : optional
        Word-vector lookup supporting ``word in wv``, ``wv[word]`` and a
        ``vector_size`` attribute. Defaults to ``ft_model.wv``, resolved
        when the function is called.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``wv``.
        If none is found, a zero vector of length ``wv.vector_size``.
    """
    if wv is None:
        wv = ft_model.wv

    vectors = [wv[word] for word in tokens if word in wv]
    if not vectors:
        # Take the dimension from the vectors themselves instead of a
        # hard-coded 100, so the fallback always matches the model.
        return np.zeros(wv.vector_size)
    return np.mean(vectors, axis=0)

# One averaged embedding per document, stacked row-wise into the feature
# matrix; the binary label column is the target.
X_vectors = df["tokens"].map(document_vector)

X = np.vstack(X_vectors.to_numpy())
y = df["label"].to_numpy()


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit logistic regression on the training split (fit returns the estimator
# itself) and score it on the held-out split.
lr = LogisticRegression(max_iter=3000).fit(X_train, y_train)
pred_lr = lr.predict(X_test)

print(
    "\nLogistic Regression (FastText) Accuracy:",
    accuracy_score(y_test, pred_lr),
)
print(classification_report(y_test, pred_lr))



Logistic Regression (FastText) Accuracy: 0.7163346613545817
              precision    recall  f1-score   support

           0       0.73      0.95      0.83       881
           1       0.59      0.16      0.25       374

    accuracy                           0.72      1255
   macro avg       0.66      0.55      0.54      1255
weighted avg       0.69      0.72      0.65      1255



In [None]:
# Fit a linear SVM with default settings on the training split (fit returns
# the estimator itself) and score it on the held-out split.
svm = LinearSVC().fit(X_train, y_train)
pred_svm = svm.predict(X_test)

print(
    "\nLinear SVM (FastText) Accuracy:",
    accuracy_score(y_test, pred_svm),
)
print(classification_report(y_test, pred_svm))



Linear SVM (FastText) Accuracy: 0.7211155378486056
              precision    recall  f1-score   support

           0       0.73      0.95      0.83       881
           1       0.61      0.18      0.28       374

    accuracy                           0.72      1255
   macro avg       0.67      0.57      0.55      1255
weighted avg       0.69      0.72      0.66      1255



In [None]:
# 5-fold cross-validated accuracy of the logistic-regression estimator on
# the full feature matrix; the mean over folds is reported.
scores = cross_val_score(lr, X, y, scoring="accuracy", cv=5)
print("Cross-Validation Accuracy (LR + FastText):", np.mean(scores))


Cross-Validation Accuracy (LR + FastText): 0.7172464845561931


In [None]:
def predict_fiction(text, model=svm):
    """Classify a piece of text as "Fiction" or "Not Fiction".

    The text is tokenized with ``clean_text``, embedded with
    ``document_vector`` and passed to ``model.predict`` as a one-row batch.

    Parameters
    ----------
    text : str
        Text to classify.
    model : optional
        Fitted classifier exposing ``predict``. Defaults to the ``svm``
        object that exists when this function is defined.

    Returns
    -------
    str
        "Fiction" when the predicted label equals 1, else "Not Fiction".
    """
    embedding = document_vector(clean_text(text))
    predicted_label = model.predict([embedding])[0]
    if predicted_label == 1:
        return "Fiction"
    return "Not Fiction"


# Quick smoke test on a hand-written blurb.
sample_text = "A magical adventure of a young wizard discovering a hidden world."
print(predict_fiction(sample_text))


Fiction
