In [None]:
import pandas as pd

# Load the cleaned book catalogue; later cells read its "genre", "isbn" and
# "desc" columns.
books = pd.read_csv(filepath_or_buffer="books_cleaned.csv")

In [None]:
# How often each raw genre string occurs, shown as a two-column table.
genre_counts = books["genre"].value_counts()
genre_counts.reset_index()

In [None]:
# Raw genre strings (exactly as they appear in the "genre" column), grouped by
# the coarse category each one collapses into.
_NONFICTION_GENRES = (
    "Nonfiction",
    "History",
    "Games,Chess",
    "Esoterica,Astrology",
    "History,Nonfiction",
    "Music",
    "Combat,Martial Arts",
    "Crafts,Quilting",
    "Science,Mathematics",
    "Art",
    "Nurses,Nursing",
    "Occult,Tarot",
    "Reference",
    "Alcohol,Wine",
    "Philosophy",
    "Crafts,Origami",
    "Architecture",
    "Nonfiction,History",
    "Travel",
    "Science",
    "Crafts,Sewing",
    "Cultural,Africa",
    "Spirituality",
    "Crafts,Knitting,Art,Crafts,Nonfiction",
    "Social Science,Social Work",
    "Food and Drink,Cookbooks",
    "Gardening,Nonfiction",
    "Couture,Fashion",
    "Aviation",
    "Crafts,Knitting,Nonfiction,Art,Crafts",
    "Gardening",
    "Crafts,Crochet",
    "Biography",
    "Business",
    "Religion",
    "Science,Chemistry",
    "Art,Art,Drawing",
    "Labor",
    "Art,Crafts",
    "Sports,Cycling",
    "Sports,Baseball,Sports,Sports,Nonfiction",
    "Cultural,Iran",
)

_FICTION_GENRES = (
    "Poetry",
    "Fiction",
    "Romance,Romance,African American Romance",
    "Childrens",
    "Romance",
    "Literature,Marathi",
    "Romance,African American Romance",
    "Romance,African American Romance,Romance",
    "Childrens,Picture Books,Childrens",
    "Games,Role Playing Games",
    "Fantasy",
    "Mystery",
    "Childrens,Picture Books",
    "Harlequin,Harlequin Romance",
    "Games,Role Playing Games,Fantasy",
    "Westerns",
    "Westerns,Fiction",
    "Asian Literature,Turkish Literature",
    "Horror",
)

# Lookup table from raw genre string to "Fiction" / "Nonfiction". Genres that
# are not listed here have no entry.
category_mapping = {
    **dict.fromkeys(_NONFICTION_GENRES, "Nonfiction"),
    **dict.fromkeys(_FICTION_GENRES, "Fiction"),
}


In [None]:
# Translate each raw genre through the lookup table; genres without an entry
# come out as NaN.
raw_genres = books["genre"]
books["simple_categories"] = raw_genres.map(category_mapping)

In [None]:
# Accumulators that the prediction loop fills in.
isbns, predicted_cats = [], []

# Rows whose genre had no entry in the mapping, reduced to the two columns the
# classifier step uses and re-indexed from 0.
needs_prediction = books["simple_categories"].isna()
missing_cats = books.loc[needs_prediction, ["isbn", "desc"]].reset_index(drop=True)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Cache of (tokenizer, model, device) keyed by model path, so the weights are
# read from disk at most once per kernel instead of once per call.
_MODEL_CACHE = {}


def _load_classifier(model_path):
    """Return the cached (tokenizer, model, device) triple for ``model_path``, loading it on first use."""
    if model_path not in _MODEL_CACHE:
        print(f"Loading model from {model_path}...")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)

        # Set up the device (GPU or CPU) and switch to inference mode.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        model.eval()
        print(f"Model loaded on: {device}")

        _MODEL_CACHE[model_path] = (tokenizer, model, device)
    return _MODEL_CACHE[model_path]


def predict_fiction_nonfiction(text_list, model_path="./final_model", batch_size=32):
    """
    Classify texts with the fine-tuned sequence-classification model.

    The tokenizer and model are loaded on the first call and reused afterwards,
    and the texts are pushed through the model in batches so that a long list
    does not have to fit into a single forward pass.

    Args:
        text_list (list of str): Texts to classify. A single string is treated
            as a one-element list.
        model_path (str): Directory holding the saved model and tokenizer.
        batch_size (int): Maximum number of texts per forward pass.

    Returns:
        list of str: One label per input text, in input order, taken from
        ``model.config.id2label``. An empty list is returned when
        ``text_list`` is empty or when the model cannot be loaded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # A bare string must not be split into characters by list().
    texts = [text_list] if isinstance(text_list, str) else list(text_list)
    if not texts:
        # Nothing to classify; avoid loading the model at all.
        return []

    try:
        tokenizer, model, device = _load_classifier(model_path)
    except OSError:
        print(f"Error: Could not find a saved model and tokenizer at '{model_path}'.")
        print("Please make sure you have run the training script and the model was saved correctly.")
        return []

    id2label = model.config.id2label
    predicted_labels = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize and move the tensors to the same device as the model.
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        # .tolist() yields plain Python ints for the id2label lookup.
        class_ids = torch.argmax(logits, dim=-1).tolist()
        predicted_labels.extend(id2label[idx] for idx in class_ids)

    return predicted_labels

In [None]:
from tqdm import tqdm

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every prediction.
isbns = []
predicted_cats = []

for isbn, description in tqdm(
    zip(missing_cats["isbn"], missing_cats["desc"]), total=len(missing_cats)
):
    labels = predict_fiction_nonfiction([description])

    # The classifier returns a list (empty when the model could not be loaded).
    # Fail loudly instead of storing something that is not a single label.
    if len(labels) != 1:
        raise RuntimeError(
            f"Expected exactly one predicted label for ISBN {isbn!r}, got {labels!r}; "
            "check that the classifier model loaded correctly."
        )

    # Store the label string itself, not the one-element list wrapping it.
    predicted_cats.append(labels[0])
    isbns.append(isbn)

In [None]:
# Pair every ISBN with the category predicted for it.
missing_predicted_df = pd.DataFrame(
    data={
        "isbn": isbns,
        "predicted_categories": predicted_cats,
    }
)

In [None]:
import numpy as np
# Keep a single prediction per ISBN (the first one). Without this, an ISBN that
# occurs several times among the predicted rows would multiply the matching
# rows of `books` in the left merge below.
predictions_by_isbn = missing_predicted_df.drop_duplicates(subset="isbn")

# validate="many_to_one" makes pandas raise if the right side is not unique on
# the key, so the row count of `books` cannot change silently.
books = pd.merge(
    books,
    predictions_by_isbn,
    on="isbn",
    how="left",
    validate="many_to_one",
)

# Use the prediction only where the genre mapping produced no category, then
# drop the helper column.
books["simple_categories"] = books["simple_categories"].fillna(books["predicted_categories"])
books = books.drop(columns=["predicted_categories"])

In [None]:
# Write the categorised catalogue to disk without the index column.
books.to_csv(path_or_buf="books_with_categories.csv", index=False)