In [1]:
import re
import string
import numpy as np
import unicodedata
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import json

# Define cleaning and standardization function

In [2]:
def clean_string(cell):
    """
    Clean a single string cell by applying these rules:
      1. Strip trailing/leading whitespace.
      2. Normalize accented characters to non-accented equivalents.
      3. Remove punctuation.
      4. Remove any special characters (non-alphanumeric, non-space).
      5. If the cleaned cell consists solely of digits or a combination of the characters '_', '.', '*',
         return np.nan.
      6. If the cell contains an address (assumed to be comma-separated) followed by a city/country,
         keep only the last token.
      8. Remove any single (') or double (") quotes.
      9. Remove any backslashes (\).
    """
    if cell is None or (isinstance(cell, float) and np.isnan(cell)):
        return cell
    if not isinstance(cell, str):
        cell = str(cell)
    
    # 1. Strip whitespace.
    cell = cell.strip()
    
    # 2. Normalize accented characters to ASCII.
    cell = unicodedata.normalize('NFKD', cell).encode('ascii', 'ignore').decode('ascii')
    
    # 8 & 9. Remove quotes and backslashes.
    cell = cell.replace('"', '').replace("'", "").replace("\\", "")
    
    # 4 & 5. If the cell contains only digits or the symbols _ . *, set it to np.nan.
    if re.fullmatch(r'[\d_.*]+', cell):
        return np.nan
    
    # 3. Remove punctuation.
    translator = str.maketrans('', '', string.punctuation)
    cell = cell.translate(translator)
    
    # 4. Remove any remaining special characters (anything not alphanumeric or whitespace).
    cell = re.sub(r'[^A-Za-z0-9\s]', '', cell)
    
    # 6. If the cell appears to contain an address (comma-separated), keep only the last token.
    if ',' in cell:
        cell = cell.split(',')[-1].strip()
    
    if cell == "":
        return np.nan
    
    return cell

def fingerprint(s):
    """
    Create a canonical fingerprint for a string by:
      - Lower-casing
      - Splitting into tokens
      - Sorting the tokens alphabetically
      - Joining them back together
    This groups strings that have the same set of words in a different order.
    """
    tokens = s.lower().split()
    tokens.sort()
    return " ".join(tokens)

def standardize_series_fast(series):
    """
    Standardize a cleaned pandas Series of strings using fingerprint grouping.
    
    For each unique cleaned value, compute a fingerprint. Then group values by fingerprint and
    assign a canonical value for each group (the most frequent one in the original series).
    
    This approach is much faster than pairwise fuzzy matching for a large number of unique values.
    """
    # Build a mapping from fingerprint to list of original values
    fp_to_values = {}
    for val in series.dropna().unique():
        fp = fingerprint(val)
        fp_to_values.setdefault(fp, []).append(val)
    
    # Frequency count for each cleaned value
    freq = series.value_counts().to_dict()
    
    # For each fingerprint group, choose the canonical value (most frequent)
    canonical_map = {}
    for fp, values in fp_to_values.items():
        canonical = max(values, key=lambda x: freq.get(x, 0))
        for v in values:
            canonical_map[v] = canonical
    
    # Map each value in the series to its canonical value
    return series.map(lambda x: canonical_map.get(x, x))

def clean_and_standardize_series(series):
    """
    Clean and standardize a pandas Series of strings using a fingerprint-based grouping approach.
    
    Parameters:
      series (pd.Series): Input string column.
      
    Returns:
      pd.Series: Cleaned and standardized series.
    """
    # Clean the series with a progress bar
    cleaned = series.progress_apply(clean_string)
    # Standardize using the fast fingerprint-based method
    standardized = standardize_series_fast(cleaned)
    return standardized

  """


# Define primary key creation function

In [3]:
def create_primary_key(df, title_col, author_col):
    """
    Create a primary key for each book by combining title and author.
    
    The primary key is created by:
    1. Cleaning and standardizing both title and author 
    2. Converting them to lowercase
    3. Removing all whitespace
    4. Concatenating them with an underscore between them
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing book data
    title_col : str
        Name of the column containing book titles
    author_col : str
        Name of the column containing author names
        
    Returns:
    --------
    pandas.Series
        A series containing the primary key for each book
    
    Example:
    --------
    >>> create_primary_key(df, 'title', 'author').iloc[0]
    'thecountofmontecristo_alexandredumas'
    """
    # Clean and standardize title and author
    title = clean_and_standardize_series(df[title_col])
    author = clean_and_standardize_series(df[author_col])
    
    # Convert to lowercase and remove whitespace
    title = title.str.lower().str.replace(r'\s+', '', regex=True)
    author = author.str.lower().str.replace(r'\s+', '', regex=True)
    
    # Create the primary key by concatenating title and author with an underscore
    return title + '_' + author

# Books.csv

## Loading Books.csv

In [4]:
books = pd.read_csv('data/Books.csv').drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)
books = books.rename(columns={'Book-Title': 'title', 'Book-Author': 'author', 'Year-Of-Publication': 'publish_year', 'Publisher': 'publisher'})
# Drop 50 rows with missing values in the columns of interest
books = books.dropna(subset=['title', 'author', 'publish_year', 'publisher'])
books.head()

  books = pd.read_csv('data/Books.csv').drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)


Unnamed: 0,ISBN,title,author,publish_year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


## Loading summaries.txt

Load book summaries too because it includes genres

In [5]:
# Load book summaries too because it includes genres
book_summaries = pd.read_csv(
    'data/booksummaries.txt', 
    sep='\t', 
    header=None, 
    names=['Wikipedia article ID', 'Freebase ID', 'Book title', 'Author', 'Publication date', 'Book genres (Freebase ID:name tuples)', 'Plot summary']
    )

# Drop unnecessary columns
book_summaries = book_summaries.drop(['Wikipedia article ID', 'Freebase ID', 'Publication date'], axis=1)

# Drop rows with missing authors or titles (unfortunately, this removes around 15% of the data)
book_summaries = book_summaries.dropna(subset=['Book title', 'Author'])

# Rename columns
book_summaries.rename(columns={'Book title': 'title', 'Author': 'author', 'Book genres (Freebase ID:name tuples)': 'genres', 'Plot summary': 'summary'}, inplace=True)

# Dissect the genres column into a list of genres
book_summaries['genres'] = book_summaries['genres'].apply(lambda x: list(json.loads(x).values()) if type(x) == str else [])

# Around 85% of books have 3 genres or less, so let's keep only the top 3 genres
book_summaries['genre_primary'] = book_summaries['genres'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
book_summaries['genre_secondary'] = book_summaries['genres'].apply(lambda x: x[1] if len(x) > 1 else np.nan)
book_summaries['genre_tertiary'] = book_summaries['genres'].apply(lambda x: x[2] if len(x) > 2 else np.nan)
book_summaries = book_summaries.drop('genres', axis=1)

book_summaries.head()

Unnamed: 0,title,author,summary,genre_primary,genre_secondary,genre_tertiary
0,Animal Farm,George Orwell,"Old Major, the old boar on the Manor Farm, ca...",Roman à clef,Satire,Children's literature
1,A Clockwork Orange,Anthony Burgess,"Alex, a teenager living in near-future Englan...",Science Fiction,Novella,Speculative fiction
2,The Plague,Albert Camus,The text of The Plague is divided into five p...,Existentialism,Fiction,Absurdist fiction
3,An Enquiry Concerning Human Understanding,David Hume,The argument of the Enquiry proceeds by a ser...,,,
4,A Fire Upon the Deep,Vernor Vinge,The novel posits that space around the Milky ...,Hard science fiction,Science Fiction,Speculative fiction


## Impute missing genres in a smart way

This approach uses three separate classifiers for primary, secondary, and tertiary genres. We create combined_text by concatenating title, author, and summary. This way, the embedding will capture context from all three attributes for classification.

In [6]:
import re
import string
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

# -------------------- Utility Functions -------------------- #
def clean_text(text):
    """
    Clean and preprocess text:
      - Lowercase
      - Remove punctuation
      - Remove extra whitespace
    """
    if isinstance(text, str):
        text = text.lower()
        text = ''.join([char for char in text if char not in string.punctuation])
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    return ''

def get_embedding(model, text):
    """Use a SentenceTransformer to encode a single text into an embedding."""
    return model.encode(text)

# -------------------- Main Imputation Logic -------------------- #
def impute_genres_with_transformer(book_summaries):
    # Print missing genre counts before imputation
    print(
        f"Missing genre counts before imputation:\n"
        f"  Primary:   {book_summaries['genre_primary'].isna().sum()}\n"
        f"  Secondary: {book_summaries['genre_secondary'].isna().sum()}\n"
        f"  Tertiary:  {book_summaries['genre_tertiary'].isna().sum()}\n"
    )

    # Combine title, author, and summary into a single text column
    # (so the classifier can consider all three)
    book_summaries['combined_text'] = (
        book_summaries[['title', 'author', 'summary']]
        .fillna('')
        .agg(' '.join, axis=1)
    )

    # Clean the combined text
    book_summaries['clean_text'] = book_summaries['combined_text'].apply(clean_text)

    # Initialize a pre-trained SentenceTransformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Helper function to train a classifier and predict missing labels
    def train_and_impute_genre(genre_col, df):
        """
        Trains a LogisticRegression model to predict the given `genre_col`.
        The training is done on rows where `genre_col` is not missing.
        Then it imputes predictions for rows where `genre_col` is missing.
        """
        # Train set: rows that have a known genre
        train_mask = df[genre_col].notna()
        if not train_mask.any():
            print(f"No known values for {genre_col}, cannot train a model.")
            return df

        X_train_texts = df.loc[train_mask, 'clean_text'].tolist()
        y_train = df.loc[train_mask, genre_col]

        # Generate embeddings for training
        X_train_emb = model.encode(X_train_texts, show_progress_bar=False)

        # Train a simple classifier (Logistic Regression)
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train_emb, y_train)

        # Predict on missing rows
        predict_mask = df[genre_col].isna()
        if predict_mask.any():
            X_pred_texts = df.loc[predict_mask, 'clean_text'].tolist()
            X_pred_emb = model.encode(X_pred_texts, show_progress_bar=False)
            y_pred = clf.predict(X_pred_emb)
            df.loc[predict_mask, genre_col] = y_pred

        return df

    # Impute each genre column with a separate classifier
    for col in ['genre_primary', 'genre_secondary', 'genre_tertiary']:
        book_summaries = train_and_impute_genre(col, book_summaries)

    # Drop helper columns no longer needed
    book_summaries.drop(['combined_text', 'clean_text'], axis=1, inplace=True)

    # Print missing genre counts after imputation
    print(
        f"Missing genre counts after imputation:\n"
        f"  Primary:   {book_summaries['genre_primary'].isna().sum()}\n"
        f"  Secondary: {book_summaries['genre_secondary'].isna().sum()}\n"
        f"  Tertiary:  {book_summaries['genre_tertiary'].isna().sum()}\n"
    )

    return book_summaries

# -------------------- Usage Example -------------------- #
# Assuming you already have a DataFrame `book_summaries` with columns:
#   ['title', 'author', 'summary', 'genre_primary', 'genre_secondary', 'genre_tertiary']

# book_summaries = pd.read_csv("your_book_summaries.csv")  # example loading
book_summaries = impute_genres_with_transformer(book_summaries)

# Now, `book_summaries` has all its genre columns imputed via a transformer-based approach.
book_summaries.head()


Missing genre counts before imputation:
  Primary:   2122
  Secondary: 6112
  Tertiary:  9484



  attn_output = torch.nn.functional.scaled_dot_product_attention(


Missing genre counts after imputation:
  Primary:   0
  Secondary: 0
  Tertiary:  0



Unnamed: 0,title,author,summary,genre_primary,genre_secondary,genre_tertiary
0,Animal Farm,George Orwell,"Old Major, the old boar on the Manor Farm, ca...",Roman à clef,Satire,Children's literature
1,A Clockwork Orange,Anthony Burgess,"Alex, a teenager living in near-future Englan...",Science Fiction,Novella,Speculative fiction
2,The Plague,Albert Camus,The text of The Plague is divided into five p...,Existentialism,Fiction,Absurdist fiction
3,An Enquiry Concerning Human Understanding,David Hume,The argument of the Enquiry proceeds by a ser...,Non-fiction,Fiction,Novel
4,A Fire Upon the Deep,Vernor Vinge,The novel posits that space around the Milky ...,Hard science fiction,Science Fiction,Speculative fiction


In [7]:
# Drop summaries since we don't need them anymore
book_summaries = book_summaries.drop('summary', axis=1)

## Cleaning & Formatting

In [8]:
# Clean both title and author columns in books and book_summaries so that they can be used for matching
print("-"*50)
print('Cleaning books df')
print("-"*50)
print("Cleaning and standardizing titles")
books['title'] = clean_and_standardize_series(books['title'])
print("Cleaning and standardizing authors")
books['author'] = clean_and_standardize_series(books['author'])
print("Cleaning and standardizing publishers")
books['publisher'] = clean_and_standardize_series(books['publisher'])

print("-"*50)
print('Cleaning book_summaries df')
print("-"*50)
print("Cleaning and standardizing titles")
book_summaries['title'] = clean_and_standardize_series(book_summaries['title'])
print("Cleaning and standardizing authors")
book_summaries['author'] = clean_and_standardize_series(book_summaries['author'])
print("Cleaning and standardizing genres")
book_summaries['genre_primary'] = clean_and_standardize_series(book_summaries['genre_primary'])
book_summaries['genre_secondary'] = clean_and_standardize_series(book_summaries['genre_secondary'])
book_summaries['genre_tertiary'] = clean_and_standardize_series(book_summaries['genre_tertiary'])

--------------------------------------------------
Cleaning books df
--------------------------------------------------
Cleaning and standardizing titles


100%|██████████| 271356/271356 [00:01<00:00, 217779.28it/s]


Cleaning and standardizing authors


100%|██████████| 271356/271356 [00:00<00:00, 274507.70it/s]


Cleaning and standardizing publishers


100%|██████████| 271356/271356 [00:01<00:00, 265972.69it/s]


--------------------------------------------------
Cleaning book_summaries df
--------------------------------------------------
Cleaning and standardizing titles


100%|██████████| 14177/14177 [00:00<00:00, 251295.93it/s]


Cleaning and standardizing authors


100%|██████████| 14177/14177 [00:00<00:00, 268000.67it/s]


Cleaning and standardizing genres


100%|██████████| 14177/14177 [00:00<00:00, 274204.64it/s]
100%|██████████| 14177/14177 [00:00<00:00, 280126.10it/s]
100%|██████████| 14177/14177 [00:00<00:00, 294424.93it/s]


In [9]:
# Drop any newly created missing values
books = books.dropna()
book_summaries = book_summaries.dropna()

In [10]:
books.head()

Unnamed: 0,ISBN,title,author,publish_year,publisher
0,195153448,Classical Mythology,Mark P O Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo DEste,1991,HarperPerennial
3,374157065,Flu The Story of the Great Influenza Pandemic ...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E J W Barber,1999,W W Norton amp Company


In [11]:
book_summaries.head()

Unnamed: 0,title,author,genre_primary,genre_secondary,genre_tertiary
0,Animal Farm,George Orwell,Roman a clef,Satire,Childrens literature
1,A Clockwork Orange,Anthony Burgess,Science Fiction,Novella,Speculative fiction
2,The Plague,Albert Camus,Existentialism,Fiction,Absurdist fiction
3,An Enquiry Concerning Human Understanding,David Hume,Nonfiction,Fiction,Novel
4,A Fire Upon the Deep,Vernor Vinge,Hard science fiction,Science Fiction,Speculative fiction


## Primary key creation

In [12]:
books['book_pk'] = create_primary_key(books, 'title', 'author')
book_summaries['book_pk'] = create_primary_key(book_summaries, 'title', 'author')

100%|██████████| 271304/271304 [00:01<00:00, 221020.14it/s]
100%|██████████| 271304/271304 [00:01<00:00, 261454.72it/s]
100%|██████████| 14166/14166 [00:00<00:00, 239813.81it/s]
100%|██████████| 14166/14166 [00:00<00:00, 266551.72it/s]


In [13]:
books.head()

Unnamed: 0,ISBN,title,author,publish_year,publisher,book_pk
0,195153448,Classical Mythology,Mark P O Morford,2002,Oxford University Press,classicalmythology_markpomorford
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,claracallan_richardbrucewright
2,60973129,Decision in Normandy,Carlo DEste,1991,HarperPerennial,decisioninnormandy_carlodeste
3,374157065,Flu The Story of the Great Influenza Pandemic ...,Gina Bari Kolata,1999,Farrar Straus Giroux,fluthestoryofthegreatinfluenzapandemicof1918an...
4,393045218,The Mummies of Urumchi,E J W Barber,1999,W W Norton amp Company,themummiesofurumchi_ejwbarber


In [14]:
book_summaries.head()

Unnamed: 0,title,author,genre_primary,genre_secondary,genre_tertiary,book_pk
0,Animal Farm,George Orwell,Roman a clef,Satire,Childrens literature,animalfarm_georgeorwell
1,A Clockwork Orange,Anthony Burgess,Science Fiction,Novella,Speculative fiction,aclockworkorange_anthonyburgess
2,The Plague,Albert Camus,Existentialism,Fiction,Absurdist fiction,theplague_albertcamus
3,An Enquiry Concerning Human Understanding,David Hume,Nonfiction,Fiction,Novel,anenquiryconcerninghumanunderstanding_davidhume
4,A Fire Upon the Deep,Vernor Vinge,Hard science fiction,Science Fiction,Speculative fiction,afireuponthedeep_vernorvinge


## Adding genres to books

In [15]:
# Join books with genres in book_summaries
books = books.merge(book_summaries[['book_pk', 'genre_primary', 'genre_secondary', 'genre_tertiary']], on='book_pk', how='left')

In [19]:
books.notna().sum()

ISBN               271316
title              271316
author             271316
publish_year       271316
publisher          271316
book_pk            271312
genre_primary        9746
genre_secondary      9746
genre_tertiary       9746
dtype: int64

## Dropping NAs

In [23]:
books.dropna()

Unnamed: 0,ISBN,title,author,publish_year,publisher,book_pk,genre_primary,genre_secondary,genre_tertiary
5,0399135782,The Kitchen Gods Wife,Amy Tan,1991,Putnam Pub Group,thekitchengodswife_amytan,Fiction,Fiction,Fiction
18,0440234743,The Testament,John Grisham,1999,Dell,thetestament_johngrisham,Thriller,Fiction,Suspense
27,0345402871,Airframe,Michael Crichton,1997,Ballantine Books,airframe_michaelcrichton,Science Fiction,Novel,Novel
28,0345417623,Timeline,Michael Crichton,2000,Ballantine Books,timeline_michaelcrichton,Science Fiction,Fiction,Suspense
29,0684823802,OUT OF THE SILENT PLANET,CS Lewis,1996,Scribner,outofthesilentplanet_cslewis,Science Fiction,Speculative fiction,Fiction
...,...,...,...,...,...,...,...,...,...
271033,0670626511,The Sea the Sea,Iris Murdoch,1978,Viking Press,theseathesea_irismurdoch,Fiction,Novel,Novel
271083,0425028615,The Godmakers,Frank Herbert,1975,Berkley Publishing Group,thegodmakers_frankherbert,Science Fiction,Speculative fiction,Fiction
271090,0451520521,Babbitt,Sinclair Lewis,1982,Signet Book,babbitt_sinclairlewis,Satire,Fiction,Novel
271178,1561569119,Robinson Crusoe,Daniel Defoe,1998,Kidsbooks,robinsoncrusoe_danieldefoe,Speculative fiction,Childrens literature,Fiction


## Final Books.csv