In [None]:
from graphviz import Digraph

# Diagram of the planned feature-engineering flow, drawn with graphviz.
# Each `subgraph` whose name starts with "cluster" is rendered by Graphviz as
# a labelled box around the nodes mentioned inside it.
g = Digraph('G', node_attr={'style': 'filled'})

# --- Text cleaning stage -----------------------------------------------------
with g.subgraph(name='cluster_cleaning') as c:
    c.attr(color='black', label='Cleaning')
    c.edges([
        ('LangDetect', 'Lowercase'),
        ('Lowercase', 'RemovePunctuation'),
        ('RemovePunctuation', 'RemoveNonsenseSummary'),
        ('RemoveNonsenseSummary', 'FilterEmptySummary')
    ])

# --- Text preprocessing stage ------------------------------------------------
with g.subgraph(name='cluster_preprocess') as c:
    c.attr(color='black', label='Preprocessing')
    c.edges([
        ('RemoveStopwords', 'Lemmatize'),
        ('Lemmatize', 'Vectorize')
    ])

# --- Dimensionality reduction of the vectorized text -------------------------
with g.subgraph(name='cluster_dimred') as c:
    c.attr(color='black', label='Dim. Reduction')
    c.edges([
        ('Scaler', 'ToDense'),
        ('ToDense', 'PCA')
    ])

# --- Hand-crafted features derived from the title ----------------------------
# NOTE(review): 'FilterEmptySummary' is also mentioned inside this and the two
# following clusters, so it is declared in several clusters at once; confirm
# the rendered layout places it where intended.
with g.subgraph(name='cluster_title') as c:
    c.attr(color='black', label='Title Features')
    c.edges([
        ('FilterEmptySummary', 'TitleCharCount'),
        ('FilterEmptySummary', 'TitleWordCount'),
        ('FilterEmptySummary', 'TitleStopwordCount'),
        ('TitleCharCount', 'TitleAvgWordLen'),
        ('TitleWordCount', 'TitleAvgWordLen')
    ])

# --- Hand-crafted features derived from the summary --------------------------
with g.subgraph(name='cluster_summ') as c:
    c.attr(color='black', label='Summary Features')
    c.edges([
        ('FilterEmptySummary', 'SummCharCount'),
        ('FilterEmptySummary', 'SummWordCount'),
        ('FilterEmptySummary', 'SummSentCount'),
        ('FilterEmptySummary', 'SummStopwordCount'),
        ('SummCharCount', 'SummAvgWordLen'),
        ('SummWordCount', 'SummAvgWordLen'),
        ('SummWordCount', 'SummAvgSentLen'),
        ('SummSentCount', 'SummAvgSentLen'),
    ])

# --- Features computed on the combined title + summary text ------------------
with g.subgraph(name='cluster_combined') as c:
    c.attr(color='black', label='Combined Text Features')
    c.edges([
        ('FilterEmptySummary', 'UniqueWordCount'),
        ('FilterEmptySummary', 'SentimentScore')
    ])

# Edges that connect one stage to the next.
g.edge('RawData', 'LangDetect')
g.edge('FilterEmptySummary', 'RemoveStopwords')
g.edge('Vectorize', 'Scaler')

# Every node listed here feeds the FeatureUnion node.
feature_list = [
    'PCA', 'TitleCharCount', 'TitleWordCount', 'TitleStopwordCount', 'TitleAvgWordLen',
    'SummCharCount', 'SummWordCount', 'SummSentCount', 'SummStopwordCount', 'SummAvgWordLen',
    'SummAvgSentLen', 'UniqueWordCount', 'SentimentScore'
]

for f in feature_list:
    g.edge(f, 'FeatureUnion')

# --- Second dimensionality reduction, applied after the union ----------------
# The name must begin with "cluster" (it was previously 'clust_finaldimred');
# otherwise Graphviz treats it as a plain subgraph and draws neither the box
# nor the 'Final Dim. Reduction' label.
with g.subgraph(name='cluster_finaldimred') as c:
    c.attr(color='black', label='Final Dim. Reduction')
    c.edges([
        ('FinalScaler', 'FinalToDense'),
        ('FinalToDense', 'FinalPCA')
    ])

g.edge('FeatureUnion', 'FinalScaler')
g.edge('FinalPCA', 'Model')

# Run the graph through `unflatten` with stagger=3 and display the result as
# the cell output.
u = g.unflatten(stagger=3)
u

In [4]:
import pandas as pd
import numpy as np

# Source of the data (Kaggle):
# https://www.kaggle.com/datasets/athu1105/book-genre-prediction

# Load the CSV and, in the same expression, derive a `combined` column holding
# "<title>. <summary>" for every row.
df = pd.read_csv('../data/book_genre_dataset.csv').assign(
    combined=lambda frame: frame['title'] + '. ' + frame['summary']
)

# Features are the three text columns; the target is the genre label.
X = df.loc[:, ['title', 'summary', 'combined']]
y = df.loc[:, 'genre']

# Show (rows, columns) of the feature frame as the cell output.
X.shape

(4657, 3)

In [5]:
from sklearn.base import TransformerMixin, BaseEstimator

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that narrows its input to the configured columns.

    Parameters
    ----------
    variables
        Column selector handed unchanged to ``X.loc[:, ...]`` in `transform`.
    """

    def __init__(self, variables):
        self.variables = variables

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X.loc[:, self.variables]``."""
        wanted = self.variables
        return X.loc[:, wanted]

In [6]:
import langdetect

class LangDetection(BaseEstimator, TransformerMixin):
    """Row filter that keeps only the texts detected as a given language.

    The input is expected to be a pandas Series of strings (the notebook feeds
    it ``X['combined']``). Detection is delegated to ``langdetect.detect`` and
    runs once per element, so this step is slow on large inputs.

    Parameters
    ----------
    lang : str, default 'en'
        Language code, as returned by ``langdetect.detect``, of the rows to keep.

    Notes
    -----
    langdetect is non-deterministic unless ``langdetect.DetectorFactory.seed``
    is set before the first detection; set it once near the import if
    reproducible filtering is required.
    """

    def __init__(self, lang='en'):
        self.lang = lang

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    @staticmethod
    def _detect_or_none(text):
        """Return the detected language code, or None when detection is impossible."""
        # Non-string cells (e.g. NaN from a missing title/summary) cannot be
        # classified; treat them as "no language" instead of crashing.
        if not isinstance(text, str):
            return None
        try:
            return langdetect.detect(text)
        except langdetect.LangDetectException:
            # Raised by langdetect for texts with no usable features
            # (empty strings, digits/punctuation only, ...).
            return None

    def transform(self, X):
        """Return the elements of ``X`` whose detected language equals ``self.lang``.

        The original index labels are preserved. Elements whose language cannot
        be detected are dropped. ``X`` itself is not modified.
        """
        # Keep the detected codes in their own Series rather than writing them
        # into a copy of X under a 'lang' label.
        detected = X.apply(self._detect_or_none)
        return X[detected == self.lang]

In [7]:
class LowercaseTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that lowercases whatever ``X.apply`` hands it."""

    @staticmethod
    def _to_lower(text):
        """Call ``.lower()`` on a single item."""
        return text.lower()

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with ``.lower()`` applied via ``X.apply``."""
        return X.apply(self._to_lower)

In [8]:
import re

class RemovePunctuation(BaseEstimator, TransformerMixin):
    """Stateless transformer that deletes punctuation and underscores.

    A character is removed when it is neither a word character nor whitespace,
    or when it is an underscore (which ``\\w`` would otherwise keep).
    """

    # Compiled once and shared by all instances.
    _STRIP_RE = re.compile(r'[^\w\s]|_')

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the pattern removed from each item via ``X.apply``."""
        def strip_punctuation(text):
            return self._STRIP_RE.sub('', text)

        return X.apply(strip_punctuation)

In [33]:
class DropDataEntries(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes rows by index label.

    Parameters
    ----------
    ids : iterable
        Index labels of the rows to remove.
    """

    def __init__(self, ids):
        self.ids = ids

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new object equal to ``X`` without the rows labelled in ``self.ids``.

        Labels that are not present in ``X`` are skipped rather than raising
        ``KeyError``: an earlier pipeline step (e.g. a language filter) may
        already have removed some of them. ``X`` itself is not modified.
        """
        # One vectorised drop instead of one drop (and one new object) per label.
        return X.drop(index=list(self.ids), errors='ignore')

In [34]:
from sklearn.pipeline import Pipeline

# Index labels of the rows to discard as "nonsense summaries" (per the name;
# presumably chosen by manual inspection -- TODO confirm).
nonsense_summaries_idx = [338, 588, 834, 1574, 1772, 2410, 2485]

# Text-cleaning pipeline. No column-selection step is included (a
# FeatureSelector('combined') step is currently disabled), so the pipeline
# must be fed the text column directly.
clean_text_pipeline = Pipeline(
    steps=[
        ('detect_lang', LangDetection()),
        ('lowercase', LowercaseTransformer()),
        ('remove_punctuation', RemovePunctuation()),
        ('drop_nonsense_summaries', DropDataEntries(nonsense_summaries_idx)),
    ]
)

In [38]:
# Run the cleaning pipeline on the combined title + summary column.
# The recorded output of this cell is a KeyboardInterrupt, i.e. the run was
# interrupted before it finished.
clean_text_pipeline.transform(X.loc[:, 'combined'])

KeyboardInterrupt: 