In [1]:
import pandas as pd
import numpy as np

# Dataset URL:
# https://www.kaggle.com/datasets/athu1105/book-genre-prediction

# Read the data into dataframe
df = pd.read_csv('../data/book_genre_dataset.csv')

# Create a column with the combined title and summary
df['combined'] = df['title'] + '. ' + df['summary']

In [2]:
from sklearn.base import TransformerMixin, BaseEstimator

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, variables):
        self.variables = variables
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.loc[:,self.variables]

In [3]:
import langdetect

class LangDetection(BaseEstimator, TransformerMixin):
    def __init__(self, lang='en'):
        self.lang = lang
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_ = X.copy()
        X_['lang'] = X_.apply(lambda x: langdetect.detect(x))
        X_lang_only = X[X_['lang'] == self.lang]
        return X_lang_only

In [4]:
class LowercaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_lower = X.apply(lambda x: x.lower())
        return X_lower

In [5]:
import re

class RemovePunctuation(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_no_punct = X.apply(lambda x: re.sub(r'[^\w\s]|_', '', x))
        return X_no_punct

In [6]:
class DropDataEntries(BaseEstimator, TransformerMixin):
    def __init__(self, ids):
        self.ids = ids
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_ = X.copy()
        for i in self.ids:
            X_ = X_.drop(i)
        return X_     

In [7]:
from sklearn.pipeline import Pipeline

nonsense_summaries_idx = [338, 588, 834, 1574, 1772, 2410, 2485]

clean_text_pipeline = Pipeline([
    ('get_combined_text', FeatureSelector('combined')),
    ('detect_lang', LangDetection()),
    ('lowercase',    LowercaseTransformer()),
    ('remove_punctuation', RemovePunctuation()),
    ('drop_nonsense_summaries', DropDataEntries(nonsense_summaries_idx))
])

In [8]:
clean_text_pipeline

In [9]:
X_combined_clean = clean_text_pipeline.fit_transform(df)

#'338' in X_combined_clean
df.drop([i for i in range(len(df)) if i not in X_combined_clean])
X = df[['title','summary','combined']]
y = df['genre']
X.shape, y.shape

((4657, 3), (4657,))

In [10]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

class RemoveStopwords(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords):
        self.stopwords = stopwords
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_no_stopwords = X.apply(lambda x: ' '.join([w for w in x.split() if w not in self.stopwords]))
        return X_no_stopwords

[nltk_data] Downloading package stopwords to /home/mia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from nltk.stem import WordNetLemmatizer

class Lemmatizer(BaseEstimator, TransformerMixin):
    def __init__(self, lemmatizer):
        self.lemmatizer = lemmatizer
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        lemmatize_func = lambda x: ' '.join(
            [self.lemmatizer.lemmatize(w) for w in x.split()]
        )
        X_lemmatized = X.apply(lemmatize_func)
        return X_lemmatized

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from mlxtend.preprocessing import DenseTransformer

preprocess_text_pipeline = Pipeline([
    ('remove_stopwords', RemoveStopwords(stop_words)),
    ('lemmatize', Lemmatizer(WordNetLemmatizer())),
    ('tfidf', TfidfVectorizer()),
    ('select_k_best', SelectKBest(k=5000))
])

In [13]:
from sklearn.preprocessing import StandardScaler
from mlxtend.preprocessing import DenseTransformer
from sklearn.decomposition import PCA

dim_reduction_pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('dense', DenseTransformer()),
    ('pca', PCA(n_components=5))
])

In [14]:
from sklearn.pipeline import FeatureUnion

text_pipeline = Pipeline([
    ('select_combined', FeatureSelector('combined')),
    ('preprocess', preprocess_text_pipeline),
    #('dense', DenseTransformer())
    #('dim_reduction', dim_reduction_pipeline),
])

In [15]:
def stopword_count(text):
    stopwords_in_text = [w for w in text.split() if w in stop_words]
    return len(stopwords_in_text)

In [16]:
class TitleSummaryFE(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_ = X.copy()
        feats = pd.DataFrame()
        feats['char_count'] = X_.apply(lambda x: len(x))
        feats['word_count'] = X_.apply(lambda x: len(x.split()))
        feats['avg_word_len'] = feats['char_count'] / feats['word_count']
        feats['stopword_count'] = X_.apply(stopword_count)
        return feats

In [17]:
from sklearn.preprocessing import FunctionTransformer

title_features = Pipeline([
    ('select_title', FeatureSelector('title')),
    ('title_features', TitleSummaryFE()),
])

In [18]:
summary_features = Pipeline([
    ('select_summary', FeatureSelector('summary')),
    ('summary_features', TitleSummaryFE()),
])

In [19]:
from nltk.sentiment import SentimentIntensityAnalyzer

class CombinedFE(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        feats = pd.DataFrame()
        sia = SentimentIntensityAnalyzer()
        X_ = X.copy()
        feats['unique_word_count'] = X_.apply(lambda x: len(set(x.split())))
        feats['unique_word_ratio'] = feats['unique_word_count'] / X_.apply(lambda x: len(x.split()))
        feats['sentiment_score'] = X_.apply(lambda x: sia.polarity_scores(x)['compound'])
        return feats

In [20]:
combined_features = Pipeline([
    ('select_combined', FeatureSelector('combined')),
    ('combined_features', CombinedFE()),
])

In [21]:
features = FeatureUnion([
    ('text', text_pipeline),
    ('title', title_features),
    ('summary', summary_features),
    ('combined', combined_features)
])

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

def run_experiment(X, y, pipeline, num_expts=5):
    scores = list()
    for i in range(num_expts):
        X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2)
        model = pipeline.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_true, y_pred)
        scores.append(score)
    
    print(f"Average accuracy over {num_expts} experiments: {sum(scores) / num_expts} \n")
    print("Classification report for the last experiment:\n")
    print(classification_report(y_true, y_pred))

In [32]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

def model_pipeline(model):
    pipeline = Pipeline([
        ('features', features),
        ('model', model)
    ])
    return pipeline

In [33]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

svc = LinearSVC(dual=False)
knn = KNeighborsClassifier()
dtree = DecisionTreeClassifier()

run_experiment(X, y, model_pipeline(svc), num_expts=20)
run_experiment(X, y, model_pipeline(knn), num_expts=20)
run_experiment(X, y, model_pipeline(dtree), num_expts=20)

Average accuracy over 20 experiments: 0.6769849785407726 

Classification report for the last experiment:

              precision    recall  f1-score   support

       crime       0.61      0.67      0.64        86
     fantasy       0.66      0.74      0.70       183
     history       0.64      0.65      0.65       117
      horror       0.68      0.62      0.65       111
  psychology       0.64      0.45      0.53        20
     romance       0.58      0.29      0.39        24
     science       0.64      0.74      0.69       129
      sports       0.88      0.88      0.88        16
    thriller       0.69      0.62      0.65       229
      travel       0.86      0.71      0.77        17

    accuracy                           0.66       932
   macro avg       0.69      0.64      0.65       932
weighted avg       0.66      0.66      0.66       932

Average accuracy over 20 experiments: 0.21443133047210305 

Classification report for the last experiment:

              precision   

In [41]:
pipeline

In [279]:
class Apply(BaseEstimator, TransformerMixin):
    def __init__(self, fn):
        self.fn = fn
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_transformed = X.apply(self.fn)
        return X_transformed