In [1]:
# library imports
import importlib
import pandas as pd
import numpy as np
import os
import json
import random
import sklearn
importlib.reload(sklearn)
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import hamming_loss, label_ranking_loss

import seaborn as sns
sns.set_style('darkgrid')


print(sklearn.__version__)

1.0.1


In [2]:
# load the dataset
df = pd.read_csv('booksummaries.txt', sep='\t',names=['id1', 'id2', 'title', 'author', 'year', 'genres', 'summary'])

# extracting the required columns 
df = df.drop(['id1','id2', 'year', 'title', 'author'], axis = 1).dropna().reset_index()

# print the dataframe info
print(df.info())

# preview the dataframe 
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12841 entries, 0 to 12840
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   index    12841 non-null  int64 
 1   genres   12841 non-null  object
 2   summary  12841 non-null  object
dtypes: int64(1), object(2)
memory usage: 301.1+ KB
None


Unnamed: 0,index,genres,summary
0,0,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,1,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,2,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,4,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...
4,5,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge..."


In [3]:
# cleaning up the genres feature
def get_genre(s):
    output = []
    try: 
        output  = tuple(json.loads(s).values())
    except json.decoder.JSONDecodeError as e: 
        output  = pd.NA
    except TypeError as e:
        output = pd.NA
    return output    

df['genres_cleaned'] = df.apply(lambda x : get_genre(x['genres']), axis = 1)

df.head()

Unnamed: 0,index,genres,summary,genres_cleaned
0,0,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","(Roman à clef, Satire, Children's literature, ..."
1,1,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","(Science Fiction, Novella, Speculative fiction..."
2,2,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"(Existentialism, Fiction, Absurdist fiction, N..."
3,4,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"(Hard science fiction, Science Fiction, Specul..."
4,5,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...","(War novel, Roman à clef)"


In [4]:
# Cleaning the Summary feature 
df['summary_cleaned'] = df['summary'].replace(r'[^A-Za-z .,-]+', '', regex=True)

df.head()

Unnamed: 0,index,genres,summary,genres_cleaned,summary_cleaned
0,0,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...","(Roman à clef, Satire, Children's literature, ...","Old Major, the old boar on the Manor Farm, ca..."
1,1,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...","(Science Fiction, Novella, Speculative fiction...","Alex, a teenager living in near-future Englan..."
2,2,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,"(Existentialism, Fiction, Absurdist fiction, N...",The text of The Plague is divided into five p...
3,4,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,"(Hard science fiction, Science Fiction, Specul...",The novel posits that space around the Milky ...
4,5,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul Bäumer, a Ge...","(War novel, Roman à clef)","The book tells the story of Paul Bumer, a Ger..."


In [5]:
# getting info about the DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12841 entries, 0 to 12840
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            12841 non-null  int64 
 1   genres           12841 non-null  object
 2   summary          12841 non-null  object
 3   genres_cleaned   12841 non-null  object
 4   summary_cleaned  12841 non-null  object
dtypes: int64(1), object(4)
memory usage: 501.7+ KB
None


In [6]:
# vectorize and create the input array
tf = sklearn.feature_extraction.text.TfidfVectorizer(input = 'content', min_df = 0.01, max_df = 0.99)
tf.fit(df['summary_cleaned'])
df_summary_features = pd.DataFrame(tf.transform(df['summary_cleaned']).toarray(), columns = tf.get_feature_names_out())

df_summary_features.head()

Unnamed: 0,abandon,abandoned,abilities,ability,able,aboard,about,above,absence,abuse,...,year,years,yet,york,you,young,younger,youngest,your,youth
0,0.0,0.0,0.0,0.0,0.0,0.0,0.010653,0.0,0.0,0.028472,...,0.0,0.012591,0.0,0.0,0.0,0.025999,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.015517,0.0,0.0,0.0,...,0.021053,0.018339,0.0,0.0,0.0,0.037868,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.022206,0.0,0.0,0.0,...,0.0,0.0,0.019829,0.0,0.0,0.013548,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.038404,0.0,0.0,0.018449,0.0,0.0,0.0,...,0.025033,0.043612,0.0,0.0,0.0,0.022513,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.063498,0.0,0.0,0.0,...,0.0,0.0,0.028351,0.0,0.063696,0.019371,0.0,0.0,0.0,0.03834


In [7]:
# perform truncated SVD for dimensionality reduction
svd = TruncatedSVD(n_components=1000)
svd.fit(df_summary_features)
df_summary_features_reduced = pd.DataFrame(svd.transform(df_summary_features))

df_summary_features_reduced.head()  

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.381575,-0.084696,-0.037838,0.082393,0.045518,-0.019985,0.037461,-0.024951,0.046183,-0.024943,...,0.000451,0.001967,0.028718,-0.019464,-0.001609,-0.00086,-0.029424,0.000506,-0.019221,-0.013303
1,0.622475,-0.117993,0.062738,-0.092988,0.038168,0.004558,-0.026012,0.030303,-0.000732,-0.006283,...,0.004449,-0.010749,-0.0093,0.006969,-0.007605,0.007688,-0.00349,-0.003368,-0.007886,-0.006562
2,0.549348,-0.126407,0.017222,0.015966,0.000682,0.000706,-0.00743,-0.031451,0.000125,-0.040633,...,0.011943,-0.017734,0.0143,-0.012888,0.003032,-0.013693,0.001744,0.00368,-0.002169,0.005319
3,0.509968,-0.143091,-0.200928,0.101256,-0.072166,-0.009712,-0.033457,-0.080392,0.004586,0.104181,...,0.01882,0.01007,-0.004545,0.012081,-0.029315,-0.018684,-0.00933,-0.016391,-0.012828,0.012093
4,0.621812,-0.131444,0.045212,-0.026447,0.015532,-0.076272,0.059844,-0.069324,0.043078,0.042251,...,0.010124,0.00938,0.007587,-0.005931,0.00596,0.001808,-0.001698,0.000106,0.007167,-0.011451


In [8]:
# create labels
genre_classes = df['genres_cleaned'].explode().unique()
print(f"number of genres: {genre_classes.shape}")

mlb = MultiLabelBinarizer(classes = genre_classes)

df_labels = pd.DataFrame(mlb.fit_transform(df['genres_cleaned']), columns = genre_classes)

df_labels.head()

number of genres: (227,)


Unnamed: 0,Roman à clef,Satire,Children's literature,Speculative fiction,Fiction,Science Fiction,Novella,Utopian and dystopian fiction,Existentialism,Absurdist fiction,...,Encyclopedia,Mashup,Biopunk,Popular culture,Neuroscience,New York Times Best Seller list,Epic Science Fiction and Fantasy,Alien invasion,Prose,Pastiche
0,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# remove labels with less than 5 classes
counts = df_labels.sum(axis = 0)
df_labels_reduced = df_labels.iloc[:,(counts > 5).values]
reduced_columns = df_labels_reduced.columns

In [10]:
# calibrated classifier cv
X_train, X_test, y_train, y_test = train_test_split(df_summary_features_reduced, df_labels_reduced)
clf = MultiOutputClassifier(SVC(kernel='linear'))
clf.fit(X_train, y_train)

MultiOutputClassifier(estimator=SVC(kernel='linear'))

In [None]:
# checking the hamming loss
y_pred = clf.predict(X_test)
print(f"hamming loss: {hamming_loss(y_test, y_pred)}")
print(f"label ranking loss: {label_ranking_loss(y_test, y_pred)}")

In [None]:
hamming_loss(y_test, np.zeros(shape=(3211, 122)))