# Data Exploration and Visualization 

In this notebook I use Decision Trees to identify underlying patterns in our book data that our stakeholders can use to impact their businesses.


By: Prince Okpoziakpo

In [44]:
import pandas as pd
import numpy as np

In [45]:
# read the book data into the 'books_df' dataframe
books_df = pd.read_csv('../data/final_books.csv')
books_df.shape

(3287, 21)

In [46]:
books_df.columns

Index(['id', 'title', 'isbn', 'page_count', 'publishing_date', 'form',
       'publisher', 'language', 'author', 'illustrator',
       'originally_published', 'genres', 'subject', 'awards', 'nominations',
       'characters', 'description', 'sub_title', 'book_id', 'average_rating',
       'ratings_count'],
      dtype='object')

# Data Cleaning

## Removing null values

In [47]:
# get all the books that have a description
books_df = books_df.loc[~books_df.description.isna()] 
books_df.shape

(3144, 21)

In [48]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3144 entries, 0 to 3286
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3144 non-null   object 
 1   title                 3144 non-null   object 
 2   isbn                  2883 non-null   object 
 3   page_count            3107 non-null   object 
 4   publishing_date       3143 non-null   object 
 5   form                  3019 non-null   object 
 6   publisher             3144 non-null   object 
 7   language              3143 non-null   object 
 8   author                3043 non-null   object 
 9   illustrator           315 non-null    object 
 10  originally_published  2862 non-null   object 
 11  genres                1773 non-null   object 
 12  subject               2998 non-null   object 
 13  awards                278 non-null    object 
 14  nominations           379 non-null    object 
 15  characters           

In [49]:
# get all the books that have categories
books_df = books_df.loc[~books_df.genres.isna()]
books_df.shape

(1773, 21)

In [50]:
category_counts = books_df.genres.value_counts()
category_counts[category_counts == 1]

Graphic novel, Comics, Fiction                                1
Thriller, Spy fiction, Suspense, Adventure fiction            1
Young adult fiction, Adventure fiction                        1
Novel, Science fiction, Fantasy Fiction, Dystopian Fiction    1
Fiction, Children's literature, Christmas Story               1
                                                             ..
Novel, Fiction, Chick lit, Humor, Roman à clef                1
Novel, Humor, Mystery, Crime fiction                          1
Fairy tale, Fiction                                           1
Novel, Creative nonfiction, Road Fiction                      1
Novel, Adventure fiction, Nautical fiction                    1
Name: genres, Length: 783, dtype: int64

# Feature Extraction

## Vectorize each document from the corpus

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

### Fitting the vectorizer and transforming the corpus

In [52]:
# instantiate the CountVectorizer object; `stop_words` parameter makes sure 
# we exclude English stop words
vectorizer = CountVectorizer(stop_words="english")

# extract the 'description' column and convert it into an array 
descriptions = books_df.description.to_numpy()

# fit the vectorizer and transform the corpus
bag_of_words = vectorizer.fit_transform(descriptions)

print(f"Vocabulary size: {vectorizer.vocabulary_.__len__()}")
print(f"Sample features: {vectorizer.get_feature_names_out()[1000:1005]}\n")
print(bag_of_words.toarray(), '\n')

print(bag_of_words.shape)

Vocabulary size: 21927
Sample features: ['amüsiert' 'ana' 'anagrammatic' 'anahuac' 'anais']

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 

(1773, 21927)


# Target Extraction

## Exploring the categories of each book

In [53]:
# determine the number of unique categories that exist in the dataset
all_categories = [] 
for categories_list_string in books_df.genres: 
    if type(categories_list_string) == str:
        c = categories_list_string.\
        rstrip().\
        lstrip().\
        replace('[', '').\
        replace(']', '').\
        replace("\'", '').\
        replace("\"", '').\
        split(',')
        all_categories += c

print("Number of unique categories: ", len(set(all_categories)))

Number of unique categories:  312


## Fit and transform the targets into a label indicator matrix

In [54]:
from sklearn.preprocessing import MultiLabelBinarizer

In [55]:
# convert the type of the categories from string into an array of categories
all_categories = books_df.main_categories.apply(
    lambda s: s.\
        rstrip().\
        lstrip().\
        replace('[', '').\
        replace(']', '').\
        replace("\'", '').\
        replace("\"", '').\
        split(',')
).to_numpy()

AttributeError: 'DataFrame' object has no attribute 'main_categories'

In [None]:
# instantiate the binarizer object
mlb = MultiLabelBinarizer()

# fit and transform the labels of the targets into a label indicator matrix
targets = mlb.fit_transform(all_categories)

# verfiy that the number of rows matches the number of rows in `books_df`
# verify that the number of columns matches the number of unique categories
targets.shape

# Model Training

## Training the model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split as tts

In [None]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = tts(bag_of_words, targets, random_state=42, test_size=.2)

In [None]:
clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(X_train, y_train)

## Evaluating model performance

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(clf, X_test, y_test, cv=5)

## Visualization

In [None]:
from sklearn.tree import plot_tree

In [None]:
plot_tree(clf)

In [None]:
clf.get_depth()