# Data Exploration and Visualization 

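In this notebook, I train a Decision Tree classifier on bag-of-words features extracted from book descriptions to predict each book's categories, with the goal of surfacing underlying patterns in our book data that our stakeholders can use to inform their business decisions.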


By: Prince Okpoziakpo

In [10]:
import pandas as pd
import numpy as np

In [11]:
# Read the book data into the 'books_df' dataframe. The raw CSV header
# contains padded labels (e.g. '  num_pages'), so strip the surrounding
# whitespace from every column name to make the columns addressable by
# their plain names.
books_df = (
    pd.read_csv('../data/kaggle_dataset_of_books.csv')
    .rename(columns=str.strip)
)
books_df.shape

(11127, 12)

In [12]:
# list the column labels of the loaded dataframe
books_df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

# Data Cleaning

## Removing null values

In [13]:
# Keep only the books whose 'description' field is populated.
# NOTE(review): the column listing printed for the loaded CSV contains no
# 'description' column, so this lookup raises AttributeError -- confirm which
# dataset this notebook is meant to load.
books_df = books_df[books_df.description.notna()]
books_df.shape

AttributeError: 'DataFrame' object has no attribute 'description'

In [None]:
# Keep only the books whose 'main_categories' field is populated.
# NOTE(review): the column listing printed for the loaded CSV contains no
# 'main_categories' column -- confirm which dataset this notebook is meant
# to load.
books_df = books_df[books_df.main_categories.notna()]
books_df.shape

In [None]:
# Count how often each distinct 'categories' value occurs, then show the
# values that occur exactly once.
# NOTE(review): the other cells in this notebook read 'main_categories';
# confirm that 'categories' is the intended column here.
category_counts = books_df.categories.value_counts()
category_counts.loc[category_counts.eq(1)]

# Feature Extraction

## Vectorize each document from the corpus

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

### Fitting the vectorizer and transforming the corpus

In [None]:
# Instantiate the CountVectorizer; `stop_words="english"` excludes English
# stop words from the vocabulary.
vectorizer = CountVectorizer(stop_words="english")

# Extract the 'description' column as an array of raw documents.
descriptions = books_df.description.to_numpy()

# Learn the vocabulary and build the sparse document-term matrix.
bag_of_words = vectorizer.fit_transform(descriptions)

print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"Sample features: {vectorizer.get_feature_names_out()[1000:1005]}\n")

# Preview only the first few rows: calling `.toarray()` on the whole matrix
# would allocate a dense (documents x vocabulary) array just to print it.
print(bag_of_words[:5].toarray(), '\n')

print(bag_of_words.shape)

# Target Extraction

## Exploring the categories of each book

In [None]:
# Determine the number of unique categories that exist in the dataset.
# Each 'main_categories' entry is cleaned of brackets and quotes and then
# split on commas (presumably it is a stringified list such as
# "['A', 'B']" -- TODO confirm against the data).
all_categories = []
for categories_list_string in books_df.main_categories:
    # Skip anything that is not a string (e.g. a missing value).
    if isinstance(categories_list_string, str):
        cleaned = (
            categories_list_string
            .replace('[', '')
            .replace(']', '')
            .replace("'", '')
            .replace('"', '')
        )
        # Strip every name so that ' History' and 'History' are counted as
        # the same category, and drop the empty token produced by "[]".
        all_categories += [
            name.strip() for name in cleaned.split(',') if name.strip()
        ]

print("Number of unique categories: ", len(set(all_categories)))

## Fit and transform the targets into a label indicator matrix

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Convert each 'main_categories' string into a list of category names.
# Brackets and quotes are removed before splitting on commas; every name is
# then stripped so that ' History' and 'History' become the same label, and
# empty tokens (e.g. from "[]") are dropped.
all_categories = books_df.main_categories.apply(
    lambda s: [
        name.strip()
        for name in (
            s.replace('[', '')
            .replace(']', '')
            .replace("'", '')
            .replace('"', '')
            .split(',')
        )
        if name.strip()
    ]
).to_numpy()

In [None]:
# instantiate the binarizer object
mlb = MultiLabelBinarizer()

# fit the binarizer on the per-book category lists and transform them into a
# label indicator matrix
targets = mlb.fit_transform(all_categories)

# verify that the number of rows matches the number of rows in `books_df`
# verify that the number of columns matches the number of unique categories
targets.shape

# Model Training

## Training the model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split as tts

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = tts(
    bag_of_words,
    targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an entropy-criterion decision tree on the training split. A fixed
# `random_state` (same seed as the train/test split) makes the fitted tree
# reproducible across runs.
clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
clf.fit(X_train, y_train)

## Evaluating model performance

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# `cross_val_score` fits fresh clones of `clf` on each fold, so running it on
# the test split would neither evaluate the tree fitted above nor keep the
# test data unseen. Cross-validate on the training split instead, and use the
# held-out test split once to score the already-fitted model.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(f"Held-out test accuracy: {clf.score(X_test, y_test):.3f}")
cv_scores

## Visualization

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw only the top levels of the fitted tree: the unrestricted tree is too
# large to read as a single figure. Nodes are labelled with the vocabulary
# terms learned by `vectorizer`, and the trailing `;` suppresses the echoed
# list of annotation objects.
plot_tree(
    clf,
    max_depth=3,
    feature_names=list(vectorizer.get_feature_names_out()),
);

In [None]:
# report the depth of the fitted decision tree
clf.get_depth()