# Data Exploration and Visualization 

In this notebook I use Neural Networks to identify underlying patterns in our book data that our stakeholders can use to impact their businesses.


By: Prince Okpoziakpo

In [5]:
import pandas as pd
import numpy as np



In [33]:
# read the book data into the 'books_df' dataframe
books_df = pd.read_csv('../data/isbn13_results.csv').drop(columns=['Unnamed: 0'])
books_df.shape

(2761, 13)

# Cleaning the data 

## Removing null values

In [34]:
# get all the books that have a description
books_df = books_df.loc[~books_df.description.isna()] 
books_df.shape

(2620, 13)

In [110]:
# get all the books that have categories
books_df = books_df.loc[~books_df.main_categories.isna()]
books_df.shape

(2581, 13)

## Getting  

# Feature Extraction

## Vectorize each document from the corpus

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

### Fitting the transformer to the corpus

In [42]:
# instantiate the CountVectorizer object 
vectorizer = CountVectorizer()

# extract the 'description' column and convert it into an array 
descriptions = books_df.description.to_numpy()

# fit the vectorizer on the corpus
vectorizer.fit(descriptions)

print(f"Vocabulary size: {vectorizer.vocabulary_.__len__()}")

Vocabulary size: 23416


### Transforming each document into a vector

In [45]:
bag_of_words = vectorizer.transform(descriptions)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [47]:
vectorizer.get_feature_names()[1000:1005]

['alpha', 'alphabet', 'alphabetarians', 'alphabetical', 'alphabetized']

# Target Extraction

## Exploring the categories of each book.

In [88]:
# determine the number of unique categories that exist in the dataset
all_categories = [] 
for categories_list in books_df.main_categories: 
    if type(categories_list) == str:
        c = categories_list.strip("\'][").split(',')
        all_categories += c

print("Number of unique categories: ", len(set(all_categories)))

Number of unique categories:  561


In [80]:
from sklearn.preprocessing import MultiLabelBinarizer

In [107]:
# convert the type of the categories from string into an array of categories
all_categories = books_df.main_categories.apply(
    lambda s: s.\
        rstrip().\
        lstrip().\
        replace('[', '').\
        replace(']', '').\
        replace("\'", '').\
        replace("\"", '').\
        split(',')
)
all_categories.to_numpy() 

array([list(['Juvenile Fiction']), list(['Juvenile Fiction']),
       list(['Fiction']), ..., list(['Business & Economics']),
       list(['Drama']), list(['Fiction'])], dtype=object)

## Convert each the 'categories' column into a label indicator matrix

In [108]:
mlb = MultiLabelBinarizer()
mlb.fit(all_categories)
mlb.classes_

array([' 1800-1815', ' 1853-1856', ' 1870-1871',
       ' 1883-1924--TRANSLATIONS INTO ENGLISH.', ' 1961-1975', ' 1973',
       ' 1991', ' 2003-.', ' American', ' Anita (Fictitious character)',
       ' Arab', ' Arthur (Fictitious character)', ' Canadian.',
       ' Captain (Fictitious character)', ' Chilean',
       ' Count (Fictitious character)', ' Cuban',
       ' Elvis (Fictitious character)',
       ' Encyclopedia (Fictitious character)', ' English',
       ' English (Old)', ' FRANZ', ' French',
       ' Frodo (Fictitious character)', ' Gaia (Fictitious character)',
       ' German', ' Honor (Fictitious character)', ' Islamic', ' Italian',
       ' Kahlan (Fictitious character)', ' Latin',
       ' Max (Fictitious character)', ' Medieval', ' Mexican',
       ' Military', ' Mind & Spirit', ' Mount (China and Nepal)',
       ' Myron (Fictitious character)', ' N.Y.)', ' Norse',
       ' Paul (Fictitious character)', ' Pictorial',
       ' Richard (Fictitious character)', ' The', ' a

In [109]:
mlb = MultiLabelBinarizer()
mlb.fit(np.random.randint(1, 10, (3, 5)))
print(mlb.classes_)
mlb.transform([
    [1, 2], 
    [3, ]
])

[1 2 3 5 7 8 9]


array([[1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0]])