In [None]:
From: Brooke Reeder (Owner, Maven Books)
Subject: Please help make data numeric

Hi again,

Thanks for your help earlier this week!

We just learned that we also have to make all the data numeric before inputting it into a predictive model.

Can you turn the “Audience” text field into a numeric field?

Thanks!
Brooke

Key Objectives

1. Create dummy variables from the “Audience” field
2. Using the “Audience” dummy variables, create three new columns that contain the number of Adult, Children, and Teen books purchased by each customer
3. Combine the three new columns back with the customer-level data

In [13]:
# Read all four files into a Jupyter Notebook

import pandas as pd

april_sales = pd.read_excel('Book_Sales_April.xlsx')
may_sales = pd.read_excel('Book_Sales_May.xlsx')
june_sales = pd.read_excel('Book_Sales_June.xlsx')
customers = pd.read_csv('Book_Customers.csv')
customers.head()

Unnamed: 0,Customer ID,Customer Name,Age,Has School Aged Children,Has_Pets
0,101,Alexander,36,Yes,No
1,102,Mason,16,No,No
2,103,Ethan,55,No,No
3,104,Daniel,70,No,No
4,105,Michael,18,No,Yes


In [7]:
# Append the May and June book sales to the April DataFrame

sales = pd.concat([april_sales, may_sales, june_sales]).reset_index(drop=True)
sales.head()

Unnamed: 0,Purchase ID,Customer ID,Book,Audience,Genre,Price,Book Rating,Purchase Location,Purchase Date
0,401,101,Unicorns Are Cool,Children,Fiction,4.99,3.6,Online,2023-04-01
1,402,101,I Love Unicorns,Children,Fiction,5.99,4.1,Online,2023-04-01
2,403,102,All About Turtles,Teens,Non-Fiction,19.99,3.3,In Person,2023-04-02
3,404,102,All About Whales,Teens,Non-Fiction,19.99,2.5,In Person,2023-04-02
4,405,102,All About Dolphins,Teens,Non-Fiction,19.99,4.2,In Person,2023-04-02


In [8]:
# specify a column to get dummy variables
audience_dummies = pd.get_dummies(sales.Audience)
audience_dummies

Unnamed: 0,Adults,Children,Teens
0,False,True,False
1,False,True,False
2,False,False,True
3,False,False,True
4,False,False,True
5,True,False,False
6,True,False,False
7,True,False,False
8,False,True,False
9,False,False,True


In [10]:
#combine with customer ID
pd.concat([sales['Customer ID'], audience_dummies], axis = 1)

Unnamed: 0,Customer ID,Adults,Children,Teens
0,101,False,True,False
1,101,False,True,False
2,102,False,False,True
3,102,False,False,True
4,102,False,False,True
5,103,True,False,False
6,104,True,False,False
7,105,True,False,False
8,101,False,True,False
9,102,False,False,True


In [12]:
# groupby by customer ID so each row is a single customer
# Using the “Audience” dummy variables, create three new columns that contain the number of Adult, Children, and Teen books purchased by each customer

categories = pd.concat([sales['Customer ID'], audience_dummies], axis = 1).groupby('Customer ID').sum().reset_index()
categories.head()

Unnamed: 0,Customer ID,Adults,Children,Teens
0,101,0,7,0
1,102,0,0,4
2,103,3,0,0
3,104,2,0,0
4,105,1,0,0


In [15]:
# 3. Combine the three new columns back with the customer-level data
customers_categories = customers.merge(categories, how='left', on ='Customer ID')
customers_categories.head()

Unnamed: 0,Customer ID,Customer Name,Age,Has School Aged Children,Has_Pets,Adults,Children,Teens
0,101,Alexander,36,Yes,No,0.0,7.0,0.0
1,102,Mason,16,No,No,0.0,0.0,4.0
2,103,Ethan,55,No,No,3.0,0.0,0.0
3,104,Daniel,70,No,No,2.0,0.0,0.0
4,105,Michael,18,No,Yes,1.0,0.0,0.0
