In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
%matplotlib inline

In [17]:
# Load the cleaned sales CSV into a DataFrame.
# index_col is left at its default (None), so it is not passed explicitly.
df = pd.read_csv('cleaned_sales_data.csv')
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001"
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001"
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001"
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016"
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301"


# Adding columns for order times (month, day, hour)

In [18]:
# Parse the 'Order Date' column into pandas datetimes so that its
# month / day / hour components can be extracted in the following cells.
# errors='raise' is the pandas default, spelled out here: unparseable values fail loudly.
df['Order Date'] = pd.to_datetime(df['Order Date'], errors='raise')

In [19]:
# Calendar month (1-12) of each order, read through the Series .dt accessor
# instead of building a temporary DatetimeIndex.
# Requires 'Order Date' to already hold datetimes (converted in the cell above).
df['Order_Month'] = df['Order Date'].dt.month
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12


In [20]:
# Hour of day (0-23) of each order, read through the Series .dt accessor
# instead of building a temporary DatetimeIndex.
# Requires 'Order Date' to already hold datetimes.
df['Order_Hour'] = df['Order Date'].dt.hour
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month,Order_Hour
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,0
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,7
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,18
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,15
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,12


In [21]:
# Day of month (1-31) of each order, read through the Series .dt accessor
# instead of building a temporary DatetimeIndex.
# Requires 'Order Date' to already hold datetimes.
df['Order_Day'] = df['Order Date'].dt.day
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month,Order_Hour,Order_Day
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,0,30
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,7,29
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,18,12
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,15,22
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,12,18


# Adding a sale value column

In [22]:
# Value of each order line: unit price multiplied by the quantity ordered.
df['Sale_Value'] = df['Price Each'] * df['Quantity Ordered']
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month,Order_Hour,Order_Day,Sale_Value
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,0,30,1700.0
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,7,29,600.0
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,18,12,11.95
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,15,22,149.99
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,12,18,11.95


In [23]:
# Display the dtype of every column to confirm that 'Order Date' is now a
# datetime column and that the derived columns added above are numeric.
df.dtypes

Order ID                     int64
Product                     object
Quantity Ordered           float64
Price Each                 float64
Order Date          datetime64[ns]
Purchase Address            object
Order_Month                  int64
Order_Hour                   int64
Order_Day                    int64
Sale_Value                 float64
dtype: object

# Adding a buyer state column

In [24]:
# Take the two characters at positions -8 and -7 of each address.
# For addresses ending in "<ST> <5-digit ZIP>" (e.g. "..., NY 10001")
# those two characters are the state abbreviation.
df['Buyer_State'] = df['Purchase Address'].str[-8:-6]
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month,Order_Hour,Order_Day,Sale_Value,Buyer_State
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,0,30,1700.0,NY
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,7,29,600.0,NY
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,18,12,11.95,NY
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,15,22,149.99,CA
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,12,18,11.95,GA


# Adding a buyer city column

In [25]:
def buyer_city(col):
    """Return the city portion of a comma-separated address string.

    The address is split on commas and the second field (index 1) is
    returned with leading/trailing space characters removed. Only the
    space character is stripped; other whitespace is left untouched.

    Parameters
    ----------
    col : str
        A single address, e.g. "136 Church St, New York City, NY 10001".

    Returns
    -------
    str
        The second comma-separated field, stripped of surrounding spaces.

    Raises
    ------
    IndexError
        If the address contains no comma.
    """
    parts = col.split(",")
    city_field = parts[1]
    return city_field.strip(" ")

# Derive the city for every row by running buyer_city on each address string.
df['Buyer_City'] = df['Purchase Address'].map(buyer_city)
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month,Order_Hour,Order_Day,Sale_Value,Buyer_State,Buyer_City
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,0,30,1700.0,NY,New York City
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,7,29,600.0,NY,New York City
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,18,12,11.95,NY,New York City
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,15,22,149.99,CA,San Francisco
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,12,18,11.95,GA,Atlanta


# Creating product classification columns

In [26]:
# Count the rows per distinct product name, to see which product names
# the classification functions below have to handle.
df['Product'].value_counts()

USB-C Charging Cable          21903
Lightning Charging Cable      21658
AAA Batteries (4-pack)        20641
AA Batteries (4-pack)         20577
Wired Headphones              18882
Apple Airpods Headphones      15549
Bose SoundSport Headphones    13325
27in FHD Monitor               7507
iPhone                         6842
27in 4K Gaming Monitor         6230
34in Ultrawide Monitor         6181
Google Phone                   5525
Flatscreen TV                  4800
Macbook Pro Laptop             4724
ThinkPad Laptop                4128
20in Monitor                   4101
Vareebadd Phone                2065
LG Washing Machine              666
LG Dryer                        646
Name: Product, dtype: int64

In [27]:
# Categorizing product type
def product_simplifier(product):
    """Map a product name to a coarse product-type label.

    Rules are evaluated in a fixed order and the first match wins:

    1. 'cable', 'batteries', 'headphones', 'monitor' -- matched against the
       lower-cased name (case-insensitive).
    2. 'Phone' -- matched against the name as given (case-sensitive).
    3. 'laptop' -- matched against the lower-cased name.

    Parameters
    ----------
    product : str
        Product name.

    Returns
    -------
    str or None
        'Cable', 'Batteries', 'Headphones', 'Monitor', 'Smartphone' or
        'Computer'; None when no rule matches.
    """
    name_lower = product.lower()

    # Case-insensitive keywords that take precedence over the phone check.
    leading_rules = (
        ('cable', 'Cable'),
        ('batteries', 'Batteries'),
        ('headphones', 'Headphones'),
        ('monitor', 'Monitor'),
    )
    for keyword, label in leading_rules:
        if keyword in name_lower:
            return label

    # This check uses the original casing: only a capital-P 'Phone' counts.
    if 'Phone' in product:
        return 'Smartphone'
    if 'laptop' in name_lower:
        return 'Computer'
    return None

# Label every row with its product type; names that match no rule get None.
df['Product_Simplified'] = df['Product'].map(product_simplifier)

In [28]:
# Preview the first rows to check the new 'Product_Simplified' column.
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month,Order_Hour,Order_Day,Sale_Value,Buyer_State,Buyer_City,Product_Simplified
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,0,30,1700.0,NY,New York City,Computer
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,7,29,600.0,NY,New York City,
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,18,12,11.95,NY,New York City,Cable
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,15,22,149.99,CA,San Francisco,Monitor
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,12,18,11.95,GA,Atlanta,Cable


In [29]:
# Categorizing product brand
def product_brand_simplifier(product):
    """Map a product name to a brand label by case-insensitive keyword search.

    The keywords are tried in the order listed below against the lower-cased
    product name; the brand of the first keyword found is returned.

    Parameters
    ----------
    product : str
        Product name.

    Returns
    -------
    str or None
        'Apple', 'Bose', 'Google', 'LG', 'Thinkpad' or 'Vareebadd';
        None when no keyword occurs in the name.
    """
    name_lower = product.lower()

    # Order matters: an earlier keyword wins when several occur in one name.
    keyword_to_brand = (
        ('apple', 'Apple'),
        ('bose', 'Bose'),
        ('iphone', 'Apple'),
        ('google', 'Google'),
        ('macbook', 'Apple'),
        ('lg', 'LG'),
        ('thinkpad', 'Thinkpad'),
        ('vareebadd', 'Vareebadd'),
    )
    return next(
        (brand for keyword, brand in keyword_to_brand if keyword in name_lower),
        None,
    )

# Label every row with its brand; names that match no brand keyword get None.
df['Product_Brand_Simplified'] = df['Product'].map(product_brand_simplifier)

In [30]:
# Preview the first rows to check the new 'Product_Brand_Simplified' column.
df.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Order_Month,Order_Hour,Order_Day,Sale_Value,Buyer_State,Buyer_City,Product_Simplified,Product_Brand_Simplified
0,295665,Macbook Pro Laptop,1.0,1700.0,2019-12-30 00:01:00,"136 Church St, New York City, NY 10001",12,0,30,1700.0,NY,New York City,Computer,Apple
1,295666,LG Washing Machine,1.0,600.0,2019-12-29 07:03:00,"562 2nd St, New York City, NY 10001",12,7,29,600.0,NY,New York City,,LG
2,295667,USB-C Charging Cable,1.0,11.95,2019-12-12 18:21:00,"277 Main St, New York City, NY 10001",12,18,12,11.95,NY,New York City,Cable,
3,295668,27in FHD Monitor,1.0,149.99,2019-12-22 15:13:00,"410 6th St, San Francisco, CA 94016",12,15,22,149.99,CA,San Francisco,Monitor,
4,295669,USB-C Charging Cable,1.0,11.95,2019-12-18 12:38:00,"43 Hill St, Atlanta, GA 30301",12,12,18,11.95,GA,Atlanta,Cable,


In [32]:
# Display the dtype of every column, including the derived feature columns.
df.dtypes

Order ID                             int64
Product                             object
Quantity Ordered                   float64
Price Each                         float64
Order Date                  datetime64[ns]
Purchase Address                    object
Order_Month                          int64
Order_Hour                           int64
Order_Day                            int64
Sale_Value                         float64
Buyer_State                         object
Buyer_City                          object
Product_Simplified                  object
Product_Brand_Simplified            object
dtype: object

In [31]:
# Write the feature-augmented frame to disk. index=False keeps the row index
# out of the file. Then print a confirmation message.
df.to_csv(path_or_buf='featured_data.csv', index=False)
print('Exported to csv as "featured_data.csv"')

Exported to csv as "featured_data.csv"
