In [1]:
# Import Libraries and Dependencies
import pandas as pd
import matplotlib.pyplot as plt


### 1. Combine and Clean the Data
#### Import CSVs

In [2]:
# Read the CSV files into DataFrame
df_2020 = pd.read_csv('Resources/athletic_sales_2020.csv')
df_2021 = pd.read_csv('Resources/athletic_sales_2021.csv')

In [3]:
# Display the 2020 sales DataFrame
df_2020.head()

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet
3,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,34,384,13056,6789.12,Outlet
4,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,53,83,4399,1407.68,Outlet


In [4]:
# Display the 2021 sales DataFrame
df_2021.head()

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,65,750,487500,121875.0,Outlet
1,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,51,233,11883,3208.41,Outlet
2,Kohl's,1189833,1/1/21,Midwest,Montana,Billings,Men's Apparel,50,275,137500,82500.0,Outlet
3,Kohl's,1189833,1/1/21,Midwest,Montana,Billings,Men's Apparel,47,77,3619,2714.25,Online
4,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,64,225,14400,5184.0,Online


#### Check the data types of each DataFrame

In [5]:
# Check the 2020 sales data types.
df_2020.dtypes

retailer             object
retailer_id           int64
invoice_date         object
region               object
state                object
city                 object
product              object
price_per_unit        int64
units_sold            int64
total_sales           int64
operating_profit    float64
sales_method         object
dtype: object

In [6]:
# Check the 2021 sales data types.
df_2021.dtypes

retailer             object
retailer_id           int64
invoice_date         object
region               object
state                object
city                 object
product              object
price_per_unit        int64
units_sold            int64
total_sales           int64
operating_profit    float64
sales_method         object
dtype: object

#### Combine the sales data by rows.

In [7]:
# Combine the 2020 and 2021 sales DataFrames on the rows and reset the index.
sales_2021 = pd.read_csv('Resources/athletic_sales_2021.csv')
sales_2020 = pd.read_csv('Resources/athletic_sales_2020.csv')

# Add a 'Year' column to each DataFrame
df_2020['invoice_date'] = 2020
df_2021['invoice_date'] = 2021

# Concatenate the DataFrames
combined_df = pd.concat([df_2020, df_2021], axis = "rows", join = "inner") 

 # Reset the index of the combined DataFrame
combined_df.reset_index(drop=True)

# Print the combined DataFrame
combined_df.tail(20)


Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
8326,Foot Locker,1185732,2021,Northeast,Pennsylvania,Philadelphia,Men's Athletic Footwear,60,275,165000,57750.0,Outlet
8327,Foot Locker,1185732,2021,Northeast,Pennsylvania,Philadelphia,Men's Athletic Footwear,55,69,3795,1707.75,Online
8328,Foot Locker,1185732,2021,Northeast,Pennsylvania,Philadelphia,Men's Athletic Footwear,47,85,3995,1637.95,Outlet
8329,Amazon,1185732,2021,Northeast,Maine,Portland,Men's Athletic Footwear,50,250,125000,43750.0,Outlet
8330,Amazon,1185732,2021,Northeast,Maine,Portland,Men's Athletic Footwear,46,70,3220,1513.4,Online
8331,Foot Locker,1185732,2021,Northeast,Pennsylvania,Philadelphia,Women's Street Footwear,55,65,3575,1894.75,Online
8332,Amazon,1185732,2021,Northeast,Maine,Portland,Women's Street Footwear,45,150,67500,27000.0,Outlet
8333,Foot Locker,1185732,2021,Northeast,Pennsylvania,Philadelphia,Women's Street Footwear,60,225,135000,54000.0,Outlet
8334,Amazon,1185732,2021,Northeast,Maine,Portland,Women's Street Footwear,41,38,1558,856.9,Online
8335,Foot Locker,1185732,2021,Northeast,Pennsylvania,Philadelphia,Women's Street Footwear,44,79,3476,1529.44,Outlet


In [8]:
# Check for null values
null_values = combined_df.isnull().sum()

# Print the null values
print(null_values)

retailer            0
retailer_id         0
invoice_date        0
region              0
state               0
city                0
product             0
price_per_unit      0
units_sold          0
total_sales         0
operating_profit    0
sales_method        0
dtype: int64


In [9]:
# Check the data type of each column
column_data_types = combined_df.dtypes

# Print the data types
print(column_data_types)


retailer             object
retailer_id           int64
invoice_date          int64
region               object
state                object
city                 object
product              object
price_per_unit        int64
units_sold            int64
total_sales           int64
operating_profit    float64
sales_method         object
dtype: object


In [10]:
combined_df['invoice_date'] = combined_df['invoice_date'].astype(object) 
print(combined_df['invoice_date'].dtype)

object


In [11]:
# Convert "invoice_date" to datetime data type

#combined_df['invoice_date'] = combined_df['invoice_date'].astype(object)  


combined_df['invoice_date'] = pd.to_datetime(combined_df['invoice_date'], errors='coerce')

# Print the updated DataFrame
combined_df.tail()


Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
8341,Foot Locker,1185732,1970-01-01 00:00:00.000002021,Northeast,Pennsylvania,Philadelphia,Men's Apparel,63,47,2961,1362.06,Online
8342,Foot Locker,1185732,1970-01-01 00:00:00.000002021,Northeast,Pennsylvania,Philadelphia,Men's Apparel,46,56,2576,1004.64,Outlet
8343,Amazon,1185732,1970-01-01 00:00:00.000002021,Northeast,Maine,Portland,Men's Apparel,52,36,1872,692.64,Online
8344,Amazon,1185732,1970-01-01 00:00:00.000002021,Northeast,Maine,Portland,Men's Apparel,55,125,68750,17187.5,Outlet
8345,Foot Locker,1185732,1970-01-01 00:00:00.000002021,Northeast,Pennsylvania,Philadelphia,Men's Apparel,70,175,122500,42875.0,Outlet


In [12]:
# Confirm that the "invoice_date" data type has been changed.
combined_df['invoice_date'].dtype
combined_df['invoice_date'].head()



0   1970-01-01 00:00:00.000002020
1   1970-01-01 00:00:00.000002020
2   1970-01-01 00:00:00.000002020
3   1970-01-01 00:00:00.000002020
4   1970-01-01 00:00:00.000002020
Name: invoice_date, dtype: datetime64[ns]

### 2. Determine which Region Sold the Most Products

#### Using `groupby`

In [13]:
# Show the number products sold for region, state, and city.

# Group the data by region, state, and city, and calculate the count of products sold

product_counts = combined_df.groupby(['region', 'state', 'city'])['units_sold'].sum()

# Print the result
print(product_counts)


# Show the top 5 results.
# Sort the DataFrame by units_sold in descending order
sorted_df = combined_df.sort_values(by='units_sold', ascending=False)

# Select the top 5 rows
top_5_results = sorted_df.head(5)

# Print the results

                            

region     state           city          
Midwest    Illinois        Chicago            25407
           Indiana         Indianapolis       26332
           Iowa            Des Moines         23446
           Kansas          Wichita            29463
           Michigan        Detroit            50095
           Minnesota       Minneapolis        20415
           Missouri        St. Louis          36404
           Montana         Billings           42713
           Nebraska        Omaha              19154
           North Dakota    Fargo              22781
           Ohio            Columbus           47781
           South Dakota    Sioux Falls        22973
           Wisconsin       Milwaukee          23950
Northeast  Connecticut     Hartford           34696
           Delaware        Wilmington         30275
           Maine           Portland           22410
           Maryland        Baltimore          20818
           Massachusetts   Boston             32895
           New Hampshi

#### Using `pivot_table`

In [14]:
# Show the number products sold for region, state, and city.
pivot_table = pd.pivot_table(combined_df, index=['region', 'state', 'city'], values='units_sold', aggfunc='sum')
print(pivot_table)


# Show the top 5 results.
combined_df.groupby(['region', 'state', 'city'])['units_sold'].sum().sort_values(ascending=False).head(5)
print(combined_df.groupby(['region', 'state', 'city'])['units_sold'].sum().sort_values(ascending=False).head(5))



                                         units_sold
region    state          city                      
Midwest   Illinois       Chicago              25407
          Indiana        Indianapolis         26332
          Iowa           Des Moines           23446
          Kansas         Wichita              29463
          Michigan       Detroit              50095
          Minnesota      Minneapolis          20415
          Missouri       St. Louis            36404
          Montana        Billings             42713
          Nebraska       Omaha                19154
          North Dakota   Fargo                22781
          Ohio           Columbus             47781
          South Dakota   Sioux Falls          22973
          Wisconsin      Milwaukee            23950
Northeast Connecticut    Hartford             34696
          Delaware       Wilmington           30275
          Maine          Portland             22410
          Maryland       Baltimore            20818
          Ma

### 3. Determine which Region had the Most Sales

#### Using `groupby`

In [15]:
# Show the total sales for the products sold for each region, state, and city.
# Group the data by region, state, and city, and calculate the sum of the sales
total_sales = combined_df.groupby(['region', 'state', 'city'])['units_sold'].sum()


# Show the top 5 results.

combined_df.groupby(['region', 'state', 'city'])['units_sold'].sum().sort_values(ascending=False).head(5)



region     state       city         
Northeast  New York    New York         111954
South      Texas       Houston           90322
West       California  San Francisco     85478
                       Los Angeles       76384
Southeast  Florida     Miami             73135
Name: units_sold, dtype: int64

#### Using `pivot_table`

In [16]:
# Show the total sales for the products sold for each region, state, and city.

combined_df.pivot_table(index=['region', 'state', 'city'], values='units_sold', aggfunc='sum').sort_values(by='units_sold', ascending=False).head(5)



# Optional: Rename the "total_sales" column to "Total Sales"

combined_df.rename(columns={'Total Sales':'units_sold'}, inplace=True)


# Show the top 5 results.
combined_df


Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,1970-01-01 00:00:00.000002020,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.00,In-store
1,Foot Locker,1185732,1970-01-01 00:00:00.000002020,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,1970-01-01 00:00:00.000002020,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.00,Outlet
3,Foot Locker,1185732,1970-01-01 00:00:00.000002020,Northeast,New York,New York,Men's Street Footwear,34,384,13056,6789.12,Outlet
4,Foot Locker,1185732,1970-01-01 00:00:00.000002020,Northeast,Pennsylvania,Philadelphia,Women's Apparel,53,83,4399,1407.68,Outlet
...,...,...,...,...,...,...,...,...,...,...,...,...
8341,Foot Locker,1185732,1970-01-01 00:00:00.000002021,Northeast,Pennsylvania,Philadelphia,Men's Apparel,63,47,2961,1362.06,Online
8342,Foot Locker,1185732,1970-01-01 00:00:00.000002021,Northeast,Pennsylvania,Philadelphia,Men's Apparel,46,56,2576,1004.64,Outlet
8343,Amazon,1185732,1970-01-01 00:00:00.000002021,Northeast,Maine,Portland,Men's Apparel,52,36,1872,692.64,Online
8344,Amazon,1185732,1970-01-01 00:00:00.000002021,Northeast,Maine,Portland,Men's Apparel,55,125,68750,17187.50,Outlet


### 4. Determine which Retailer had the Most Sales

#### Using `groupby`

In [26]:
# Show the total sales for the products sold for each retailer, region, state, and city.
total_product_sales = combined_df.groupby(['retailer', 'region', 'state', 'city'])['units_sold'].sum()

# Rename the "total_sales" column to "Total Sales"

total_product_sales = combined_df.rename(columns={'Total Sales':'units_sold'}, inplace=True)    



# Show the top 5 results.
combined_df.groupby(['retailer', 'region', 'state', 'city'])['units_sold'].sum().sort_values(ascending=False).head(5)



KeyError: 'Column not found: units_sold'

#### Using `pivot_table`

In [None]:
# Show the total sales for the products sold for each retailer, region, state, and city.

combined_df.pivot_table(index=['retailer', 'region', 'state', 'city'], values='Total Sales', aggfunc='sum').sort_values(by='Total Sales', ascending=False).head(5)



# Optional: Rename the "total_sales" column to "Total Sales"

combined_df.rename(columns={'units_sold':'Total Sales'}, inplace=True)


# Show the top 5 results.

combined_df.pivot_table(index=['retailer', 'region', 'state', 'city'], values='Total Sales', aggfunc='sum').sort_values(by='Total Sales', ascending=False).head(5)



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Total Sales
retailer,region,state,city,Unnamed: 4_level_1
West Gear,West,California,San Francisco,81233
Kohl's,West,California,Los Angeles,74543
Foot Locker,Northeast,New York,New York,72196
Sports Direct,South,Texas,Dallas,67683
Walmart,South,Texas,Houston,65072


### 5. Determine which Retailer Sold the Most Women's Athletic Footwear

In [None]:
# Filter the sales data to get the women's athletic footwear sales data.

women_athletic_footwear_sales = combined_df[combined_df['Total Sales'] == "Women's Apparel"]
total_sales.head(5)

region   state     city        
Midwest  Illinois  Chicago         25407
         Indiana   Indianapolis    26332
         Iowa      Des Moines      23446
         Kansas    Wichita         29463
         Michigan  Detroit         50095
Name: units_sold, dtype: int64

#### Using `groupby`

In [None]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.

top_5_results = combined_df.groupby(['retailer', 'region', 'state', 'city'])['total_sales'].sum().sort_values(ascending=False).head(5)

# Rename the "units_sold" column to "Womens_Footwear_Units_Sold"

combined_df.rename(columns={'units_sold':'Womens_Footwear_Units_Sold'}, inplace=True)


# Show the top 5 results.
top_5_results.head(5)





KeyError: 'Column not found: total_sales'

#### Using `pivot_table`

In [None]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.


# Rename the "units_sold" column to "Womens_Footwear_Units_Sold"

# Show the top 5 results.


### 5. Determine the Day with the Most Women's Athletic Footwear Sales

In [None]:
# Create a pivot table with the 'invoice_date' column is the index, and the "total_sales" as the values.


# Optional: Rename the "total_sales" column to "Total Sales"


# Show the table.


In [None]:
# Resample the pivot table into daily bins, and get the total sales for each day.


# Sort the resampled pivot table in ascending order on "Total Sales".


### 6.  Determine the Week with the Most Women's Athletic Footwear Sales

In [None]:
# Resample the pivot table into weekly bins, and get the total sales for each week.


# Sort the resampled pivot table in ascending order on "Total Sales".
