In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os


In [2]:
# Set the path to the main project folder
path = r'C:\Users\luis\Desktop\Carrer Foundry boot camp\Python Fundamentals for Data Analysts'


In [3]:
path

'C:\\Users\\luis\\Desktop\\Carrer Foundry boot camp\\Python Fundamentals for Data Analysts'

In [4]:
# Import the orders data
df_ords = pd.read_csv(os.path.join(path, 'Data', 'Original Data', '4.3_orders_products', 'orders.csv'))

# Import the products data
df_prods = pd.read_csv(os.path.join(path, 'Data', 'Original Data', '4.3_orders_products', 'products.csv'))


In [5]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [6]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


# Perform Wrangling Procedures

In [7]:
# Drop the eval_set column
df_ords = df_ords.drop(columns=['eval_set'], errors='ignore')

In [8]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# Change a Variable's Data Type

In [9]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


In [10]:
# Drop rows with NaN values in the days_since_prior_order column
df_ords = df_ords.dropna(subset=['days_since_prior_order'])

# Change the data type to int64
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].astype('int64')

In [11]:
# Verify the change
df_ords.info()
df_ords.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3214874 entries, 1 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype
---  ------                  -----
 0   order_id                int64
 1   user_id                 int64
 2   order_number            int64
 3   order_dow               int64
 4   order_hour_of_day       int64
 5   days_since_prior_order  int64
dtypes: int64(6)
memory usage: 171.7 MB


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
1,2398795,1,2,3,7,15
2,473747,1,3,3,12,21
3,2254736,1,4,4,7,29
4,431534,1,5,4,15,28
5,3367565,1,6,2,7,19


# Rename an Unintuitive Column

In [12]:
# Rename the order_dow column
df_ords_2 = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})
# Rename the order_hour_of_day column without overwriting the dataframe
df_ords_2 = df_ords.rename(columns={'order_hour_of_day': 'order_hour'})

In [13]:
df_ords_2.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour,days_since_prior_order
1,2398795,1,2,3,7,15
2,473747,1,3,3,12,21
3,2254736,1,4,4,7,29
4,431534,1,5,4,15,28
5,3367565,1,6,2,7,19


# Find the Busiest Hour for Placing Orders

In [14]:
# Find the frequency of orders per hour
order_hour_freq = df_ords_2['order_hour'].value_counts()
busiest_hour = order_hour_freq.idxmax()
print(f"The busiest hour for placing orders is {busiest_hour}.")


The busiest hour for placing orders is 10.


# Create a Subset for Breakfast Items

In [15]:
# Subset for breakfast items
# according to the Kaggle website, department id fro breakfast is 14
df_breakfast = df_prods[df_prods['department_id'] == 14]  
df_breakfast.head()


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


# Create a Subset for Dinner Party Items


In [16]:
# Subset for dinner party items
dinner_party_depts = [5, 20, 7, 12]  # these are the department_ids for alcohol, deli, beverages, and meat/seafood
df_dinner_party = df_prods[df_prods['department_id'].isin(dinner_party_depts)]
df_dinner_party.head()


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


# Count the Number of Rows in the Last Dataframe

In [17]:
# Count the number of rows in df_dinner_party
num_rows_dinner_party = df_dinner_party.shape[0]
print(f"The number of rows in the dinner party dataframe is {num_rows_dinner_party}.")


The number of rows in the dinner party dataframe is 7650.


# Extract Information About a Specific User

In [18]:
# Extract information about user_id = 1
user_id_1_data = df_ords[df_ords['user_id'] == 1]
print(user_id_1_data)


    order_id  user_id  order_number  order_dow  order_hour_of_day  \
1    2398795        1             2          3                  7   
2     473747        1             3          3                 12   
3    2254736        1             4          4                  7   
4     431534        1             5          4                 15   
5    3367565        1             6          2                  7   
6     550135        1             7          1                  9   
7    3108588        1             8          1                 14   
8    2295261        1             9          1                 16   
9    2550362        1            10          4                  8   
10   1187899        1            11          4                  8   

    days_since_prior_order  
1                       15  
2                       21  
3                       29  
4                       28  
5                       19  
6                       20  
7                       14  
8       

# Provide Basic Stats About User's Behavior

In [19]:
# Basic stats about user_id = 1
user_id_1_stats = user_id_1_data.describe()
print(user_id_1_stats)


           order_id  user_id  order_number  order_dow  order_hour_of_day  \
count  1.000000e+01     10.0      10.00000  10.000000          10.000000   
mean   1.861862e+06      1.0       6.50000   2.700000          10.300000   
std    1.109231e+06      0.0       3.02765   1.337494           3.591657   
min    4.315340e+05      1.0       2.00000   1.000000           7.000000   
25%    7.095760e+05      1.0       4.25000   1.250000           7.250000   
50%    2.274998e+06      1.0       6.50000   3.000000           8.500000   
75%    2.512470e+06      1.0       8.75000   4.000000          13.500000   
max    3.367565e+06      1.0      11.00000   4.000000          16.000000   

       days_since_prior_order  
count               10.000000  
mean                19.000000  
std                  9.030811  
min                  0.000000  
25%                 14.250000  
50%                 19.500000  
75%                 26.250000  
max                 30.000000  


# Export Orders Dataframes

In [20]:
df_ords_2.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'orders_wrangled.csv'), index=False)

# Export Department Dataframes

In [29]:
# Import the department  data
df_dep = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'departments.csv'))


In [37]:
df_dep

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


## important note: in the prvious exercise the data set did not contain the "departmentcsv" file. i had to downloaded directly from the Kaggle website. the department data set was already Transpose

In [40]:
### for the Task completition sake, im going to export the dept data set as df_dep_t:

df_dep_t_new= df_dep
df_dep_t_new.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'departments_wrangled.csv'), index=False)