# 01. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing data

In [2]:
# Shortcut for importing original data files
path_og = r'G:\My Drive\CareerFoundry\Python Projects\2023-10 Instacart Basket Analysis\02 Data\Original Data'

In [3]:
path_og

'G:\\My Drive\\CareerFoundry\\Python Projects\\2023-10 Instacart Basket Analysis\\02 Data\\Original Data'

In [4]:
# Creates a list with only the 'necessary' columns from the data set
vars_list = ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']

In [5]:
vars_list

['order_id',
 'user_id',
 'order_number',
 'order_dow',
 'order_hour_of_day',
 'days_since_prior_order']

In [6]:
# Imports orders data set using the vars_list - usecols function
df_ords = pd.read_csv(os.path.join(path_og, 'orders.csv'), usecols = vars_list)

In [7]:
# Imports products data set
df_prods = pd.read_csv(os.path.join(path_og, 'products.csv'), index_col = False)

# 03. Exercise practice functions - DATA WRANGLING

## - Dropping Columns

In [8]:
# value_counts() function
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [9]:
# NaN represents missing values

## - Renaming Columns

In [10]:
# df.rename() function - change ords_dow to ords_day_of_week
# always confirm with client that name change is allowed
df.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

NameError: name 'df' is not defined

In [None]:
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [None]:
df_ords.head()

## - Changing a Variable's Data Type

In [None]:
# Change dt before using describe()
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [None]:
# Use dtype() to confirm change - ** SINGULAR dtype b/c you're only checking one column
df_ords['order_id'].dtype

In [None]:
# 'O' stands for object, pandas' version of a string

## - Transposing Data

In [None]:
# I have to create a new path_og b/c I did not add orders and products csvs to 4.4 folder, but imported them from a different location.
# I'm correcting this for derpartments csv file

In [None]:
path_og_2 = r'G:\My Drive\CareerFoundry\Python Projects\2023-10 Data Immersion Task 4.4\02 Data\Original Data'

In [None]:
# Imports departments data set
df_dep = pd.read_csv(os.path.join(path_og_2, 'departments.csv'), index_col = False)

In [None]:
df_dep.head()

In [None]:
# transposes from wide to long format and vice-versa
df_dep.T

In [None]:
# To rewrite transpotion save new tranposed version as a new data frame
df_dep_t = df_dep.T

In [None]:
# check new df
df_dep_t

In [None]:
 # transpo confused pandas and made 'O' a header
## to change, FIRST add an index to new df

In [None]:
df_dep_t.reset_index()

In [None]:
# SECOND, create a new head (3 steps)

In [None]:
# Step 1 - Take the first row of df_dep_t for the header
new_header = df_dep_t.iloc[0]

In [None]:
new_header

In [None]:
# Step 2 - create a new dataframe that only copies over rows beyond the first row
df_dep_t_new = df_dep_t[1:]

In [None]:
df_dep_t_new

In [None]:
# Add the new header

In [None]:
# Set the header row as the df header
df_dep_t_new.columns = new_header

In [None]:
df_dep_t_new

## DATA DICTIONARIES

In [None]:
# to_dict function - changes df_dep_t into a data dictionary
data_dict = df_dep_t_new.to_dict('index')

In [None]:
data_dict

In [None]:
df_prods.head()

In [None]:
# Use the new data dictionary to determine what the department ids stand for by calling...

In [None]:
print(data_dict.get('19'))

## SUBSETTING

In [None]:
# 1. Use FOLLOW COMMAND to create a subset
# indexing code of the follow command
df_prods['department_id']==19

In [None]:
df_prods[df_prods['department_id']==19]

In [None]:
# Enclosing df_prods['department_id']==19 inside df_prods[] tells Python to map the true/false procedure you witnessed in the 
# previous output onto the columns within the df_prods dataframe—the result being a list of only those values within df_prods 
# that are true (and, subsequently, only those values within df_prods that have a "department_id" of 19).

In [None]:
# Save all these results into a new dataframe - df_snacks
df_snacks =  df_prods[df_prods['department_id']==19]

In [None]:
df_snacks.head()

In [None]:
# 2. Use loc function to create subset - loc looks for a column name
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [None]:
# OR this command
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [None]:
# 2nd command, rather than using an equals sign, you’re telling the loc function to look into a list: isin([19]). 
# This is handy b/c you could use more than one value if you wanted to check multiple "department_ids" at the same time 
# (isin([17,18,19]))

## EXPORTING DATAFRAMES

In [None]:
# Export command - df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'))
# Make sure to export into Prepared Data folder since it has been altered

# TASK 4.4

### Q.2 Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [None]:
# Change user_id from 'int' to 'object'
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [None]:
df_ords['user_id'].dtype

In [None]:
# orders_day_of_week and order_hour_of_day should also be changed to an object
df_ords['orders_day_of_week'] = df_ords['orders_day_of_week'].astype('str')

In [None]:
df_ords['order_hour_of_day'] = df_ords['order_hour_of_day'].astype('str')

In [None]:
df_ords.dtypes

### Q.3 Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe.

In [None]:
# order_number sounds like an id - change to number_of_orders for clarity
df_ords_v2 = df_ords.rename(columns = {'order_number' : 'number_of_orders'}, inplace = False)

In [None]:
df_ords.head()

In [None]:
df_ords_v2.head()

In [None]:
# From here on, df_ords_v2 is the current dataframe

## Q.4 Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

In [None]:
df_ords_v2['order_hour_of_day'].value_counts()

### The busiest hour of the day is 10am.

## Q.5 Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

In [None]:
# Needed data dictionary already exists from previous code - data_dict = df_dep_t_new.to_dict('index')
# Only calling the specific dept.id is necessary

In [None]:
print(data_dict.get('4'))

### The value 4 in the department_id column represents the Produce department.

## Q.6 The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [None]:
# First, determing breakfast items dept. id
data_dict

In [None]:
# Breakfast id is 14

In [None]:
# Create a subset of department 14 only
df_prods_breakfast = df_prods[df_prods['department_id']==14]

In [None]:
df_prods_breakfast.head(None)

In [None]:
# Concern: This subset doesn't give any information about the actual sales of breakfast items and I don't see a 'foreign key'
# linking products to orders.

## Q.7 They’d also like to see details about products that customers might use to throw dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [None]:
# First, find corresponding dept. ids
data_dict

In [None]:
# alcohol = 5, deli = 20, beverages = 7, meat/seafood = 12

In [None]:
# Use loc and isin function for multiple filters
df_prods_din_parties = df_prods.loc[df_prods['department_id'].isin([5, 20, 7, 12])]

In [None]:
df_prods_din_parties.head(None)

## Q.8 It’s important that you keep track of total counts in your dataframes. How many rows does the last dataframe you created have?

In [None]:
# use shape to find # of rows
df_prods_din_parties.shape

### The Dinner Parties df has 7,650 rows.

## Q.9 Someone from the data engineers team in Instacart thinks they’ve spotted something strange about the customer with a "user_id" of “1.” Extract all the information you can about this user.

In [None]:
df_userid_1 = df_ords_v2[df_ords_v2['user_id']=='1']

In [None]:
df_userid_1.head(None)

In [None]:
df_userid_1.shape

## Q.10 You also need to provide some details about this user’s behavior. What basic stats can you provide based on the information you have?

In [None]:
df_userid_1.describe()

In [None]:
# First attempt to describe() did not include objects

In [None]:
# list of dtypes to include
include = ['object', 'float', 'int']

In [None]:
df_userid_1.describe(include=include)

### Basic Stats on user_id 1:

### 1. Has placed 11 separate orders over the last 6 mos.

### 2. Typically orders every 2-4 weeks.

### 3. Typically orders between 7am-9am.

### 4. Only orders on Tuesday - Friday, never on Saturday - Monday.

In [None]:
# Path to prepared data folder

In [None]:
path_prep = r'G:\My Drive\CareerFoundry\Python Projects\2023-10 Data Immersion Task 4.4\02 Data\Prepared Data'

In [None]:
# Exporting df_ords_v2
df_ords_v2.to_csv(os.path.join(path_prep, 'orders_wrangled.csv'))

In [None]:
# Exporting df_dep_t_new
df_dep_t_new.to_csv(os.path.join(path_prep, 'departments_wrangled.csv'))