# 01. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing data

In [2]:
# Root data folder used when importing data.
# Set the PROJECT_DATA_DIR environment variable to point the notebook at another location;
# when it is not set, the original local Google Drive folder is used.
path = os.environ.get(
    'PROJECT_DATA_DIR',
    r'G:\My Drive\CareerFoundry\Python Projects\2023-10 Data Immersion Task 4.5\02 Data',
)

In [3]:
# Columns to load from orders_wrangled.csv; naming them explicitly leaves out the
# redundant index column stored in that file
vars_list = [
    'order_id',
    'user_id',
    'number_of_orders',
    'orders_day_of_week',
    'order_hour_of_day',
    'days_since_prior_order',
]

In [4]:
# Load orders_wrangled.csv, keeping only the columns named in vars_list (usecols parameter)
df_ords = pd.read_csv(
    os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'),
    usecols=vars_list,
)

In [5]:
# Preview the first rows of the imported orders data
df_ords.head()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [6]:
# Load the products data set from the Original Data folder
df_prods = pd.read_csv(
    os.path.join(path, 'Original Data', 'products.csv'),
    index_col=False,
)

In [7]:
# Preview the first rows of the imported products data
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


# 03. What are Data Consistency Checks?

In [8]:
# Check the data type of every column in df_ords (some were changed in the last exercise)
df_ords.dtypes

order_id                    int64
user_id                     int64
number_of_orders            int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [9]:
# Summary statistics for the numeric columns (at this point every column is numeric)
df_ords.describe()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [10]:
# order_id, user_id, orders_day_of_week and order_hour_of_day are cast to str so that
# describe() stops summarising them as numeric measures
df_ords = df_ords.astype(
    dict.fromkeys(['order_id', 'user_id', 'orders_day_of_week', 'order_hour_of_day'], str)
)

In [11]:
# Confirm that the four converted columns now have the object dtype
df_ords.dtypes

order_id                   object
user_id                    object
number_of_orders            int64
orders_day_of_week         object
order_hour_of_day          object
days_since_prior_order    float64
dtype: object

In [12]:
# Default describe() now covers only the two columns that are still numeric
df_ords.describe()

Unnamed: 0,number_of_orders,days_since_prior_order
count,3421083.0,3214874.0
mean,17.15486,11.11484
std,17.73316,9.206737
min,1.0,0.0
25%,5.0,4.0
50%,11.0,7.0
75%,23.0,15.0
max,100.0,30.0


In [13]:
# dtypes that describe() should summarise: text (object) columns plus the numeric ones
include = [
    'object',
    'float',
    'int',
]

In [14]:
# Describe object and numeric columns together, using the dtype list defined in `include`
df_ords.describe(include=include)

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
unique,3421083.0,206209.0,,7.0,24.0,
top,2539329.0,152340.0,,0.0,10.0,
freq,1.0,100.0,,600905.0,288418.0,
mean,,,17.15486,,,11.11484
std,,,17.73316,,,9.206737
min,,,1.0,,,0.0
25%,,,5.0,,,4.0
50%,,,11.0,,,7.0
75%,,,23.0,,,15.0


# 04. Mixed-Type Data

In [15]:
# Create an empty dataframe for practicing with mixed-type data
df_test = pd.DataFrame()

In [16]:
# Create a mixed-type column: 'mix' holds two strings, an integer, and a boolean
df_test['mix'] = ['a', 'b', 1, True]

In [17]:
# Above code creates a new column, mix, within df_test and fills it with numeric, string, and boolean values.

In [18]:
# Display the practice dataframe with its mixed-type column
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [19]:
# Mixed-type columns check: print the name of every column in df_test whose values
# are not all of the same Python type
for col in df_test.columns:
    # Series.map(type) yields the type of each value; more than one distinct type means
    # the column is mixed. This avoids DataFrame.applymap, which is deprecated from
    # pandas 2.1, and does not fail on an empty dataframe.
    if df_test[col].map(type).nunique() > 1:
        print(col)

mix


In [20]:
# Cast every value in 'mix' to str so the column holds a single data type
df_test = df_test.astype({'mix': 'str'})

In [21]:
# Test again for confirmation: print the name of every column in df_test whose values
# are not all of the same Python type (nothing is printed when no column is mixed)
for col in df_test.columns:
    # Series.map(type) yields the type of each value; more than one distinct type means
    # the column is mixed. This avoids DataFrame.applymap, which is deprecated from
    # pandas 2.1, and does not fail on an empty dataframe.
    if df_test[col].map(type).nunique() > 1:
        print(col)

In [22]:
# No columns were printed so all the data types are the same

# 05. Missing Values

In [23]:
# Count the missing values in each column of df_prods (loaded from products.csv)
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [24]:
# Subset of df_prods containing only the rows whose product_name is missing
df_nan = df_prods.loc[df_prods['product_name'].isna()]

In [25]:
# Inspect the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [26]:
# Since missing data is product names (string), best to filter out to a subset

In [27]:
# Record the row count of df_prods before filtering, to compare against the subset
# created once the missing values are removed
df_prods.shape

(49693, 5)

In [29]:
# New subset of df_prods that keeps only the rows with a product_name
df_prods_clean = df_prods.loc[df_prods['product_name'].notna()]

In [30]:
# Row count after removing the rows with a missing product_name
df_prods_clean.shape

(49677, 5)

# 06. Duplicates

In [31]:
# Rows of df_prods_clean that repeat an earlier row in every column (full-row duplicates)
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [32]:
# Inspect the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [33]:
# Record the row count before dropping duplicates, for comparison afterwards
df_prods_clean.shape

(49677, 5)

In [34]:
# New subset without the duplicated rows; the first occurrence of each row is kept
df_prods_clean_no_dups = df_prods_clean.loc[~df_prods_clean.duplicated()]

In [35]:
# Row count after dropping the duplicated rows
df_prods_clean_no_dups.shape

(49672, 5)

# 07. Tidying Up and Exporting Changes

In [36]:
# Export folder: the 'Prepared Data' subfolder of the data directory held in `path`.
# Built with os.path.join so the root folder is not hardcoded a second time.
path_prep = os.path.join(path, 'Prepared Data')

In [37]:
# Export cleaned data to the Prepared Data folder.
# index=False keeps the dataframe index out of the file, so it does not come back as a
# redundant unnamed index column when the CSV is imported again.
df_prods_clean_no_dups.to_csv(os.path.join(path_prep, 'products_checked.csv'), index=False)

# TASK 4.5

##  Q.2 Run the df.describe() function on your df_ords dataframe. Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.

In [38]:
# Describe all columns, reusing the list include = ['object', 'float', 'int'] created in section 03
df_ords.describe(include=include)

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
unique,3421083.0,206209.0,,7.0,24.0,
top,2539329.0,152340.0,,0.0,10.0,
freq,1.0,100.0,,600905.0,288418.0,
mean,,,17.15486,,,11.11484
std,,,17.73316,,,9.206737
min,,,1.0,,,0.0
25%,,,5.0,,,4.0
50%,,,11.0,,,7.0
75%,,,23.0,,,15.0


In [44]:
# Scientific notation conversion - with '+', move the decimal point to the right by the number of places shown in the exponent (e.g. 1.5e+01 = 15.0)

In [53]:
# Cast day of week and hour of day back to int64 for readability
df_ords = df_ords.astype({'orders_day_of_week': 'int64', 'order_hour_of_day': 'int64'})

In [54]:
# Describe all columns again now that day of week and hour of day are back to int64
df_ords.describe(include=include)

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
unique,3421083.0,206209.0,,,,
top,2539329.0,152340.0,,,,
freq,1.0,100.0,,,,
mean,,,17.15486,2.776219,13.45202,11.11484
std,,,17.73316,2.046829,4.226088,9.206737
min,,,1.0,0.0,0.0,0.0
25%,,,5.0,1.0,10.0,4.0
50%,,,11.0,3.0,13.0,7.0
75%,,,23.0,5.0,16.0,15.0


### A) The only thing that looks off is the count of days_since_prior_order, which is about 200k lower than the count of the other columns. This may be due to NaNs, which would coincide with the first order placed by each unique user. Further investigation is needed.

## Q.3 Check for mixed-type data in your df_ords dataframe.

In [55]:
# Mixed-type columns check: print the name of every column in df_ords whose values
# are not all of the same Python type (nothing is printed when no column is mixed)
for col in df_ords.columns:
    # Series.map(type) yields the type of each value; more than one distinct type means
    # the column is mixed. This avoids DataFrame.applymap, which is deprecated from
    # pandas 2.1, and does not fail on an empty dataframe.
    if df_ords[col].map(type).nunique() > 1:
        print(col)

## Q.4 If you find mixed-type data, fix it. The column in question should contain observations of a single data type.

### No columns were printed, so there is no mixed-type data within any single column.

## Q.5 Run a check for missing values in your df_ords dataframe.

In [56]:
# Count the missing values in each column of df_ords
df_ords.isna().sum()

order_id                       0
user_id                        0
number_of_orders               0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [57]:
# Subset of df_ords containing only the rows whose days_since_prior_order is missing
df_ords_nan = df_ords.loc[df_ords['days_since_prior_order'].isna()]

In [58]:
# Preview the first rows of the missing-value subset
df_ords_nan.head()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,


In [59]:
# Inspect the last 25 rows of the missing-value subset as well
df_ords_nan.tail(25)

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
3420613,2168901,206185,1,5,19,
3420624,1831589,206186,1,6,14,
3420628,474057,206187,1,2,23,
3420663,1944921,206188,1,4,10,
3420671,2966736,206189,1,4,15,
3420678,1955107,206190,1,6,9,
3420687,936117,206191,1,6,15,
3420693,3242108,206192,1,6,6,
3420708,313609,206193,1,5,15,
3420750,3395125,206194,1,1,9,


### All NaNs in days_since_prior_order coincide with the first order of that user (number_of_orders = 1). This makes sense: a first order has no preceding order from which to calculate the number of days, so a NaN is returned.

## Q.6 Address the missing values using an appropriate method.

In [60]:
# Replace every NaN in days_since_prior_order with 0
# NOTE(review): the column's minimum was already 0.0 before this fill, so afterwards a
# user's first order cannot be told apart from a genuine 0-day gap - confirm this is acceptable
df_ords = df_ords.fillna({'days_since_prior_order': 0})

In [61]:
# Describe all columns again to see how filling the NaNs changed the statistics
df_ords.describe(include=include)

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
unique,3421083.0,206209.0,,,,
top,2539329.0,152340.0,,,,
freq,1.0,100.0,,,,
mean,,,17.15486,2.776219,13.45202,10.44488
std,,,17.73316,2.046829,4.226088,9.308727
min,,,1.0,0.0,0.0,0.0
25%,,,5.0,1.0,10.0,4.0
50%,,,11.0,3.0,13.0,7.0
75%,,,23.0,5.0,16.0,15.0


### I chose to replace all the NaNs with 0 because that is the true number that should be in that position. Using a separate practice script, I compared the describe() outputs of both dataframes (with NaN and with 0): the count was fixed, there was only a minor change in the mean (11.11 → 10.44) and std (9.21 → 9.31), and all other stats remained the same.

### - I realized I overwrote the orders_wrangled.csv instead of creating a new clean df. This is not best practice and hopefully will not affect my future work on this project.

## Q.7 Run a check for duplicate values in your df_ords data.

In [62]:
# Rows of df_ords that repeat an earlier row in every column (full-row duplicates)
df_ords_dups = df_ords.loc[df_ords.duplicated(keep='first')]

In [63]:
# Inspect the duplicated order rows (an empty result means no full-row duplicates)
df_ords_dups

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order


## Q.8 Address the duplicates using an appropriate method.

### No duplicates were found.

## Q.9 Export final, cleaned data (df_prods has already been exported).

In [64]:
# Export cleaned data to the Prepared Data folder.
# index=False keeps the dataframe index out of the file, so it does not come back as a
# redundant unnamed index column when the CSV is imported again.
df_ords.to_csv(os.path.join(path_prep, 'orders_checked.csv'), index=False)