# 01 Mixed Type Data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Set path
# The data folder can be overridden with the INSTACART_DATA_PATH environment
# variable so the notebook can run on another machine; when the variable is
# unset, the original local folder is used.
path = os.environ.get(
    'INSTACART_DATA_PATH',
    r"C:\Users\miche\Instacart Basket Analysis 2022_MP\02 Data",
)

In [4]:
# Load the wrangled products table from the 'Prepared Data' folder
products_csv = os.path.join(path, 'Prepared Data', 'products_wrangled.csv')
products = pd.read_csv(products_csv, index_col=False)

In [6]:
# List the column names of the imported products df
products.columns

Index(['Unnamed: 0', 'product_id', 'product_name', 'department_id', 'prices'], dtype='object')

In [9]:
# Discard the leftover 'Unnamed: 0' column that came in with the CSV
products = products.drop('Unnamed: 0', axis='columns')

In [10]:
# Check the dimensions after dropping the extra column
products.shape

(49693, 4)

In [11]:
# Summarise dtypes and non-null counts per column
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49693 entries, 0 to 49692
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49693 non-null  int64  
 1   product_name   49677 non-null  object 
 2   department_id  49693 non-null  int64  
 3   prices         49693 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 1.5+ MB


In [12]:
# Preview the first five rows
products.head()

Unnamed: 0,product_id,product_name,department_id,prices
0,1,Chocolate Sandwich Cookies,19,5.8
1,2,All-Seasons Salt,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,1,10.5
4,5,Green Chile Anytime Sauce,13,4.3


In [13]:
# Check for mixed data types
# A column is reported when its values span more than one Python type
# (for example str mixed with the float used for NaN).
# Series.map is used because DataFrame.applymap is deprecated in recent
# pandas releases; it also works on an empty df and avoids building a
# boolean-filtered copy of the whole df for every column.
for col in products.columns:
    if products[col].map(type).nunique() > 1:
        print(col)

product_name


In [14]:
# Inspect the dtype of the column flagged by the mixed-type check
products['product_name'].dtype

dtype('O')

# 02 Missing Data

In [15]:
# Count missing values per column
products.isna().sum()

product_id        0
product_name     16
department_id     0
prices            0
dtype: int64

In [16]:
# Collect the rows whose product_name is missing into their own df
missing_name = products['product_name'].isna()
products_nan = products[missing_name]

In [17]:
# Display the rows with a missing product_name
products_nan

Unnamed: 0,product_id,product_name,department_id,prices
33,34,,14,12.2
68,69,,7,11.8
115,116,,3,10.8
261,262,,13,12.1
525,525,,11,1.2
1511,1511,,16,14.3
1780,1780,,11,12.3
2240,2240,,1,14.2
2586,2586,,13,12.4
3159,3159,,11,13.1


In [18]:
# Record the shape before removing entries with missing product_name values
# (this cell only displays the shape; the removal happens in the next cells)
products.shape

(49693, 4)

In [19]:
# Option 1: build a separate df that keeps only rows with a product_name
has_name = products['product_name'].notna()
products_no_miss = products[has_name]

In [20]:
# Shape of the filtered copy, for comparison with the original df
products_no_miss.shape

(49677, 4)

In [21]:
# Option 2: drop the rows with a missing product_name from the working df itself
products = products.dropna(subset=['product_name'])

In [22]:
# Shape after dropping the missing rows from products itself
products.shape

(49677, 4)

#### `products` now matches the shape of `products_no_miss`

# 03 Duplicates

In [23]:
# Flag fully duplicated rows and pull them into their own df for viewing
is_dup = products.duplicated()
products_dups = products[is_dup]

In [24]:
# Display the duplicated rows
products_dups

Unnamed: 0,product_id,product_name,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,11,4.8
18459,18458,Ranger IPA,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,14,6.8
35495,35491,Adore Forever Body Wash,11,9.9


In [25]:
# Preview the df with duplicate rows removed; the result is only displayed
# here, not assigned, so products itself is unchanged by this cell
products.drop_duplicates()

Unnamed: 0,product_id,product_name,department_id,prices
0,1,Chocolate Sandwich Cookies,19,5.8
1,2,All-Seasons Salt,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,1,10.5
4,5,Green Chile Anytime Sauce,13,4.3
...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,1,3.1
49690,49686,Artisan Baguette,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,8,4.7


In [26]:
# Overwrite df to drop duplicates (called with default arguments, so the
# first occurrence of each duplicated row is kept)
products = products.drop_duplicates()

In [27]:
# Confirm the row count after removing duplicates
products.shape

(49672, 4)

In [28]:
# Summary statistics for the numeric columns
# NOTE(review): the output reports a maximum price of 99999.0 and a standard
# deviation far above the mean — looks like an outlier or placeholder value
# that has not been handled in this notebook; TODO confirm
products.describe()

Unnamed: 0,product_id,department_id,prices
count,49672.0,49672.0,49672.0
mean,24850.349775,11.728942,9.993282
std,14340.705287,5.850779,453.615536
min,1.0,1.0,1.0
25%,12432.75,7.0,4.1
50%,24850.5,13.0,7.1
75%,37268.25,17.0,11.1
max,49688.0,21.0,99999.0


In [29]:
# Write the cleaned products df to the 'Prepared Data' folder
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, so re-importing this file will show an 'Unnamed: 0' column like
# the one dropped earlier in this notebook. Consider index=False once
# downstream notebooks that read this file have been checked.
products_out = os.path.join(path, 'Prepared Data', 'products_clean.csv')
products.to_csv(products_out)

In [30]:
# Delete the helper dfs that are no longer needed
del products_nan, products_no_miss, products_dups

# 04 Task, Steps 3-9

In [31]:
# Load the wrangled orders table from the 'Prepared Data' folder
orders_csv = os.path.join(path, 'Prepared Data', 'orders_wrangled.csv')
orders = pd.read_csv(orders_csv, index_col=False)

In [32]:
# Check the dimensions of the imported orders df
orders.shape

(3421083, 7)

In [33]:
# List the column names of the imported orders df
orders.columns

Index(['Unnamed: 0', 'order_id', 'customer_id', 'order_number', 'order_day',
       'order_hour', 'previous_order'],
      dtype='object')

In [34]:
# Discard the leftover 'Unnamed: 0' column that came in with the CSV
orders = orders.drop('Unnamed: 0', axis='columns')

In [35]:
# Check the dimensions after dropping the extra column
orders.shape

(3421083, 6)

In [36]:
# Summarise the dtype of each column
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   order_id        int64  
 1   customer_id     int64  
 2   order_number    int64  
 3   order_day       int64  
 4   order_hour      int64  
 5   previous_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


In [37]:
# Summary statistics for the numeric columns
orders.describe()

Unnamed: 0,order_id,customer_id,order_number,order_day,order_hour,previous_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [38]:
#3 Check for mixed type data
# A column is reported when its values span more than one Python type.
# Series.map is used because DataFrame.applymap is deprecated in recent
# pandas releases; it also works on an empty df and avoids building a
# boolean-filtered copy of the whole df for every column.
for col in orders.columns:
    if orders[col].map(type).nunique() > 1:
        print(col)

#### 4. No mixed data types

In [40]:
#5 Count missing values per column
orders.isna().sum()

order_id               0
customer_id            0
order_number           0
order_day              0
order_hour             0
previous_order    206209
dtype: int64

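#### 6. The 206,209 missing `previous_order` values equal the number of customers (the highest `customer_id` is 206209), which is consistent with each customer's first order having no previous order to measure from.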

In [41]:
#7 Check for duplicates
# drop_duplicates() returns the de-duplicated df for display only; the result
# is not assigned, so orders itself is unchanged by this cell
orders.drop_duplicates()

Unnamed: 0,order_id,customer_id,order_number,order_day,order_hour,previous_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [44]:
# Pull any fully duplicated rows into their own df for viewing
is_dup = orders.duplicated()
order_dups = orders[is_dup]

In [45]:
# Display the duplicated rows
order_dups

Unnamed: 0,order_id,customer_id,order_number,order_day,order_hour,previous_order


#### 8. No duplicates present.

In [46]:
# Confirm the dimensions of orders before export
orders.shape

(3421083, 6)

In [47]:
# Write the checked orders df to the 'Prepared Data' folder
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, so re-importing this file will show an 'Unnamed: 0' column like
# the one dropped earlier in this notebook. Consider index=False once
# downstream notebooks that read this file have been checked.
orders_out = os.path.join(path, 'Prepared Data', 'orders_clean.csv')
orders.to_csv(orders_out)