<a href="https://colab.research.google.com/github/moeenessa31-lgtm/Project/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- In the first cell, we will import the libraries used in the project.

In [4]:
import pandas as pd
import numpy as np
import os

- Download data directly from Kaggle, via **API Token**

  -**(google.colab files)**
  This is the library used to upload API tokens

In [5]:
# upload API Token

from google.colab import files
files.upload()

# Create a new folder (directory) named .kaggle in the user's home folder (~)
!mkdir -p ~/.kaggle
# Copy the access key file to the folder we created in step one
!cp kaggle.json ~/.kaggle/
# Change the access permissions for the copied file to make it secure and private
!chmod 600 ~/.kaggle/kaggle.json

# Retrieve dataset from Kaggle command
!kaggle datasets download -d psparks/instacart-market-basket-analysis

# Unzip files command
!unzip -q instacart-market-basket-analysis.zip -d instacart_data

Saving kaggle.json to kaggle (1).json
Dataset URL: https://www.kaggle.com/datasets/psparks/instacart-market-basket-analysis
License(s): CC0-1.0
instacart-market-basket-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)
replace instacart_data/aisles.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace instacart_data/departments.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace instacart_data/order_products__prior.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace instacart_data/order_products__train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace instacart_data/orders.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace instacart_data/products.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


- Test Exaple

In [6]:
# listdir() >> Available files
data = 'instacart_data/'
print("Available files:")
print(os.listdir(data))

print("-------------------------------")
ds = pd.read_csv(data + 'orders.csv')
ds.head()

Available files:
['aisles.csv', 'products.csv', 'order_products__prior.csv', 'orders.csv', 'order_products__train.csv', 'departments.csv']
-------------------------------


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0




---


# ***Data ingestion & memory-savvy joins, using python (not manual)! Your dataset includes multiple files***



---





In [7]:
# Understanding the original datatypes

aisles = pd.read_csv(data + 'aisles.csv')
print(aisles.dtypes)

print("---------------------------------------")

departments = pd.read_csv(data + 'departments.csv')
print(departments.dtypes)

print("---------------------------------------")

products = pd.read_csv(data + 'products.csv')
print(products.dtypes)

print("---------------------------------------")

orders = pd.read_csv(data + 'orders.csv')
print(orders.dtypes)

print("---------------------------------------")

order_products_prior = pd.read_csv(data + 'order_products__prior.csv')
print(order_products_prior.dtypes)

aisle_id     int64
aisle       object
dtype: object
---------------------------------------
department_id     int64
department       object
dtype: object
---------------------------------------
product_id        int64
product_name     object
aisle_id          int64
department_id     int64
dtype: object
---------------------------------------
order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object
---------------------------------------
order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object


In [8]:
# Defining data types to reduce memory
Types = {
    'order_id': 'uint32',
    'user_id': 'uint32',
    'product_id': 'UInt16',
    'add_to_cart_order': 'UInt8',
    'reordered': 'UInt8',
    'aisle_id': 'UInt8',
    'department_id': 'UInt8',
    'order_dow': 'UInt8',
    'order_hour_of_day': 'UInt8',
    'days_since_prior_order': 'float16',
    'order_number': 'UInt8'
}

In [9]:
# Reading files using enhanced data types
aisles = pd.read_csv(data + 'aisles.csv', dtype={i: Types.get(i, None) for i in ['aisle_id']})

departments = pd.read_csv(data + 'departments.csv', dtype={i: Types.get(i, None) for i in ['department_id']})

products = pd.read_csv(data + 'products.csv', dtype={i: Types.get(i, None) for i in ['product_id', 'aisle_id', 'department_id']})

# Read orders, excluding 'days_since_prior_order' from explicit dtype setting during read_csv
orders = pd.read_csv(data + 'orders.csv', dtype={i: Types.get(i, None) for i in ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day']})
# Convert 'days_since_prior_order' to float16 after loading the DataFrame
orders['days_since_prior_order'] = orders['days_since_prior_order'].astype('float16')

order_products_prior = pd.read_csv(data + 'order_products__prior.csv', dtype={i: Types.get(i, None) for i in ['order_id', 'product_id', 'add_to_cart_order', 'reordered']})

In [10]:
print(aisles.dtypes)

print("---------------------------------------")

print(departments.dtypes)

print("---------------------------------------")

print(products.dtypes)

print("---------------------------------------")

print(orders.dtypes)

print("---------------------------------------")

print(order_products_prior.dtypes)

aisle_id     UInt8
aisle       object
dtype: object
---------------------------------------
department_id     UInt8
department       object
dtype: object
---------------------------------------
product_id       UInt16
product_name     object
aisle_id          UInt8
department_id     UInt8
dtype: object
---------------------------------------
order_id                   uint32
user_id                    uint32
eval_set                   object
order_number                UInt8
order_dow                   UInt8
order_hour_of_day           UInt8
days_since_prior_order    float16
dtype: object
---------------------------------------
order_id             uint32
product_id           UInt16
add_to_cart_order     UInt8
reordered             UInt8
dtype: object





---



* **Merge files to obtain the required dataset**  


---




In [11]:
data_set = pd.merge(orders, order_products_prior, on='order_id', how='left')
data_set = pd.merge(data_set, products, on='product_id', how='left')
data_set = pd.merge(data_set, aisles, on='aisle_id', how='left')
data_set = pd.merge(data_set, departments, on='department_id', how='left')

In [12]:
data_set.head()

  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,soft drinks,beverages
1,2539329,1,prior,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,soy lactosefree,dairy eggs
2,2539329,1,prior,1,2,8,,12427,3,0,Original Beef Jerky,23,19,popcorn jerky,snacks
3,2539329,1,prior,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23,19,popcorn jerky,snacks
4,2539329,1,prior,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,paper goods,household


In [13]:
print(data_set.shape)
print("---------------------")
data_set.info()

(32640698, 15)
---------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32640698 entries, 0 to 32640697
Data columns (total 15 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                uint32 
 1   user_id                 uint32 
 2   eval_set                object 
 3   order_number            UInt8  
 4   order_dow               UInt8  
 5   order_hour_of_day       UInt8  
 6   days_since_prior_order  float16
 7   product_id              UInt16 
 8   add_to_cart_order       UInt8  
 9   reordered               UInt8  
 10  product_name            object 
 11  aisle_id                UInt8  
 12  department_id           UInt8  
 13  aisle                   object 
 14  department              object 
dtypes: UInt16(1), UInt8(7), float16(1), object(4), uint32(2)
memory usage: 1.8+ GB




---


# ***- Exploratory Data Analysis***



---



In [18]:
# Calculate the missing values ​​for each column
Miss_value = data_set.isnull().sum()

# Calculate the percentage of missing values ​​for each column
Miss_per = (Miss_value / len(data_set)) * 100

# Merge and sort the results
set_missing = pd.DataFrame({'Missing Count': Miss_value, 'Missing Percentage': Miss_per})

# Display only columns containing missing values
print(set_missing[set_missing['Missing Count'] > 0])


                        Missing Count  Missing Percentage
days_since_prior_order        2078068            6.366494
product_id                     206209            0.631754
add_to_cart_order              206209            0.631754
reordered                      206209            0.631754
product_name                   206209            0.631754
aisle_id                       206209            0.631754
department_id                  206209            0.631754
aisle                          206209            0.631754
department                     206209            0.631754
