# eCommerce Repurchase Prediction

In this project, I utilized data from an eCommerce platform __[link](https://gist.github.com/jeremystan/c3b39d947d9b88b3ccff3147dbcf6c6b)__ to predict repurchase. The project comprises four main parts:

Part 1 - Data Exploration: This involves visualization and quality checks of the data.

Part 2 - ETL (Extract, Transform, Load): Tasks include creating a Postgres database, feature engineering, and loading data for modeling in SQL.

Part 3 - Data Preprocessing and Modeling: Preprocessing the data and building predictive models with XGBoost and Random Forest.

Part 4 - Business Recommendations: Providing actionable insights and recommendations based on the analysis and modeling results.




# Part 2


## 1. Create Postgres Database

### Export data from Python to Postgres

In [1]:
import glob
import pandas as pd
import os as os
from sqlalchemy import create_engine
import psycopg2 
import io

# Folder holding every raw CSV export. Defined once so that moving the data
# only needs a single edit, and so all six files use the same path style.
DATA_DIR = r'C:\Users\shenl\OneDrive\Documents\eCommerce DS project'

# Postgres schema that receives the raw tables.
SCHEMA = "Ecommerce_Project"


def _load_csv(table_name):
    """Read ``<DATA_DIR>/<table_name>.csv`` into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, table_name + ".csv"))


# Keep one module-level name per table so other cells can keep using them.
orders = _load_csv("orders")
aisles = _load_csv("aisles")
departments = _load_csv("departments")
order_products_prior = _load_csv("order_products_prior")
order_products_train = _load_csv("order_products_train")
products = _load_csv("products")

# Connection URL layout:
#   postgresql+psycopg2://<user>:<password>@<host>:<port>/<database_name>
#   user          -> postgres
#   password      -> class
#   host:port     -> localhost:5432 (server running locally)
#   database_name -> postgres
# NOTE(review): the credentials are hard-coded in this cell; consider moving
# them out of the notebook before sharing it.
engine = create_engine('postgresql+psycopg2://postgres:class@localhost:5432/postgres')

# Create one table per DataFrame, in the same order as before.
# NOTE: to_sql is left at its pandas defaults (if_exists='fail', index=True),
# so re-running this cell against tables that already exist raises ValueError.
raw_tables = {
    'orders': orders,
    'aisles': aisles,
    'departments': departments,
    'order_products_prior': order_products_prior,
    'order_products_train': order_products_train,
    'products': products,
}
for table_name, frame in raw_tables.items():
    rows_written = frame.to_sql(table_name, engine, schema=SCHEMA)

# Last expression of the cell: shows the value returned by the final
# (products) to_sql call, as the original cell did.
rows_written

688

In [14]:
import glob
import pandas as pd
import os as os
from sqlalchemy import create_engine
import psycopg2 
import io

# Connection URL layout:
#   postgresql+psycopg2://<user>:<password>@<host>:<port>/<database_name>
#   user          -> postgres
#   password      -> class
#   host:port     -> localhost:5432 (server running locally)
#   database_name -> postgres
engine = create_engine('postgresql+psycopg2://postgres:class@localhost:5432/postgres')

# Read model_data.csv into a DataFrame.
model_data = pd.read_csv(r'C:\Users\shenl\OneDrive\Documents\eCommerce DS project\model_data.csv')

# Write the frame to table "model_data" in the Ecommerce_Project schema.
# Kept as the last expression so the notebook still shows the returned value.
model_data.to_sql(name='model_data', con=engine, schema="Ecommerce_Project")

924

In [15]:
import glob
import pandas as pd
import os as os
from sqlalchemy import create_engine
import psycopg2 
import io

# Connection URL layout:
#   postgresql+psycopg2://<user>:<password>@<host>:<port>/<database_name>
#   user          -> postgres
#   password      -> class
#   host:port     -> localhost:5432 (server running locally)
#   database_name -> postgres
engine = create_engine('postgresql+psycopg2://postgres:class@localhost:5432/postgres')

# Read model_data_department_product.csv into a DataFrame.
model_data_department_product = pd.read_csv(r'C:\Users\shenl\OneDrive\Documents\eCommerce DS project\model_data_department_product.csv')

# Write the frame to table "model_data_department_product" in the
# Ecommerce_Project schema. Kept as the last expression so the notebook
# still shows the returned value.
model_data_department_product.to_sql(
    name='model_data_department_product',
    con=engine,
    schema="Ecommerce_Project",
)

924

## 2. Feature Engineering and Load Modeling Data in SQL
[Click here to view the SQL code for ETL and feature engineering](SQL%20part%202%20-%20ETL%20and%20feature%20engineering.sql)


In [12]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the modeling table, order the rows by user and then product, and drop
# rows that are exact duplicates across every column (the file contains some).
model_data = (
    pd.read_csv(r'C:\Users\shenl\OneDrive\Documents\eCommerce DS project\model_data.csv')
    .sort_values(by=['user_id', 'product_id'])
    .drop_duplicates()
)
model_data  # last expression of the cell, so the notebook renders the frame

Unnamed: 0,user_id,product_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,target,user_num_orders,user_num_products,user_most_order_day,...,product_num_orders,product_num_users,product_most_order_day,product_most_order_hour,product_avg_days_since_prior_order,user_product_num_orders,user_product_avg_add_to_cart_order,user_product_avg_reorder,user_product_most_order_day,user_product_most_order_hour
242108,50,1202,54,6,10,3.0,0,67,453,1,...,91,42,0,12,11.333333,1,11.0,0.0,6,10
714636,50,2132,30,3,12,7.0,0,67,453,1,...,37,12,1,13,15.342857,1,2.0,0.0,3,12
567087,50,2643,2,6,12,10.0,0,67,453,1,...,24,13,0,14,8.600000,1,4.0,0.0,6,12
737892,50,4601,9,4,11,5.0,0,67,453,1,...,9,7,0,14,11.000000,1,10.0,0.0,4,11
556841,50,4656,11,6,9,4.0,0,67,453,1,...,194,124,1,14,12.513661,1,6.0,0.0,6,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419903,206175,44661,2,6,21,7.0,0,6,49,0,...,169,92,1,9,10.729032,1,5.0,0.0,6,21
323765,206175,45007,1,6,17,,1,6,49,0,...,2474,779,0,11,10.454661,1,3.0,0.0,6,17
708296,206175,46802,1,6,17,,0,6,49,0,...,541,251,0,11,12.222892,1,9.0,0.0,6,17
174769,206175,47144,1,6,17,,0,6,49,0,...,824,257,1,11,11.948617,1,6.0,0.0,6,17
