In [3]:
# importing dependencies

import pandas as pd
import numpy as np
from scipy import stats 
import os
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.metrics import r2_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import datetime
import random
import pickle

In [4]:
# Load every olist_* CSV (plus the category translation table) into its own dataframe.
# The paths are read in the order listed and bound to the matching name on the left.

(
    file_cust,
    file_items,
    file_payments,
    file_reviews,
    file_orders,
    file_geo,
    file_products,
    file_sellers,
    file_category,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ecommerce_data/olist_customers_dataset.csv',
        'ecommerce_data/olist_order_items_dataset.csv',
        'ecommerce_data/olist_order_payments_dataset.csv',
        'ecommerce_data/olist_order_reviews_dataset.csv',
        'ecommerce_data/olist_orders_dataset.csv',
        'ecommerce_data/olist_geolocation_dataset.csv',
        'ecommerce_data/olist_products_dataset.csv',
        'ecommerce_data/olist_sellers_dataset.csv',
        'ecommerce_data/product_category_name_translation.csv',
    )
)

EmptyDataError: No columns to parse from file

In [None]:
# Remove rows containing any missing value from each source dataframe.
# NOTE(review): dropna() discards a row if *any* column is missing; confirm this is
# intended for tables that may have optional fields.

(
    file_cust,
    file_geo,
    file_items,
    file_payments,
    file_reviews,
    file_orders,
    file_products,
    file_sellers,
    file_category,
) = (
    frame.dropna()
    for frame in (
        file_cust,
        file_geo,
        file_items,
        file_payments,
        file_reviews,
        file_orders,
        file_products,
        file_sellers,
        file_category,
    )
)

In [None]:
# Build the combined frame: start from orders, attach items (left join),
# then outer-join payments, reviews, products, customers, sellers and category names.

data = (
    file_orders
    .merge(file_items, on='order_id', how='left')
    .merge(file_payments, on='order_id', how='outer', validate='m:m')
    .merge(file_reviews, on='order_id', how='outer')
    .merge(file_products, on='product_id', how='outer')
    .merge(file_cust, on='customer_id', how='outer')
    .merge(file_sellers, on='seller_id', how='outer')
    .merge(file_category, on='product_category_name', how='outer')
)

In [None]:
#data = data.dropna()

In [None]:
# Quick review: show the merged dataframe as this cell's output

data

In [None]:
# Parse every date-like column of the merged frame into datetime64 values (in place)

for date_col in (
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'shipping_limit_date',
    'review_creation_date',
    'review_answer_timestamp',
):
    data[date_col] = pd.to_datetime(data[date_col])

In [None]:
#order_dates, order_times = zip(*[(d.date(), d.time()) for d in data['order_purchase_timestamp']])
#data = data.assign(order_date=order_dates, order_time=order_times)

In [None]:
# Keep a single row per order_id (the first one encountered). Counting these rows per
# customer_unique_id later tells us which customers placed more than one order.

unique_orders = data.drop_duplicates(subset=['order_id'], keep='first')
unique_orders

In [None]:
# Number of distinct customer_unique_id values in the merged frame
# (nunique() does not count missing values)

data['customer_unique_id'].nunique()

In [None]:
# Two-column table: 'unique_values' holds each customer_unique_id and 'counts' holds
# how many distinct orders that customer has in unique_orders.

orders_per_customer = unique_orders['customer_unique_id'].value_counts()
df = orders_per_customer.rename_axis('unique_values').reset_index(name='counts')

In [None]:
# Display the customer / order-count table
df

In [None]:
# Subset of the count table restricted to customers with more than one order

df1 = df[df['counts'].gt(1)]

In [None]:
# Sum of the 'counts' column of df1, i.e. the total of the per-customer order counts
df1['counts'].sum()

In [None]:
# Plain Python list of the customer ids that appear in the repeat-customer table

repeat_list = list(df1['unique_values'])

In [None]:
# Number of repeat customers (length of the id list)

len(repeat_list)

In [None]:
# Count of distinct customer_unique_id values via unique(). Unlike nunique(),
# unique() keeps a missing value as its own entry, so this can be one higher
# when the column contains NaN.
len(data['customer_unique_id'].unique())

In [None]:
# Label every row of the merged frame: 2 when its customer_unique_id is in the
# repeat-customer list, 1 otherwise.

is_repeat_customer = data["customer_unique_id"].isin(repeat_list)
data["repeat?"] = np.where(is_repeat_customer, 2, 1)

In [None]:
# Row counts per 'repeat?' label (2 = in repeat list, 1 = not) across the merged frame

data['repeat?'].value_counts()

In [None]:
# List the column names currently present on the merged frame

data.columns

In [None]:
# Difference between the estimated and the actual customer delivery date.
# Positive values mean the order arrived before the estimate, negative values after it.

estimate_minus_actual = data['order_estimated_delivery_date'] - data['order_delivered_customer_date']
data['delivery_time b/(w)'] = estimate_minus_actual
# Whole-day component of the timedelta (missing timestamps stay missing)
data['days_early_or_late'] = data['delivery_time b/(w)'].dt.days

In [None]:
# Creates column with the absolute wait time between purchase and delivery to customer.
# The duration is delivered - purchased so that it is non-negative for delivered orders.
# (The previous purchase - delivered order produced negative durations, and `.days`
# floors negative timedeltas away from zero, overstating the wait by up to a day.)
# NOTE(review): a model trained on the old negative values must be retrained.

data['time_to_delivery'] = data['order_delivered_customer_date'] - data['order_purchase_timestamp']
# Whole days elapsed; rows with a missing timestamp remain missing
data['total_delivery_days'] = data['time_to_delivery'].dt.days

In [None]:
# Creates binary column indicating whether the order was late compared to estimate

def is_late_func(x):
    """Return 1 if the order arrived after its estimated date, else 0.

    Parameters
    ----------
    x : number
        Whole days of (estimated delivery date - actual delivery date).
        Negative means the delivery happened after the estimate.

    Returns
    -------
    int or None
        1 for a late delivery (x < 0), 0 for on-time/early (x >= 0),
        None when x is NaN (neither comparison holds), matching the
        previous handling of missing values.
    """
    # Previously the flag was inverted: on-time/early rows (x >= 0) were marked 1.
    if x < 0:
        return 1
    if x >= 0:
        return 0
    return None
    
# Apply is_late_func element-wise to the day difference to build the 'late' column
data['late'] = data['days_early_or_late'].apply(is_late_func)

In [None]:
# Time elapsed between a review being created and its answer timestamp,
# kept both as a timedelta and as a whole-day count.

answer_delay = data['review_answer_timestamp'] - data['review_creation_date']
data['review_answer_time'] = answer_delay
data['review_response_days'] = data['review_answer_time'].dt.days

In [None]:
# Freight cost expressed as a fraction of the payment amount on the same row

data['freight_percent'] = data['freight_value'].div(data['payment_value'])

In [None]:
# Creates dataframe with only first orders, since that is what we're trying to measure
# for repeat customers: one row per customer_unique_id, taken from their earliest purchase.
#
# The earlier `order_purchase_timestamp != 0` filter compared datetimes with an integer,
# which is True for every row (including missing timestamps) and therefore filtered
# nothing. Rows without a purchase timestamp are now dropped explicitly.
# Built as a new frame (no inplace mutation) so the cell can be re-run safely.

first_order_df = (
    data.loc[data['order_purchase_timestamp'].notna()]
    .sort_values(by='order_purchase_timestamp')
    .drop_duplicates(subset='customer_unique_id', keep='first')
)

In [None]:
# Double checking amounts in dataframe: rows per 'repeat?' label after reducing
# to one row per customer

first_order_df['repeat?'].value_counts()

In [None]:
# Keep only the candidate predictor columns plus the 'repeat?' label,
# as an independent copy of the first-order frame.

df_clean = first_order_df.loc[
    :,
    [
        'freight_value',
        'payment_installments',
        'payment_value',
        'review_score',
        'total_delivery_days',
        'days_early_or_late',
        'review_response_days',
        'freight_percent',
        'late',
        'repeat?',
    ],
].copy()

In [None]:
# Discard every row that has a missing value in any selected column so the
# correlation and model steps receive complete rows only.

df_clean = df_clean.dropna(axis=0, how='any')
df_clean

In [None]:
# Column dtypes of df_clean at this point in the notebook
print(df_clean.dtypes)

In [None]:
# Cast the count-like columns to integers in a single astype call

df_clean = df_clean.astype(
    {
        'payment_installments': 'int',
        'review_score': 'int',
        'days_early_or_late': 'int',
        'review_response_days': 'int',
        'late': 'int',
    }
)

In [None]:
# Print the dtypes again to inspect the result of the casts
print(df_clean.dtypes)

In [None]:
# Rows of df_clean labelled as repeat customers ('repeat?' equal to 2)

repeat_df = df_clean[df_clean['repeat?'].eq(2)]

In [None]:
# Row count of the repeat-customer frame, then show the frame itself
repeats = repeat_df.shape[0]
repeat_df

In [None]:
# Rows of df_clean labelled as one-time customers ('repeat?' equal to 1)

non_repeat_df = df_clean[df_clean['repeat?'].eq(1)]

In [None]:
# Downsample the non-repeat customers to the same number of rows as the repeat group.
# The rows reach this point ordered by purchase time, so slicing the first `repeats`
# rows would keep only the earliest customers; draw a seeded random sample instead.
# min() guards against asking for more rows than exist.
non_repeat_df = non_repeat_df.sample(
    n=min(repeats, len(non_repeat_df)),
    random_state=28,  # same seed value the notebook uses for train_test_split
)
non_repeat_df

In [None]:
# Stack the repeat rows and the downsampled non-repeat rows into one balanced frame.
# With the original class ratio the model predicted "non-repeat" every time.

even_df = pd.concat([repeat_df, non_repeat_df], axis=0)

In [None]:
# Display the balanced frame
even_df

In [None]:
#df_clean = df_clean.dropna()

In [None]:
# Pairwise correlations of the balanced frame, drawn as an annotated heatmap

corrMatrix = even_df.corr()
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

In [None]:
# Same correlations as a table, ordered from the strongest positive to the
# strongest negative correlation with the 'repeat?' label.

corr_df = even_df.corr()
sorted_corr_df = corr_df.sort_values('repeat?', ascending=False)
sorted_corr_df

In [None]:
#df_clean = df_clean.dropna()

In [None]:
# Split the balanced frame 80/20, fit a logistic regression on the training part,
# and predict the 'repeat?' label for the held-out part.

X = even_df[['days_early_or_late', 'total_delivery_days', 'late', 'review_score']].to_numpy()
y = even_df['repeat?'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=28,
)

# fit() returns the estimator itself, so construction and training can be chained
logisticModel = LogisticRegression().fit(X_train, y_train)

prediction = logisticModel.predict(X_test)
print(prediction)

In [None]:
# Report per-class precision/recall/F1 and the overall accuracy on the held-out set.
# classification_report and accuracy_score come from the notebook's import cell.

report_text = classification_report(y_test, prediction)
print(report_text)
print("Model score:", accuracy_score(y_test, prediction))

In [None]:
# Serialize the fitted logistic regression to disk for the interactive web page

with open('repeat_customer.pkl', mode='wb') as file:
    pickle.dump(logisticModel, file)