In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
import config
from config import CONNSTRING
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the data from SQL and create a DataFrame
engine= create_engine(CONNSTRING)

query = "SELECT * FROM customer_data"
df = pd.read_sql(query, engine)

# Display the DataFrame
df.head()

Unnamed: 0,Customer ID,age,gender,Item Purchased,category,Purchase Amount (USD),location,size,color,season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53.0,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64.0,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90.0,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [3]:
# Dispay column names
df.columns

Index(['Customer ID', 'age', 'gender', 'Item Purchased', 'category',
       'Purchase Amount (USD)', 'location', 'size', 'color', 'season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases'],
      dtype='object')

In [6]:
# Display data amount
df.shape[0]

3900

In [7]:
# Handle Missing Values: 
df.isna().sum()

Customer ID               0
age                       0
gender                    0
Item Purchased            0
category                  0
Purchase Amount (USD)     0
location                  0
size                      0
color                     0
season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

In [8]:
# Display the data types for each column
print(df.dtypes)

Customer ID                 int64
age                         int64
gender                     object
Item Purchased             object
category                   object
Purchase Amount (USD)     float64
location                   object
size                       object
color                      object
season                     object
Review Rating             float64
Subscription Status        object
Shipping Type              object
Discount Applied           object
Promo Code Used            object
Previous Purchases          int64
Payment Method             object
Frequency of Purchases     object
dtype: object


In [9]:
# Converting columns to appropriate data types
df['Customer ID'] = df['Customer ID'].astype(int)  # Assuming 'Customer ID' is an integer
df['age'] = df['age'].astype(int)  # Age is an integer
df['gender'] = df['gender'].astype('category')  # Gender as category
df['Item Purchased'] = df['Item Purchased'].astype('category')  # Item Purchased as category
df['category'] = df['category'].astype('category')  # Category as category
df['Purchase Amount (USD)'] = df['Purchase Amount (USD)'].astype(float)  # Purchase Amount as float
df['location'] = df['location'].astype('category')  # Location as category
df['size'] = df['size'].astype('category')  # Size as category
df['color'] = df['color'].astype('category')  # Color as category
df['season'] = df['season'].astype('category')  # Season as category
df['Review Rating'] = df['Review Rating'].astype(float)  # Review Rating as float
df['Subscription Status'] = df['Subscription Status'].astype('category')  # Subscription Status as category
df['Shipping Type'] = df['Shipping Type'].astype('category')  # Shipping Type as category
df['Discount Applied'] = df['Discount Applied'].astype('category')  # Discount Applied as category (assuming 'Yes'/'No')
df['Promo Code Used'] = df['Promo Code Used'].astype('category')  # Promo Code Used as category
df['Previous Purchases'] = df['Previous Purchases'].astype(int)  # Previous Purchases as integer
df['Payment Method'] = df['Payment Method'].astype('category')  # Payment Method as category
df['Frequency of Purchases'] = df['Frequency of Purchases'].astype('category')  # Frequency of Purchases as category

Based on certain customer characteristics, we want to build a model that can predict the categories that the customers will buy, so we can use it for designing a marketing campaign that sends them specific promo codes and advertising related to that product category.

We'll try to use a neural network model to predict the categories that the customers will buy (using 4 targets), based on the following features: age, gender, item purchased, purchased amount in USD and location.

In [10]:
# Create a dataframe with the columns that we'll use to predict the categories
customer_df = df[['age', 'gender','Item Purchased', 'category', 'Purchase Amount (USD)', 'location']].copy()
customer_df.head()

Unnamed: 0,age,gender,Item Purchased,category,Purchase Amount (USD),location
0,55,Male,Blouse,Clothing,53.0,Kentucky
1,19,Male,Sweater,Clothing,64.0,Maine
2,50,Male,Jeans,Clothing,73.0,Massachusetts
3,21,Male,Sandals,Footwear,90.0,Rhode Island
4,45,Male,Blouse,Clothing,49.0,Oregon


In [11]:
# Convert the categorical data in dummies and create a dataframe with that information
customer_df_encoded = pd.get_dummies(customer_df)
customer_df_encoded.head()

Unnamed: 0,age,Purchase Amount (USD),gender_Female,gender_Male,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,Item Purchased_Dress,...,location_South Dakota,location_Tennessee,location_Texas,location_Utah,location_Vermont,location_Virginia,location_Washington,location_West Virginia,location_Wisconsin,location_Wyoming
0,55,53.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,19,64.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,50,73.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,21,90.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,45,49.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Display the columns of the dataframe
customer_df_encoded.columns

Index(['age', 'Purchase Amount (USD)', 'gender_Female', 'gender_Male',
       'Item Purchased_Backpack', 'Item Purchased_Belt',
       'Item Purchased_Blouse', 'Item Purchased_Boots', 'Item Purchased_Coat',
       'Item Purchased_Dress', 'Item Purchased_Gloves',
       'Item Purchased_Handbag', 'Item Purchased_Hat', 'Item Purchased_Hoodie',
       'Item Purchased_Jacket', 'Item Purchased_Jeans',
       'Item Purchased_Jewelry', 'Item Purchased_Pants',
       'Item Purchased_Sandals', 'Item Purchased_Scarf',
       'Item Purchased_Shirt', 'Item Purchased_Shoes', 'Item Purchased_Shorts',
       'Item Purchased_Skirt', 'Item Purchased_Sneakers',
       'Item Purchased_Socks', 'Item Purchased_Sunglasses',
       'Item Purchased_Sweater', 'Item Purchased_T-shirt',
       'category_Accessories', 'category_Clothing', 'category_Footwear',
       'category_Outerwear', 'location_Alabama', 'location_Alaska',
       'location_Arizona', 'location_Arkansas', 'location_California',
       'location_C

In [13]:
# Split our preprocessed data into our features and target arrays
#Features
X = customer_df_encoded.drop(columns=['category_Accessories','category_Footwear','category_Clothing','category_Outerwear'],axis=1).values

#Targets
y_accessories = customer_df_encoded['category_Accessories'].values
y_footwear = customer_df_encoded['category_Footwear'].values
y_clothing = customer_df_encoded['category_Clothing'].values
y_outerwear = customer_df_encoded['category_Outerwear'].values

# Number of classes for each target variable
num_classes_accessories = len(np.unique(y_accessories))
num_classes_footwear = len(np.unique(y_footwear))
num_classes_clothing = len(np.unique(y_clothing))
num_classes_outerwear = len(np.unique(y_outerwear))

# Convert targets to one-hot encoding
y_accessories = to_categorical(y_accessories, num_classes=num_classes_accessories).astype(np.float32)
y_footwear = to_categorical(y_footwear, num_classes=num_classes_footwear).astype(np.float32)
y_clothing = to_categorical(y_clothing, num_classes=num_classes_clothing).astype(np.float32)
y_outerwear = to_categorical(y_outerwear, num_classes=num_classes_outerwear).astype(np.float32)

# Split the data into training and testing sets
X_train, X_test, y_accessories_train, y_accessories_test = train_test_split(
    X, y_accessories, test_size=0.2, random_state=42
)

# Repeat the split for other targets, ensuring consistent splitting
y_footwear_train, y_footwear_test = train_test_split(
    y_footwear, test_size=0.2, random_state=42
)
y_clothing_train, y_clothing_test = train_test_split(
    y_clothing, test_size=0.2, random_state=42
)
y_outerwear_train, y_outerwear_test = train_test_split(
    y_outerwear, test_size=0.2, random_state=42
)

# Define the model
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
inputs = Input(shape=(X.shape[1],))

# Shared layers
x = Dense(64, activation='relu')(inputs)
x = Dense(32, activation='relu')(x)

# Output layers for each target
output_accessories = Dense(num_classes_accessories, activation='softmax', name='accessories')(x)
output_footwear = Dense(num_classes_footwear, activation='softmax', name='footwear')(x)
output_clothing = Dense(num_classes_clothing, activation='softmax', name='clothing')(x)
output_outerwear = Dense(num_classes_outerwear, activation='softmax', name='outerwear')(x)



In [14]:
# Define the model
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
inputs = Input(shape=(X.shape[1],))

# Shared layers
x = Dense(64, activation='relu')(inputs)
x = Dense(32, activation='relu')(x)

# Output layers for each target
output_accessories = Dense(num_classes_accessories, activation='softmax', name='accessories')(x)
output_footwear = Dense(num_classes_footwear, activation='softmax', name='footwear')(x)
output_clothing = Dense(num_classes_clothing, activation='softmax', name='clothing')(x)
output_outerwear = Dense(num_classes_outerwear, activation='softmax', name='outerwear')(x)

# Create and compile the model
model = Model(inputs=inputs, outputs=[output_accessories, output_footwear, output_clothing, output_outerwear])

model.compile(optimizer=Adam(),
              loss={'accessories': 'categorical_crossentropy',
                    'footwear': 'categorical_crossentropy',
                    'clothing': 'categorical_crossentropy',
                    'outerwear': 'categorical_crossentropy'},
              metrics={'accessories': 'accuracy',
                       'footwear': 'accuracy',
                       'clothing': 'accuracy',
                       'outerwear': 'accuracy'})

# Train the model
model.fit(X_train, {'accessories': y_accessories_train, 
                    'footwear': y_footwear_train, 
                    'clothing': y_clothing_train, 
                    'outerwear': y_outerwear_train},
          epochs=10, batch_size=32)

Epoch 1/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accessories_accuracy: 0.6151 - clothing_accuracy: 0.5431 - footwear_accuracy: 0.8450 - loss: 2.6377 - outerwear_accuracy: 0.9178
Epoch 2/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.7160 - clothing_accuracy: 0.7047 - footwear_accuracy: 0.8578 - loss: 1.8543 - outerwear_accuracy: 0.9078
Epoch 3/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.7918 - clothing_accuracy: 0.8373 - footwear_accuracy: 0.8531 - loss: 1.5800 - outerwear_accuracy: 0.9149
Epoch 4/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.9178 - clothing_accuracy: 0.9357 - footwear_accuracy: 0.8844 - loss: 1.2308 - outerwear_accuracy: 0.9157
Epoch 5/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.9754 - clothing_accuracy: 0.96

<keras.src.callbacks.history.History at 0x21fc0fbb470>

In [15]:
# Evaluate the model
results = model.evaluate(X_test, {'accessories': y_accessories_test, 
                                  'footwear': y_footwear_test, 
                                  'clothing': y_clothing_test, 
                                  'outerwear': y_outerwear_test})
# Print all results
for name, result in zip(model.metrics_names, results):
    print(f"{name}: {result}")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 1.0000 - clothing_accuracy: 1.0000 - footwear_accuracy: 1.0000 - loss: 0.0428 - outerwear_accuracy: 1.0000  
loss: 0.04267256706953049
compile_metrics: 1.0


After running the model with 10 epochs, we reached an accuracy of 100% for all the target variables. Although the accuracy is very high, after reviewing the results we found a bit strange that the model gave us this accuracy level. Therefore, we decided to dig deeper in the model characteristics. One of the possibilities for this high accuracy level could be that the model is overfitting. We also think that the reason why the model is overfitting is because there is not enough complexity in 
the training data, and the model converges too quickly, given that the 100% accuracy is reached after 6 epochs.

Although the most straightforward way to fix overfitting is adding more training data, we were not able to do it because the company didn't provide us with more data fields or with an additional dataset. For this reason, we tried to fix this by keeping our training data the same and retraining the model using fewer epochs.


In [17]:
# Split our preprocessed data into our features and target arrays
#Features
X_2 = customer_df_encoded.drop(columns=['category_Accessories','category_Footwear','category_Clothing','category_Outerwear'],axis=1).values

#Targets
y_accessories_2 = customer_df_encoded['category_Accessories'].values
y_footwear_2 = customer_df_encoded['category_Footwear'].values
y_clothing_2 = customer_df_encoded['category_Clothing'].values
y_outerwear_2 = customer_df_encoded['category_Outerwear'].values

# Number of classes for each target variable
num_classes_accessories_2 = len(np.unique(y_accessories_2))
num_classes_footwear_2 = len(np.unique(y_footwear_2))
num_classes_clothing_2 = len(np.unique(y_clothing_2))
num_classes_outerwear_2 = len(np.unique(y_outerwear_2))

# Convert targets to one-hot encoding
y_accessories_2 = to_categorical(y_accessories_2, num_classes=num_classes_accessories_2).astype(np.float32)
y_footwear_2 = to_categorical(y_footwear_2, num_classes=num_classes_footwear_2).astype(np.float32)
y_clothing_2 = to_categorical(y_clothing_2, num_classes=num_classes_clothing_2).astype(np.float32)
y_outerwear_2 = to_categorical(y_outerwear_2, num_classes=num_classes_outerwear_2).astype(np.float32)

# Split the data into training and testing sets
X_train_2, X_test_2, y_accessories_train_2, y_accessories_test_2 = train_test_split(
    X, y_accessories_2, test_size=0.2, random_state=42
)

# Repeat the split for other targets, ensuring consistent splitting
y_footwear_train_2, y_footwear_test_2 = train_test_split(
    y_footwear_2, test_size=0.2, random_state=42
)
y_clothing_train_2, y_clothing_test_2 = train_test_split(
    y_clothing_2, test_size=0.2, random_state=42
)
y_outerwear_train_2, y_outerwear_test_2 = train_test_split(
    y_outerwear_2, test_size=0.2, random_state=42
)

# Define the model
X_train_2 = np.asarray(X_train_2).astype(np.float32)
X_test_2 = np.asarray(X_test_2).astype(np.float32)
inputs_2 = Input(shape=(X.shape[1],))

# Shared layers
x_2 = Dense(64, activation='relu')(inputs_2)
x_2 = Dense(32, activation='relu')(x_2)

# Output layers for each target
output_accessories_2 = Dense(num_classes_accessories_2, activation='softmax', name='accessories')(x_2)
output_footwear_2 = Dense(num_classes_footwear_2, activation='softmax', name='footwear')(x_2)
output_clothing_2 = Dense(num_classes_clothing_2, activation='softmax', name='clothing')(x_2)
output_outerwear_2 = Dense(num_classes_outerwear_2, activation='softmax', name='outerwear')(x_2)


# Create and compile the model
model_2 = Model(inputs=inputs_2, outputs=[output_accessories_2, output_footwear_2, output_clothing_2, output_outerwear_2])

model_2.compile(optimizer=Adam(),
              loss={'accessories': 'categorical_crossentropy',
                    'footwear': 'categorical_crossentropy',
                    'clothing': 'categorical_crossentropy',
                    'outerwear': 'categorical_crossentropy'},
              metrics={'accessories': 'accuracy',
                       'footwear': 'accuracy',
                       'clothing': 'accuracy',
                       'outerwear': 'accuracy'})


# Train the model
model_2.fit(X_train_2, {'accessories': y_accessories_train_2, 
                    'footwear': y_footwear_train_2, 
                    'clothing': y_clothing_train_2, 
                    'outerwear': y_outerwear_train_2},
          epochs=5, batch_size=32)

Epoch 1/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accessories_accuracy: 0.6039 - clothing_accuracy: 0.5256 - footwear_accuracy: 0.7160 - loss: 9.7999 - outerwear_accuracy: 0.5174 
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.7312 - clothing_accuracy: 0.7420 - footwear_accuracy: 0.8471 - loss: 1.8707 - outerwear_accuracy: 0.9207
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.8124 - clothing_accuracy: 0.9330 - footwear_accuracy: 0.8358 - loss: 1.5445 - outerwear_accuracy: 0.9175
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.9545 - clothing_accuracy: 0.9983 - footwear_accuracy: 0.8477 - loss: 1.1493 - outerwear_accuracy: 0.9157
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.9996 - clothing_accuracy: 1.0000 -

<keras.src.callbacks.history.History at 0x21fc6ee3b30>

In [32]:
# Evaluate the model
results_2 = model_2.evaluate(X_test_2, {'accessories': y_accessories_test_2, 
                                  'footwear': y_footwear_test_2, 
                                  'clothing': y_clothing_test_2, 
                                  'outerwear': y_outerwear_test_2})
# Print all results
for name, result in zip(model_2.metrics_names, results_2):
    print(f"{name}: {result}")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accessories_accuracy: 0.9986 - clothing_accuracy: 1.0000 - footwear_accuracy: 0.9049 - loss: 0.5244 - outerwear_accuracy: 0.9339  
loss: 0.5442551970481873
compile_metrics: 0.9987179636955261


The model with fewer epochs gave us a lower accuracy but still closer to 100% in most categories, which means that we can continue increasing the epochs to reach a higher accuracy. However, if we do so, we'll probably end up facing the overfitting issue that we saw in the previous attempt. For this reason, this could give us a hint that the neural network model may not be the best one to apply to this type of data. 

Since we don't have a lot of data and the features used to predict the targets are simple, maybe the Random Forest model would be a better for this problem.

In [18]:
# Convert non numeric data into numbers by using LabelEncoder
label_encoders = {}
for column in customer_df.columns:
    le = LabelEncoder()
    customer_df[column] = le.fit_transform(customer_df[column])
    label_encoders[column] = le

customer_df.head()

Unnamed: 0,age,gender,Item Purchased,category,Purchase Amount (USD),location
0,37,1,2,1,33,16
1,1,1,23,1,44,18
2,32,1,11,1,53,20
3,3,1,14,2,70,38
4,27,1,2,1,29,36


In [19]:
# Display number of unique values for each column
customer_df_unique_counts=customer_df.nunique()
print(customer_df_unique_counts)

age                      53
gender                    2
Item Purchased           25
category                  4
Purchase Amount (USD)    81
location                 50
dtype: int64


In [20]:
# Define target value and features
X = customer_df.drop('category', axis=1)
y = customer_df['category']

In [21]:
# Devide the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
# Define the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [23]:
# Display model results
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[366   4   0   0]
 [  1 521   0   0]
 [  1   1 187   0]
 [  0   2   0  87]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       370
           1       0.99      1.00      0.99       522
           2       1.00      0.99      0.99       189
           3       1.00      0.98      0.99        89

    accuracy                           0.99      1170
   macro avg       1.00      0.99      0.99      1170
weighted avg       0.99      0.99      0.99      1170



After running the Random Forest model, we also observed that the accuracy is close to 100%. This makes us conclude that we don't have the necessary amount of data to train a machine learning model. However, we prefer to do it with a Random Forest because of the simplicity of the data.

Another conclusion could be that the accuracy on predicting the category that the customer will buy could come from the fact that we also have the information about the item purchased, which is directly related to the item category. We want to check if the accuracy changes if we don't have the infromation about the item purchased. Therefore, we'll try to use the neural network model to predict the categories that the customers will buy (using 4 targets), only based on the following features: age, gender, purchased amount in USD and location.
In addition, we'll increase the number of epochs to let the model learn from the data.

In [24]:
# Create a dataframe with the columns that we'll use to predict tha customer characteristics and preferences
customer_df_2 = df[['age', 'gender', 'category', 'Purchase Amount (USD)', 'location']].copy()
customer_df_2.head()

Unnamed: 0,age,gender,category,Purchase Amount (USD),location
0,55,Male,Clothing,53.0,Kentucky
1,19,Male,Clothing,64.0,Maine
2,50,Male,Clothing,73.0,Massachusetts
3,21,Male,Footwear,90.0,Rhode Island
4,45,Male,Clothing,49.0,Oregon


In [25]:
# Convert the categorical data in dummies and create a dataframe with that information
customer_df_encoded_2 = pd.get_dummies(customer_df_2)
customer_df_encoded_2.head()

Unnamed: 0,age,Purchase Amount (USD),gender_Female,gender_Male,category_Accessories,category_Clothing,category_Footwear,category_Outerwear,location_Alabama,location_Alaska,...,location_South Dakota,location_Tennessee,location_Texas,location_Utah,location_Vermont,location_Virginia,location_Washington,location_West Virginia,location_Wisconsin,location_Wyoming
0,55,53.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,19,64.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,50,73.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,21,90.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,45,49.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [26]:
# Display the columns of the dataframe
customer_df_encoded_2.columns

Index(['age', 'Purchase Amount (USD)', 'gender_Female', 'gender_Male',
       'category_Accessories', 'category_Clothing', 'category_Footwear',
       'category_Outerwear', 'location_Alabama', 'location_Alaska',
       'location_Arizona', 'location_Arkansas', 'location_California',
       'location_Colorado', 'location_Connecticut', 'location_Delaware',
       'location_Florida', 'location_Georgia', 'location_Hawaii',
       'location_Idaho', 'location_Illinois', 'location_Indiana',
       'location_Iowa', 'location_Kansas', 'location_Kentucky',
       'location_Louisiana', 'location_Maine', 'location_Maryland',
       'location_Massachusetts', 'location_Michigan', 'location_Minnesota',
       'location_Mississippi', 'location_Missouri', 'location_Montana',
       'location_Nebraska', 'location_Nevada', 'location_New Hampshire',
       'location_New Jersey', 'location_New Mexico', 'location_New York',
       'location_North Carolina', 'location_North Dakota', 'location_Ohio',
       'l

In [27]:
# Split our preprocessed data into our features and target arrays
#Features
X_3 = customer_df_encoded_2.drop(columns=['category_Accessories','category_Footwear','category_Clothing','category_Outerwear'],axis=1).values

#Targets
y_accessories_3 = customer_df_encoded_2['category_Accessories'].values
y_footwear_3 = customer_df_encoded_2['category_Footwear'].values
y_clothing_3 = customer_df_encoded_2['category_Clothing'].values
y_outerwear_3 = customer_df_encoded_2['category_Outerwear'].values

# Number of classes for each target variable
num_classes_accessories_3 = len(np.unique(y_accessories_3))
num_classes_footwear_3 = len(np.unique(y_footwear_3))
num_classes_clothing_3 = len(np.unique(y_clothing_3))
num_classes_outerwear_3 = len(np.unique(y_outerwear_3))

# Convert targets to one-hot encoding
y_accessories_3 = to_categorical(y_accessories_3, num_classes=num_classes_accessories_3).astype(np.float32)
y_footwear_3 = to_categorical(y_footwear_3, num_classes=num_classes_footwear_3).astype(np.float32)
y_clothing_3 = to_categorical(y_clothing_3, num_classes=num_classes_clothing_3).astype(np.float32)
y_outerwear_3 = to_categorical(y_outerwear_3, num_classes=num_classes_outerwear_3).astype(np.float32)

# Split the data into training and testing sets
X_train_3, X_test_3, y_accessories_train_3, y_accessories_test_3 = train_test_split(
    X, y_accessories_3, test_size=0.2, random_state=42
)

# Repeat the split for other targets, ensuring consistent splitting
y_footwear_train_3, y_footwear_test_3 = train_test_split(
    y_footwear_3, test_size=0.2, random_state=42
)
y_clothing_train_3, y_clothing_test_3 = train_test_split(
    y_clothing_3, test_size=0.2, random_state=42
)
y_outerwear_train_3, y_outerwear_test_3 = train_test_split(
    y_outerwear_3, test_size=0.2, random_state=42
)

# Define the model
X_train_3 = np.asarray(X_train_3).astype(np.float32)
X_test_3 = np.asarray(X_test_3).astype(np.float32)
inputs_3 = Input(shape=(X.shape[1],))

# Shared layers
x_3 = Dense(64, activation='relu')(inputs_3)
x_3 = Dense(32, activation='relu')(x_3)

# Output layers for each target
output_accessories_3 = Dense(num_classes_accessories_3, activation='softmax', name='accessories')(x_3)
output_footwear_3 = Dense(num_classes_footwear_3, activation='softmax', name='footwear')(x_3)
output_clothing_3 = Dense(num_classes_clothing_3, activation='softmax', name='clothing')(x_3)
output_outerwear_3 = Dense(num_classes_outerwear_3, activation='softmax', name='outerwear')(x_3)

In [28]:
# Define the model
X_train_3 = np.asarray(X_train_3).astype(np.float32)
X_test_3 = np.asarray(X_test_3).astype(np.float32)
inputs_3 = Input(shape=(X.shape[1],))

# Shared layers
x_3 = Dense(64, activation='relu')(inputs_3)
x_3 = Dense(32, activation='relu')(x_3)

# Output layers for each target
output_accessories_3 = Dense(num_classes_accessories_3, activation='softmax', name='accessories')(x_3)
output_footwear_3 = Dense(num_classes_footwear_3, activation='softmax', name='footwear')(x_3)
output_clothing_3 = Dense(num_classes_clothing_3, activation='softmax', name='clothing')(x_3)
output_outerwear_3 = Dense(num_classes_outerwear_3, activation='softmax', name='outerwear')(x_3)

# Create and compile the model
model_3 = Model(inputs=inputs_3, outputs=[output_accessories_3, output_footwear_3, output_clothing_3, output_outerwear_3])

model_3.compile(optimizer=Adam(),
              loss={'accessories': 'categorical_crossentropy',
                    'footwear': 'categorical_crossentropy',
                    'clothing': 'categorical_crossentropy',
                    'outerwear': 'categorical_crossentropy'},
              metrics={'accessories': 'accuracy',
                       'footwear': 'accuracy',
                       'clothing': 'accuracy',
                       'outerwear': 'accuracy'})

# Train the model
model_3.fit(X_train_3, {'accessories': y_accessories_train_3, 
                    'footwear': y_footwear_train_3, 
                    'clothing': y_clothing_train_3, 
                    'outerwear': y_outerwear_train_3},
          epochs=100, batch_size=32)

Epoch 1/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accessories_accuracy: 0.6077 - clothing_accuracy: 0.5693 - footwear_accuracy: 0.7405 - loss: 5.4245 - outerwear_accuracy: 0.6689
Epoch 2/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.6654 - clothing_accuracy: 0.5910 - footwear_accuracy: 0.8461 - loss: 2.0722 - outerwear_accuracy: 0.9133
Epoch 3/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.6682 - clothing_accuracy: 0.5882 - footwear_accuracy: 0.8572 - loss: 2.0147 - outerwear_accuracy: 0.9085
Epoch 4/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accessories_accuracy: 0.6771 - clothing_accuracy: 0.6050 - footwear_accuracy: 0.8565 - loss: 1.9606 - outerwear_accuracy: 0.9234
Epoch 5/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.6725 - clothing_accuracy:

<keras.src.callbacks.history.History at 0x21fca2c33e0>

In [33]:
# Evaluate the model
results_3 = model_3.evaluate(X_test_3, {'accessories': y_accessories_test_3, 
                                  'footwear': y_footwear_test_3, 
                                  'clothing': y_clothing_test_3, 
                                  'outerwear': y_outerwear_test_3})
# Print all results
for name, result in zip(model_3.metrics_names, results_3):
    print(f"{name}: {result}")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.8038 - clothing_accuracy: 0.6563 - footwear_accuracy: 0.8892 - loss: 1.4697 - outerwear_accuracy: 0.9277  
loss: 1.4678400754928589
compile_metrics: 0.8192307949066162


As we thought, predicting the category that the customer will but without the information about the item purchased gave us a lower accuracy, but still above 75%. 

We'll also try to increase this accuracy by including other valuable features that could give us a better idea of the customer's profile, such as: Subscription Status, Previous Purchases and Frequency of Purchases.

In [34]:
# Create a dataframe with the columns that we'll use to predict tha customer characteristics and preferences
customer_df_3 = df[['age', 'gender', 'category', 'Purchase Amount (USD)', 'location','Subscription Status','Previous Purchases','Frequency of Purchases']].copy()
customer_df_3.head()

Unnamed: 0,age,gender,category,Purchase Amount (USD),location,Subscription Status,Previous Purchases,Frequency of Purchases
0,55,Male,Clothing,53.0,Kentucky,Yes,14,Fortnightly
1,19,Male,Clothing,64.0,Maine,Yes,2,Fortnightly
2,50,Male,Clothing,73.0,Massachusetts,Yes,23,Weekly
3,21,Male,Footwear,90.0,Rhode Island,Yes,49,Weekly
4,45,Male,Clothing,49.0,Oregon,Yes,31,Annually


In [35]:
# Convert the categorical data in dummies and create a dataframe with that information
customer_df_encoded_3 = pd.get_dummies(customer_df_3)
customer_df_encoded_3.head()

Unnamed: 0,age,Purchase Amount (USD),Previous Purchases,gender_Female,gender_Male,category_Accessories,category_Clothing,category_Footwear,category_Outerwear,location_Alabama,...,location_Wyoming,Subscription Status_No,Subscription Status_Yes,Frequency of Purchases_Annually,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,55,53.0,14,False,True,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
1,19,64.0,2,False,True,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,50,73.0,23,False,True,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,21,90.0,49,False,True,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,True
4,45,49.0,31,False,True,False,True,False,False,False,...,False,False,True,True,False,False,False,False,False,False


In [36]:
# Display the columns of the dataframe
customer_df_encoded_3.columns

Index(['age', 'Purchase Amount (USD)', 'Previous Purchases', 'gender_Female',
       'gender_Male', 'category_Accessories', 'category_Clothing',
       'category_Footwear', 'category_Outerwear', 'location_Alabama',
       'location_Alaska', 'location_Arizona', 'location_Arkansas',
       'location_California', 'location_Colorado', 'location_Connecticut',
       'location_Delaware', 'location_Florida', 'location_Georgia',
       'location_Hawaii', 'location_Idaho', 'location_Illinois',
       'location_Indiana', 'location_Iowa', 'location_Kansas',
       'location_Kentucky', 'location_Louisiana', 'location_Maine',
       'location_Maryland', 'location_Massachusetts', 'location_Michigan',
       'location_Minnesota', 'location_Mississippi', 'location_Missouri',
       'location_Montana', 'location_Nebraska', 'location_Nevada',
       'location_New Hampshire', 'location_New Jersey', 'location_New Mexico',
       'location_New York', 'location_North Carolina', 'location_North Dakota',
    

In [37]:
# Split our preprocessed data into our features and target arrays
#Features
X_4 = customer_df_encoded_3.drop(columns=['category_Accessories','category_Footwear','category_Clothing','category_Outerwear'],axis=1).values

#Targets
y_accessories_4 = customer_df_encoded_3['category_Accessories'].values
y_footwear_4 = customer_df_encoded_3['category_Footwear'].values
y_clothing_4 = customer_df_encoded_3['category_Clothing'].values
y_outerwear_4 = customer_df_encoded_3['category_Outerwear'].values

# Number of classes for each target variable
num_classes_accessories_4 = len(np.unique(y_accessories_4))
num_classes_footwear_4 = len(np.unique(y_footwear_4))
num_classes_clothing_4 = len(np.unique(y_clothing_4))
num_classes_outerwear_4 = len(np.unique(y_outerwear_4))

# Convert targets to one-hot encoding
y_accessories_4 = to_categorical(y_accessories_4, num_classes=num_classes_accessories_4).astype(np.float32)
y_footwear_4 = to_categorical(y_footwear_4, num_classes=num_classes_footwear_4).astype(np.float32)
y_clothing_4 = to_categorical(y_clothing_4, num_classes=num_classes_clothing_4).astype(np.float32)
y_outerwear_4 = to_categorical(y_outerwear_4, num_classes=num_classes_outerwear_4).astype(np.float32)

# Split the data into training and testing sets
X_train_4, X_test_4, y_accessories_train_4, y_accessories_test_4 = train_test_split(
    X, y_accessories_4, test_size=0.2, random_state=42
)

# Repeat the split for other targets, ensuring consistent splitting
y_footwear_train_4, y_footwear_test_4 = train_test_split(
    y_footwear_4, test_size=0.2, random_state=42
)
y_clothing_train_4, y_clothing_test_4 = train_test_split(
    y_clothing_4, test_size=0.2, random_state=42
)
y_outerwear_train_4, y_outerwear_test_4 = train_test_split(
    y_outerwear_4, test_size=0.2, random_state=42
)

# Define the model
X_train_4 = np.asarray(X_train_4).astype(np.float32)
X_test_4 = np.asarray(X_test_4).astype(np.float32)
inputs_4 = Input(shape=(X.shape[1],))

# Shared layers
x_4 = Dense(64, activation='relu')(inputs_4)
x_4 = Dense(32, activation='relu')(x_4)

# Output layers for each target
output_accessories_4 = Dense(num_classes_accessories_4, activation='softmax', name='accessories')(x_4)
output_footwear_4 = Dense(num_classes_footwear_4, activation='softmax', name='footwear')(x_4)
output_clothing_4 = Dense(num_classes_clothing_4, activation='softmax', name='clothing')(x_4)
output_outerwear_4 = Dense(num_classes_outerwear_4, activation='softmax', name='outerwear')(x_4)

In [38]:
# Define the model
X_train_4 = np.asarray(X_train_4).astype(np.float32)
X_test_4 = np.asarray(X_test_4).astype(np.float32)
inputs_4 = Input(shape=(X.shape[1],))

# Shared layers
x_4 = Dense(64, activation='relu')(inputs_4)
x_4 = Dense(32, activation='relu')(x_4)

# Output layers for each target
output_accessories_4 = Dense(num_classes_accessories_4, activation='softmax', name='accessories')(x_4)
output_footwear_4 = Dense(num_classes_footwear_4, activation='softmax', name='footwear')(x_4)
output_clothing_4 = Dense(num_classes_clothing_4, activation='softmax', name='clothing')(x_4)
output_outerwear_4 = Dense(num_classes_outerwear_4, activation='softmax', name='outerwear')(x_4)

# Create and compile the model
model_4 = Model(inputs=inputs_4, outputs=[output_accessories_4, output_footwear_4, output_clothing_4, output_outerwear_4])

model_4.compile(optimizer=Adam(),
              loss={'accessories': 'categorical_crossentropy',
                    'footwear': 'categorical_crossentropy',
                    'clothing': 'categorical_crossentropy',
                    'outerwear': 'categorical_crossentropy'},
              metrics={'accessories': 'accuracy',
                       'footwear': 'accuracy',
                       'clothing': 'accuracy',
                       'outerwear': 'accuracy'})

# Train the model
model_4.fit(X_train_4, {'accessories': y_accessories_train_4, 
                    'footwear': y_footwear_train_4, 
                    'clothing': y_clothing_train_4, 
                    'outerwear': y_outerwear_train_4},
          epochs=100, batch_size=32)

Epoch 1/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accessories_accuracy: 0.5753 - clothing_accuracy: 0.5548 - footwear_accuracy: 0.8230 - loss: 5.8530 - outerwear_accuracy: 0.8825
Epoch 2/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.6659 - clothing_accuracy: 0.6202 - footwear_accuracy: 0.8587 - loss: 2.0326 - outerwear_accuracy: 0.9067
Epoch 3/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.6859 - clothing_accuracy: 0.6052 - footwear_accuracy: 0.8544 - loss: 2.0158 - outerwear_accuracy: 0.9160
Epoch 4/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.6716 - clothing_accuracy: 0.6042 - footwear_accuracy: 0.8319 - loss: 2.0209 - outerwear_accuracy: 0.9188
Epoch 5/100
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.6627 - clothing_accuracy:

<keras.src.callbacks.history.History at 0x21fcf7cf9e0>

In [39]:
# Evaluate the model
results_4 = model_4.evaluate(X_test_4, {'accessories': y_accessories_test_4, 
                                  'footwear': y_footwear_test_4, 
                                  'clothing': y_clothing_test_4, 
                                  'outerwear': y_outerwear_test_4})
# Print all results
for name, result in zip(model_4.metrics_names, results_4):
    print(f"{name}: {result}")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.8053 - clothing_accuracy: 0.6234 - footwear_accuracy: 0.8612 - loss: 1.6550 - outerwear_accuracy: 0.9211  
loss: 1.6478596925735474
compile_metrics: 0.8038461804389954


By increasing the features we didn't notice a major improvement in accuracy (and for some categories it was lower). Probably, the additional features included did not provide any value for predicting the categories.

Instead of the categories, we now want to focus on the items purchased. We will try using a neural network model to predict an specific item purchased by a customer (in this case Jewelry), in which the category is not a feature.

In [40]:
# Create a dataframe with the columns that we'll use to predict tha customer characteristics and preferences
customer_df_4 = df[['age', 'gender', 'Item Purchased', 'Purchase Amount (USD)', 'location','Subscription Status','Previous Purchases','Frequency of Purchases']].copy()
customer_df_4.head()

Unnamed: 0,age,gender,Item Purchased,Purchase Amount (USD),location,Subscription Status,Previous Purchases,Frequency of Purchases
0,55,Male,Blouse,53.0,Kentucky,Yes,14,Fortnightly
1,19,Male,Sweater,64.0,Maine,Yes,2,Fortnightly
2,50,Male,Jeans,73.0,Massachusetts,Yes,23,Weekly
3,21,Male,Sandals,90.0,Rhode Island,Yes,49,Weekly
4,45,Male,Blouse,49.0,Oregon,Yes,31,Annually


In [41]:
# Convert the categorical data in dummies and create a dataframe with that information
customer_df_encoded_4 = pd.get_dummies(customer_df_4)
customer_df_encoded_4.head()

Unnamed: 0,age,Purchase Amount (USD),Previous Purchases,gender_Female,gender_Male,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,...,location_Wyoming,Subscription Status_No,Subscription Status_Yes,Frequency of Purchases_Annually,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,55,53.0,14,False,True,False,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False
1,19,64.0,2,False,True,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,50,73.0,23,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,21,90.0,49,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
4,45,49.0,31,False,True,False,False,True,False,False,...,False,False,True,True,False,False,False,False,False,False


In [42]:
# Display the columns of the dataframe
customer_df_encoded_4.columns

Index(['age', 'Purchase Amount (USD)', 'Previous Purchases', 'gender_Female',
       'gender_Male', 'Item Purchased_Backpack', 'Item Purchased_Belt',
       'Item Purchased_Blouse', 'Item Purchased_Boots', 'Item Purchased_Coat',
       'Item Purchased_Dress', 'Item Purchased_Gloves',
       'Item Purchased_Handbag', 'Item Purchased_Hat', 'Item Purchased_Hoodie',
       'Item Purchased_Jacket', 'Item Purchased_Jeans',
       'Item Purchased_Jewelry', 'Item Purchased_Pants',
       'Item Purchased_Sandals', 'Item Purchased_Scarf',
       'Item Purchased_Shirt', 'Item Purchased_Shoes', 'Item Purchased_Shorts',
       'Item Purchased_Skirt', 'Item Purchased_Sneakers',
       'Item Purchased_Socks', 'Item Purchased_Sunglasses',
       'Item Purchased_Sweater', 'Item Purchased_T-shirt', 'location_Alabama',
       'location_Alaska', 'location_Arizona', 'location_Arkansas',
       'location_California', 'location_Colorado', 'location_Connecticut',
       'location_Delaware', 'location_Florida'

In [48]:
# Split our preprocessed data into our features and target arrays
X_5 = customer_df_encoded_4.drop('Item Purchased_Jewelry', axis=1).values
y_5 = customer_df_encoded_4['Item Purchased_Jewelry'].values

# Split the preprocessed data into a training and testing dataset
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_5, y_5, random_state = 78)

In [49]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler_5 = scaler.fit(X_train_5)

# Scale the data
X_train_scaled_5 = X_scaler_5.transform(X_train_5)
X_test_scaled_5 = X_scaler_5.transform(X_test_5)

In [50]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_5[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [51]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [52]:
# Train the model
fit_model = nn.fit(X_train_scaled_5,y_train_5,epochs=10)

Epoch 1/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8099 - loss: 0.5806
Epoch 2/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9468 - loss: 0.3521
Epoch 3/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 979us/step - accuracy: 0.9523 - loss: 0.2232
Epoch 4/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 913us/step - accuracy: 0.9547 - loss: 0.1779
Epoch 5/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9607 - loss: 0.1369
Epoch 6/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 888us/step - accuracy: 0.9598 - loss: 0.1176
Epoch 7/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9559 - loss: 0.1079
Epoch 8/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 905us/step - accuracy: 0.9539 - loss: 0.0905
Epoch 9/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━

In [53]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled_5,y_test_5,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

31/31 - 0s - 4ms/step - accuracy: 0.9713 - loss: 0.0530
Loss: 0.05295950546860695, Accuracy: 0.9712820649147034


With the selected features it is possible to predict the selected Jewelry item. This could be due to the fact that we have the purchase amount as a feature (higher purchase amounts in general are associated to Jewelry). However, it is important to highlight that we are limitting the number of epochs to 10 because we want to avoid the overfitting observed in previous models due to the available data.

We'll also try to apply the same model, but without considering the purchase amount as feature, so it does not bias the result due to the price difference observed in different products (ex. Jewelry is usually more expensive than jeans).

In [60]:
# Create a dataframe with the columns that we'll use to predict tha customer characteristics and preferences
customer_df_5 = df[['age', 'gender', 'Item Purchased', 'location','Subscription Status','Previous Purchases','Frequency of Purchases']].copy()
customer_df_5.head()

Unnamed: 0,age,gender,Item Purchased,location,Subscription Status,Previous Purchases,Frequency of Purchases
0,55,Male,Blouse,Kentucky,Yes,14,Fortnightly
1,19,Male,Sweater,Maine,Yes,2,Fortnightly
2,50,Male,Jeans,Massachusetts,Yes,23,Weekly
3,21,Male,Sandals,Rhode Island,Yes,49,Weekly
4,45,Male,Blouse,Oregon,Yes,31,Annually


In [61]:
# Convert the categorical data in dummies and create a dataframe with that information
customer_df_encoded_5 = pd.get_dummies(customer_df_5)
customer_df_encoded_5.head()

Unnamed: 0,age,Previous Purchases,gender_Female,gender_Male,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,Item Purchased_Dress,...,location_Wyoming,Subscription Status_No,Subscription Status_Yes,Frequency of Purchases_Annually,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,55,14,False,True,False,False,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
1,19,2,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,False
2,50,23,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,21,49,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
4,45,31,False,True,False,False,True,False,False,False,...,False,False,True,True,False,False,False,False,False,False


In [62]:
# Split our preprocessed data into our features and target arrays
X_6 = customer_df_encoded_5.drop('Item Purchased_Jewelry', axis=1).values
y_6 = customer_df_encoded_5['Item Purchased_Jewelry'].values

# Split the preprocessed data into a training and testing dataset
X_train_6, X_test_6, y_train_6, y_test_6 = train_test_split(X_6, y_6, random_state = 78)

In [63]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler_6 = scaler.fit(X_train_6)

# Scale the data
X_train_scaled_6 = X_scaler_6.transform(X_train_6)
X_test_scaled_6 = X_scaler_6.transform(X_test_6)

In [64]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_6[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [65]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [66]:
# Train the model
fit_model = nn.fit(X_train_scaled_6,y_train_6,epochs=10)

Epoch 1/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 960us/step - accuracy: 0.5922 - loss: 0.6715
Epoch 2/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9036 - loss: 0.3388
Epoch 3/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9444 - loss: 0.2395
Epoch 4/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9496 - loss: 0.1953
Epoch 5/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 958us/step - accuracy: 0.9539 - loss: 0.1650
Epoch 6/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9559 - loss: 0.1467
Epoch 7/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 963us/step - accuracy: 0.9551 - loss: 0.1390
Epoch 8/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 936us/step - accuracy: 0.9557 - loss: 0.1244
Epoch 9/10
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━

In [67]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled_6,y_test_6,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

31/31 - 0s - 4ms/step - accuracy: 0.9579 - loss: 0.1073
Loss: 0.10732738673686981, Accuracy: 0.9579487442970276


Although the result is not materially different than from the previuos attempt, we think that this last model is more useful for predicting the item purchased given that we don't have to rely on the purchase amount, which could bias the result.