In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
import config
from config import CONNSTRING
from sklearn.model_selection import train_test_split



In [30]:
engine= create_engine(CONNSTRING)

query = "SELECT * FROM customer_data"
df = pd.read_sql(query, engine)

df.head()


Unnamed: 0,Customer ID,age,gender,Item Purchased,category,Purchase Amount (USD),location,size,color,season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53.0,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64.0,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90.0,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [31]:
df.columns

Index(['Customer ID', 'age', 'gender', 'Item Purchased', 'category',
       'Purchase Amount (USD)', 'location', 'size', 'color', 'season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases'],
      dtype='object')

In [32]:
df.sample()

Unnamed: 0,Customer ID,age,gender,Item Purchased,category,Purchase Amount (USD),location,size,color,season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
2004,2005,44,Male,Shoes,Footwear,70.0,Georgia,L,Blue,Fall,3.2,No,2-Day Shipping,No,No,13,Cash,Bi-Weekly


In [33]:
df.shape[0]

3900

In [34]:
# Handle Missing Values: 
df.isna().sum()

Customer ID               0
age                       0
gender                    0
Item Purchased            0
category                  0
Purchase Amount (USD)     0
location                  0
size                      0
color                     0
season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

In [35]:
print(df.dtypes)

Customer ID                 int64
age                         int64
gender                     object
Item Purchased             object
category                   object
Purchase Amount (USD)     float64
location                   object
size                       object
color                      object
season                     object
Review Rating             float64
Subscription Status        object
Shipping Type              object
Discount Applied           object
Promo Code Used            object
Previous Purchases          int64
Payment Method             object
Frequency of Purchases     object
dtype: object


In [36]:
#Step 1: Data Preprocessing
# Converting columns to appropriate data types
df['Customer ID'] = df['Customer ID'].astype(int)  # Assuming 'Customer ID' is an integer
df['age'] = df['age'].astype(int)  # Age is an integer
df['gender'] = df['gender'].astype('category')  # Gender as category
df['Item Purchased'] = df['Item Purchased'].astype('category')  # Item Purchased as category
df['category'] = df['category'].astype('category')  # Category as category
df['Purchase Amount (USD)'] = df['Purchase Amount (USD)'].astype(float)  # Purchase Amount as float
df['location'] = df['location'].astype('category')  # Location as category
df['size'] = df['size'].astype('category')  # Size as category
df['color'] = df['color'].astype('category')  # Color as category
df['season'] = df['season'].astype('category')  # Season as category
df['Review Rating'] = df['Review Rating'].astype(float)  # Review Rating as float
df['Subscription Status'] = df['Subscription Status'].astype('category')  # Subscription Status as category
df['Shipping Type'] = df['Shipping Type'].astype('category')  # Shipping Type as category
df['Discount Applied'] = df['Discount Applied'].astype('category')  # Discount Applied as category (assuming 'Yes'/'No')
df['Promo Code Used'] = df['Promo Code Used'].astype('category')  # Promo Code Used as category
df['Previous Purchases'] = df['Previous Purchases'].astype(int)  # Previous Purchases as integer
df['Payment Method'] = df['Payment Method'].astype('category')  # Payment Method as category
df['Frequency of Purchases'] = df['Frequency of Purchases'].astype('category')  # Frequency of Purchases as category

In [53]:
# Create a dataframe with the columns that we'll use to predict tha customer characteristics and preferences
customer_df = df[['age', 'gender','Item Purchased', 'category', 'Purchase Amount (USD)', 'location']].copy()
customer_df.head()

Unnamed: 0,age,gender,Item Purchased,category,Purchase Amount (USD),location
0,55,Male,Blouse,Clothing,53.0,Kentucky
1,19,Male,Sweater,Clothing,64.0,Maine
2,50,Male,Jeans,Clothing,73.0,Massachusetts
3,21,Male,Sandals,Footwear,90.0,Rhode Island
4,45,Male,Blouse,Clothing,49.0,Oregon


In [54]:
# Convert the categorical data in dummies and create a dataframe with that information

from sklearn.preprocessing import OneHotEncoder

customer_df_encoded = pd.get_dummies(customer_df)
customer_df_encoded.head()

Unnamed: 0,age,Purchase Amount (USD),gender_Female,gender_Male,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,Item Purchased_Dress,...,location_South Dakota,location_Tennessee,location_Texas,location_Utah,location_Vermont,location_Virginia,location_Washington,location_West Virginia,location_Wisconsin,location_Wyoming
0,55,53.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,19,64.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,50,73.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,21,90.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,45,49.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [39]:
# Display the columns of the dataframe
customer_df_encoded.columns

Index(['age', 'Purchase Amount (USD)', 'gender_Female', 'gender_Male',
       'Item Purchased_Backpack', 'Item Purchased_Belt',
       'Item Purchased_Blouse', 'Item Purchased_Boots', 'Item Purchased_Coat',
       'Item Purchased_Dress', 'Item Purchased_Gloves',
       'Item Purchased_Handbag', 'Item Purchased_Hat', 'Item Purchased_Hoodie',
       'Item Purchased_Jacket', 'Item Purchased_Jeans',
       'Item Purchased_Jewelry', 'Item Purchased_Pants',
       'Item Purchased_Sandals', 'Item Purchased_Scarf',
       'Item Purchased_Shirt', 'Item Purchased_Shoes', 'Item Purchased_Shorts',
       'Item Purchased_Skirt', 'Item Purchased_Sneakers',
       'Item Purchased_Socks', 'Item Purchased_Sunglasses',
       'Item Purchased_Sweater', 'Item Purchased_T-shirt',
       'category_Accessories', 'category_Clothing', 'category_Footwear',
       'category_Outerwear', 'location_Alabama', 'location_Alaska',
       'location_Arizona', 'location_Arkansas', 'location_California',
       'location_C

In [40]:
# Import dependencies
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

First, we'll try to use a neural network model to predict the categories thay the customers will buy (using 4 targets), based on the following features: age, gender, item purchased, purchased amount in USD and location.

In [43]:
# Split our preprocessed data into our features and target arrays
#Features
X = customer_df_encoded.drop(columns=['category_Accessories','category_Footwear','category_Clothing','category_Outerwear'],axis=1).values

#Targets
y_accessories = customer_df_encoded['category_Accessories'].values
y_footwear = customer_df_encoded['category_Footwear'].values
y_clothing = customer_df_encoded['category_Clothing'].values
y_outerwear = customer_df_encoded['category_Outerwear'].values

# Number of classes for each target variable
num_classes_accessories = len(np.unique(y_accessories))
num_classes_footwear = len(np.unique(y_footwear))
num_classes_clothing = len(np.unique(y_clothing))
num_classes_outerwear = len(np.unique(y_outerwear))

# Convert targets to one-hot encoding
y_accessories = to_categorical(y_accessories, num_classes=num_classes_accessories).astype(np.float32)
y_footwear = to_categorical(y_footwear, num_classes=num_classes_footwear).astype(np.float32)
y_clothing = to_categorical(y_clothing, num_classes=num_classes_clothing).astype(np.float32)
y_outerwear = to_categorical(y_outerwear, num_classes=num_classes_outerwear).astype(np.float32)

# Split the data into training and testing sets
X_train, X_test, y_accessories_train, y_accessories_test = train_test_split(
    X, y_accessories, test_size=0.2, random_state=42
)

# Repeat the split for other targets, ensuring consistent splitting
y_footwear_train, y_footwear_test = train_test_split(
    y_footwear, test_size=0.2, random_state=42
)
y_clothing_train, y_clothing_test = train_test_split(
    y_clothing, test_size=0.2, random_state=42
)
y_outerwear_train, y_outerwear_test = train_test_split(
    y_outerwear, test_size=0.2, random_state=42
)

# Define the model
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
inputs = Input(shape=(X.shape[1],))

# Shared layers
x = Dense(64, activation='relu')(inputs)
x = Dense(32, activation='relu')(x)

# Output layers for each target
output_accessories = Dense(num_classes_accessories, activation='softmax', name='accessories')(x)
output_footwear = Dense(num_classes_footwear, activation='softmax', name='footwear')(x)
output_clothing = Dense(num_classes_clothing, activation='softmax', name='clothing')(x)
output_outerwear = Dense(num_classes_outerwear, activation='softmax', name='outerwear')(x)



In [44]:
# Define the model
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
inputs = Input(shape=(X.shape[1],))

# Shared layers
x = Dense(64, activation='relu')(inputs)
x = Dense(32, activation='relu')(x)

# Output layers for each target
output_accessories = Dense(num_classes_accessories, activation='softmax', name='accessories')(x)
output_footwear = Dense(num_classes_footwear, activation='softmax', name='footwear')(x)
output_clothing = Dense(num_classes_clothing, activation='softmax', name='clothing')(x)
output_outerwear = Dense(num_classes_outerwear, activation='softmax', name='outerwear')(x)

# Create and compile the model
model = Model(inputs=inputs, outputs=[output_accessories, output_footwear, output_clothing, output_outerwear])

model.compile(optimizer=Adam(),
              loss={'accessories': 'categorical_crossentropy',
                    'footwear': 'categorical_crossentropy',
                    'clothing': 'categorical_crossentropy',
                    'outerwear': 'categorical_crossentropy'},
              metrics={'accessories': 'accuracy',
                       'footwear': 'accuracy',
                       'clothing': 'accuracy',
                       'outerwear': 'accuracy'})

# Train the model
model.fit(X_train, {'accessories': y_accessories_train, 
                    'footwear': y_footwear_train, 
                    'clothing': y_clothing_train, 
                    'outerwear': y_outerwear_train},
          epochs=10, batch_size=32)

Epoch 1/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accessories_accuracy: 0.5790 - clothing_accuracy: 0.5070 - footwear_accuracy: 0.8164 - loss: 4.8240 - outerwear_accuracy: 0.6997
Epoch 2/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accessories_accuracy: 0.6885 - clothing_accuracy: 0.6636 - footwear_accuracy: 0.8403 - loss: 1.9559 - outerwear_accuracy: 0.9110
Epoch 3/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accessories_accuracy: 0.7205 - clothing_accuracy: 0.7933 - footwear_accuracy: 0.8578 - loss: 1.7217 - outerwear_accuracy: 0.9151
Epoch 4/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accessories_accuracy: 0.8562 - clothing_accuracy: 0.8371 - footwear_accuracy: 0.8674 - loss: 1.4630 - outerwear_accuracy: 0.9144
Epoch 5/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.9752 - clothing_accuracy: 0.95

<keras.src.callbacks.history.History at 0x1f755c016d0>

In [45]:
# Evaluate the model
results = model.evaluate(X_test, {'accessories': y_accessories_test, 
                                  'footwear': y_footwear_test, 
                                  'clothing': y_clothing_test, 
                                  'outerwear': y_outerwear_test})
# Print all results
for name, result in zip(model.metrics_names, results):
    print(f"{name}: {result}")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accessories_accuracy: 1.0000 - clothing_accuracy: 1.0000 - footwear_accuracy: 1.0000 - loss: 0.0903 - outerwear_accuracy: 1.0000 
loss: 0.09034951031208038
compile_metrics: 1.0


After running the model with 10 epochs, we reached an accuracy of 100% for all the target variables. Although the accuracy is very high, after reviewing the results we found a bit strange that the model gave this accuracy level. Therefore, we decided to deep deeper in the model characteristics. One of the possibilities for this high accuracy level could be that the model is overfitting. We also think that the reason why the model is overfitting is because there is not enough complexity in 
the training data, and the model converges too quickly, given that the 100% accuracy is reached after 6 epochs.

Although the most straightforward way to fix overfitting is adding more training data, we were not able to do it because the company didn't provide us with more data fields or with an additional dataset. For this reason, we tried to fix this by keeping our training data the same and retraining the model using fewer epochs.


In [42]:
# Split our preprocessed data into our features and target arrays
#Features
X_2 = customer_df_encoded.drop(columns=['category_Accessories','category_Footwear','category_Clothing','category_Outerwear'],axis=1).values

#Targets
y_accessories_2 = customer_df_encoded['category_Accessories'].values
y_footwear_2 = customer_df_encoded['category_Footwear'].values
y_clothing_2 = customer_df_encoded['category_Clothing'].values
y_outerwear_2 = customer_df_encoded['category_Outerwear'].values

# Number of classes for each target variable
num_classes_accessories_2 = len(np.unique(y_accessories_2))
num_classes_footwear_2 = len(np.unique(y_footwear_2))
num_classes_clothing_2 = len(np.unique(y_clothing_2))
num_classes_outerwear_2 = len(np.unique(y_outerwear_2))

# Convert targets to one-hot encoding
y_accessories_2 = to_categorical(y_accessories_2, num_classes=num_classes_accessories_2).astype(np.float32)
y_footwear_2 = to_categorical(y_footwear_2, num_classes=num_classes_footwear_2).astype(np.float32)
y_clothing_2 = to_categorical(y_clothing_2, num_classes=num_classes_clothing_2).astype(np.float32)
y_outerwear_2 = to_categorical(y_outerwear_2, num_classes=num_classes_outerwear_2).astype(np.float32)

# Split the data into training and testing sets
X_train_2, X_test_2, y_accessories_train_2, y_accessories_test_2 = train_test_split(
    X, y_accessories_2, test_size=0.2, random_state=42
)

# Repeat the split for other targets, ensuring consistent splitting
y_footwear_train_2, y_footwear_test_2 = train_test_split(
    y_footwear_2, test_size=0.2, random_state=42
)
y_clothing_train_2, y_clothing_test_2 = train_test_split(
    y_clothing_2, test_size=0.2, random_state=42
)
y_outerwear_train_2, y_outerwear_test_2 = train_test_split(
    y_outerwear_2, test_size=0.2, random_state=42
)

# Define the model
X_train_2 = np.asarray(X_train_2).astype(np.float32)
X_test_2 = np.asarray(X_test_2).astype(np.float32)
inputs_2 = Input(shape=(X.shape[1],))

# Shared layers
x_2 = Dense(64, activation='relu')(inputs_2)
x_2 = Dense(32, activation='relu')(x_2)

# Output layers for each target
output_accessories_2 = Dense(num_classes_accessories_2, activation='softmax', name='accessories')(x_2)
output_footwear_2 = Dense(num_classes_footwear_2, activation='softmax', name='footwear')(x_2)
output_clothing_2 = Dense(num_classes_clothing_2, activation='softmax', name='clothing')(x_2)
output_outerwear_2 = Dense(num_classes_outerwear_2, activation='softmax', name='outerwear')(x_2)


# Create and compile the model
model_2 = Model(inputs=inputs_2, outputs=[output_accessories_2, output_footwear_2, output_clothing_2, output_outerwear_2])

model_2.compile(optimizer=Adam(),
              loss={'accessories': 'categorical_crossentropy',
                    'footwear': 'categorical_crossentropy',
                    'clothing': 'categorical_crossentropy',
                    'outerwear': 'categorical_crossentropy'},
              metrics={'accessories': 'accuracy',
                       'footwear': 'accuracy',
                       'clothing': 'accuracy',
                       'outerwear': 'accuracy'})


# Train the model
model_2.fit(X_train_2, {'accessories': y_accessories_train_2, 
                    'footwear': y_footwear_train_2, 
                    'clothing': y_clothing_train_2, 
                    'outerwear': y_outerwear_train_2},
          epochs=5, batch_size=32)

Epoch 1/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accessories_accuracy: 0.5531 - clothing_accuracy: 0.5262 - footwear_accuracy: 0.8505 - loss: 3.8530 - outerwear_accuracy: 0.9162
Epoch 2/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.7169 - clothing_accuracy: 0.6278 - footwear_accuracy: 0.8434 - loss: 1.9155 - outerwear_accuracy: 0.9139
Epoch 3/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accessories_accuracy: 0.7810 - clothing_accuracy: 0.7011 - footwear_accuracy: 0.8517 - loss: 1.7143 - outerwear_accuracy: 0.9208
Epoch 4/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accessories_accuracy: 0.9279 - clothing_accuracy: 0.8774 - footwear_accuracy: 0.8483 - loss: 1.4518 - outerwear_accuracy: 0.9106
Epoch 5/5
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accessories_accuracy: 0.9855 - clothing_accuracy: 0.9398 - 

<keras.src.callbacks.history.History at 0x1f76263c1a0>

The model with fewer epochs gave us an accuracy of 94%, which means that we can continue increasing the epochs to reach a higher accuracy. However, if we do so, we'll probably end up facing the overfitting issue that we saw in the previous attempt. For this reason, this could give us a hint that the neural network model may not be the best one to apply to this type of data. 

Since we don't have a lot of data and the features used to predict the targets are simple, maybe the Random Forest model would be a better for this problem.

In [55]:
# Import dependencies
from sklearn.preprocessing import LabelEncoder

# Convert non numeric data into numbers by using LabelEncoder
label_encoders = {}
for column in customer_df.columns:
    le = LabelEncoder()
    customer_df[column] = le.fit_transform(customer_df[column])
    label_encoders[column] = le

customer_df.head()

Unnamed: 0,age,gender,Item Purchased,category,Purchase Amount (USD),location
0,37,1,2,1,33,16
1,1,1,23,1,44,18
2,32,1,11,1,53,20
3,3,1,14,2,70,38
4,27,1,2,1,29,36


In [56]:
# Display number of unique values for each column
customer_df_unique_counts=customer_df.nunique()
print(customer_df_unique_counts)

age                      53
gender                    2
Item Purchased           25
category                  4
Purchase Amount (USD)    81
location                 50
dtype: int64


In [57]:
X = customer_df.drop('category', axis=1)
y = customer_df['category']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [59]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [60]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[366   4   0   0]
 [  1 521   0   0]
 [  1   1 187   0]
 [  0   2   0  87]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       370
           1       0.99      1.00      0.99       522
           2       1.00      0.99      0.99       189
           3       1.00      0.98      0.99        89

    accuracy                           0.99      1170
   macro avg       1.00      0.99      0.99      1170
weighted avg       0.99      0.99      0.99      1170



After running the Random Forest model, we also observed that the accuracy is close to 100%. This makes us conclude that we don't have the necessary amount of data to train a machine learning model. However, we prefer to do it with a Random Forest because of the simplicity of the data.

Second, we'll try to use a Random Forest model to predict the gender of the customer (target variable), based on several features .

In [79]:
# Create a dataframe with the columns that we'll use to predict tha customer characteristics and preferences
customer_df = df[['age', 'gender','Item Purchased', 'category', 'Purchase Amount (USD)', 'location']].copy()
customer_df.head()

Unnamed: 0,age,gender,Item Purchased,category,Purchase Amount (USD),location
0,55,Male,Blouse,Clothing,53.0,Kentucky
1,19,Male,Sweater,Clothing,64.0,Maine
2,50,Male,Jeans,Clothing,73.0,Massachusetts
3,21,Male,Sandals,Footwear,90.0,Rhode Island
4,45,Male,Blouse,Clothing,49.0,Oregon


In [80]:
# Convert the categorical data in dummies and create a dataframe with that information

from sklearn.preprocessing import OneHotEncoder

customer_df_2_encoded = pd.get_dummies(customer_df)
customer_df_2_encoded.head()

Unnamed: 0,age,Purchase Amount (USD),gender_Female,gender_Male,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,Item Purchased_Dress,...,location_South Dakota,location_Tennessee,location_Texas,location_Utah,location_Vermont,location_Virginia,location_Washington,location_West Virginia,location_Wisconsin,location_Wyoming
0,55,53.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,19,64.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,50,73.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,21,90.0,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,45,49.0,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [81]:
customer_df_2_encoded.drop('gender_Male', axis=1, inplace=True)

customer_df_2_encoded.head()

Unnamed: 0,age,Purchase Amount (USD),gender_Female,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,Item Purchased_Dress,Item Purchased_Gloves,...,location_South Dakota,location_Tennessee,location_Texas,location_Utah,location_Vermont,location_Virginia,location_Washington,location_West Virginia,location_Wisconsin,location_Wyoming
0,55,53.0,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,19,64.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,50,73.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,21,90.0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,45,49.0,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [82]:
X = customer_df_2_encoded.drop('gender_Female', axis=1)
y = customer_df_2_encoded['gender_Female']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [84]:
from sklearn.ensemble import RandomForestClassifier

model_3 = RandomForestClassifier(random_state=42)
model_3.fit(X_train, y_train)

In [85]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model_3.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[739  95]
 [300  36]]
              precision    recall  f1-score   support

       False       0.71      0.89      0.79       834
        True       0.27      0.11      0.15       336

    accuracy                           0.66      1170
   macro avg       0.49      0.50      0.47      1170
weighted avg       0.59      0.66      0.61      1170



Given that accuracy level was less than 75% and that the precision for predicting women was considerably low, we will include more data to try to improve the model.

In [92]:
# Create a dataframe with the columns that we'll use to predict tha customer characteristics and preferences
customer_df_3 = df[['age', 'gender','Item Purchased', 'category', 'Purchase Amount (USD)', 'location','size','season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases']].copy()
customer_df_3.head()

Unnamed: 0,age,gender,Item Purchased,category,Purchase Amount (USD),location,size,season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,55,Male,Blouse,Clothing,53.0,Kentucky,L,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,19,Male,Sweater,Clothing,64.0,Maine,L,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,50,Male,Jeans,Clothing,73.0,Massachusetts,S,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,21,Male,Sandals,Footwear,90.0,Rhode Island,M,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,45,Male,Blouse,Clothing,49.0,Oregon,M,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [93]:
# Convert the categorical data in dummies and create a dataframe with that information

from sklearn.preprocessing import OneHotEncoder

customer_df_3_encoded = pd.get_dummies(customer_df_3)
customer_df_3_encoded.head()

Unnamed: 0,age,Purchase Amount (USD),Review Rating,Previous Purchases,gender_Female,gender_Male,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,...,Payment Method_Debit Card,Payment Method_PayPal,Payment Method_Venmo,Frequency of Purchases_Annually,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,55,53.0,3.1,14,False,True,False,False,True,False,...,False,False,True,False,False,False,True,False,False,False
1,19,64.0,3.1,2,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,50,73.0,3.1,23,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,21,90.0,3.5,49,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,45,49.0,2.7,31,False,True,False,False,True,False,...,False,True,False,True,False,False,False,False,False,False


In [95]:
customer_df_3_encoded.drop('gender_Male', axis=1, inplace=True)

customer_df_3_encoded.head()

Unnamed: 0,age,Purchase Amount (USD),Review Rating,Previous Purchases,gender_Female,Item Purchased_Backpack,Item Purchased_Belt,Item Purchased_Blouse,Item Purchased_Boots,Item Purchased_Coat,...,Payment Method_Debit Card,Payment Method_PayPal,Payment Method_Venmo,Frequency of Purchases_Annually,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,55,53.0,3.1,14,False,False,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False
1,19,64.0,3.1,2,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,50,73.0,3.1,23,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,21,90.0,3.5,49,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,45,49.0,2.7,31,False,False,False,True,False,False,...,False,True,False,True,False,False,False,False,False,False


In [96]:
X = customer_df_3_encoded.drop('gender_Female', axis=1)
y = customer_df_3_encoded['gender_Female']

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [98]:
from sklearn.ensemble import RandomForestClassifier

model_4 = RandomForestClassifier(random_state=42)
model_4.fit(X_train, y_train)

In [99]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model_4.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[606 228]
 [ 67 269]]
              precision    recall  f1-score   support

       False       0.90      0.73      0.80       834
        True       0.54      0.80      0.65       336

    accuracy                           0.75      1170
   macro avg       0.72      0.76      0.73      1170
weighted avg       0.80      0.75      0.76      1170



Although we reached an accuracy of 75% by increasing the data, we still have a low precision for predicting women. Therefore, marketing campaigns targeted to men will be more efficient.