In [1]:
# Imports
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.feature_selection import RFE, RFECV
from pandas_profiling import ProfileReport
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer, KNNImputer

  from pandas_profiling import ProfileReport


In [2]:
#
s3= boto3.resource('s3')
bucket_name= 'omar-vargas-bucket'
bucket= s3.Bucket(bucket_name)

file_key= 'Customers.csv'

bucket_object= bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the data
customers = pd.read_csv(file_content_stream)
customers.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6


In [3]:
customers = customers.drop(columns='CustomerID')
customers.head()

Unnamed: 0,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,Male,19,15000,39,Healthcare,1,4
1,Male,21,35000,81,Engineer,3,3
2,Female,20,86000,6,Engineer,1,1
3,Female,23,59000,77,Lawyer,0,2
4,Female,31,38000,40,Entertainment,2,6


In [4]:
# Changing gender and profession to dummy variables
customers = pd.concat([customers.drop(columns=['Gender', 'Profession'], axis=1), pd.get_dummies(customers[['Gender', 'Profession']])], axis=1)
customers.head()

Unnamed: 0,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size,Gender_Female,Gender_Male,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,19,15000,39,1,4,0,1,0,0,0,0,0,1,0,0,0
1,21,35000,81,3,3,0,1,0,0,1,0,0,0,0,0,0
2,20,86000,6,1,1,1,0,0,0,1,0,0,0,0,0,0
3,23,59000,77,0,2,1,0,0,0,0,0,0,0,0,1,0
4,31,38000,40,2,6,1,0,0,0,0,1,0,0,0,0,0


In [5]:
# Splitting the data into training and testing sets
train, test = train_test_split(customers, test_size=0.2, stratify = customers['Spending Score (1-100)'])

In [6]:
train.isna().any().any()

False

In [7]:
# Checking for importance features with Random Forest
importances = list()
for i in range (0,10):
    # Defining the input and target variables
    x = train.drop(columns = 'Spending Score (1-100)', axis = 1)
    y = train['Spending Score (1-100)']

    #Splitting the data
    x_train, x_test, y_train, y_test= train_test_split(x, y, test_size = .2, stratify = y)

    #Building Model
    RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_train, y_train)
    
    #Extracting the feature importances
    importances.append(RF_md.feature_importances_)

In [8]:
#Engineering interactions based on feature importance
train['interaction_1'] = train['Age'] * train['Annual Income ($)']
train['interaction_2'] = train['Age'] * train['Work Experience']
train['interaction_3'] = train['Annual Income ($)'] * train['Work Experience']

test['interaction_1'] = test['Age'] * test['Annual Income ($)']
test['interaction_2'] = test['Age'] * test['Work Experience']
test['interaction_3'] = test['Annual Income ($)'] * test['Work Experience']

In [9]:
#engineering features from the decisoon tree model
train['interaction_4']=np.where(((train['interaction_3'] <= 1136343.0) &
                                   (train['Annual Income ($)'] <= 72185.5) &
                                    (train['Annual Income ($)'] <= 72944.5)), 1, 0)
                                 
train['interaction_5']=np.where(((train['Annual Income ($)'] <= 138199.0) &
                                   (train['Work Experience'] <= 12.5) &
                                (train['Annual Income ($)'] <= 187540.5)), 1, 0)

In [10]:
# Defining input and target variables
x_train1 = train[['Annual Income ($)', 'interaction_3', 'interaction_2', 'Work Experience', 'Age']]
x_test1 = test[['Annual Income ($)', 'interaction_3', 'interaction_2', 'Work Experience', 'Age']]

In [11]:
# Defining input and target variables
x_train2 = train[['Annual Income ($)', 'interaction_3', 'interaction_2', 'Work Experience', 'Age', 'interaction_1']]
x_test2 = test[['Annual Income ($)', 'interaction_3', 'interaction_2', 'Work Experience', 'Age', 'interaction_1']]

In [12]:
# Building first Random Forest model
rf_md1 = RandomForestRegressor(n_estimators = 500 , max_depth = 3, min_samples_leaf = 5, min_samples_split = 10).fit(x_train1, train['Spending Score (1-100)'])

# Predicting on test
rf_pred1 = rf_md1.predict(x_test1)

# Changing likelihoods to labels
rf_label1 = mean_squared_error(test['Spending Score (1-100)'], rf_pred1)
rmse1 = np.sqrt(rf_label1)

print('RMSE of model 1 is:', rmse1)

RMSE of model 1 is: 27.8746754605779


In [13]:
# Building second Random Forest model
rf_md2 = RandomForestRegressor(n_estimators = 500 , max_depth = 3, min_samples_leaf = 5, min_samples_split = 10).fit(x_train2, train['Spending Score (1-100)'])

# Predicting on test
rf_pred2 = rf_md2.predict(x_test2)

# Changing likelihoods to labels
rf_label2 = mean_squared_error(test['Spending Score (1-100)'], rf_pred2)
rmse2 = np.sqrt(rf_label2)

print('RMSE of model 2 is:', rmse2)

RMSE of model 2 is: 27.82023592780519


In [None]:
# Based on my results I would use the second model to predict Spending Score as it had a lower RMSE. But barely at that.