In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle
import numpy as np

In [6]:

# Number of rows
num_rows = 2000
np.random.seed(42)

# Sample data for each column
customer_ids = np.arange(100000, 100000 + num_rows)
surnames = [f"Surname{i}" for i in range(num_rows)]
credit_scores = np.random.randint(300, 850, size=num_rows)
locations = np.random.choice(['New York', 'California', 'Texas', 'Florida', 'Illinois'], size=num_rows)
genders = np.random.choice(['Male', 'Female'], size=num_rows)
ages = np.random.randint(18, 90, size=num_rows)
tenures = np.random.randint(0, 11, size=num_rows)
balances = np.round(np.random.uniform(0, 250000, size=num_rows), 2)
num_of_products = np.random.randint(1, 5, size=num_rows)
has_credit_card = np.random.choice([0, 1], size=num_rows)
is_active_member = np.random.choice([0, 1], size=num_rows)
estimated_salaries = np.round(np.random.uniform(10000, 200000, size=num_rows), 2)
exited = np.random.choice([0, 1], size=num_rows)

# Create the DataFrame
data = pd.DataFrame({
    'CustomerID': customer_ids,
    'Surname': surnames,
    'CreditScore': credit_scores,
    'Location': locations,
    'Gender': genders,
    'Age': ages,
    'Tenure': tenures,
    'Balance': balances,
    'NumOfProducts': num_of_products,
    'HasCreditCard': has_credit_card,
    'IsActiveMember': is_active_member,
    'EstimatedSalary': estimated_salaries,
    'Exited': exited
})

data.head()


Unnamed: 0,CustomerID,Surname,CreditScore,Location,Gender,Age,Tenure,Balance,NumOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Exited
0,100000,Surname0,402,California,Female,69,5,42349.12,4,1,1,127384.9,1
1,100001,Surname1,735,California,Male,66,4,155204.7,2,0,0,152807.11,1
2,100002,Surname2,570,New York,Male,54,2,179747.69,3,0,0,50660.01,1
3,100003,Surname3,406,Florida,Male,43,9,142380.32,3,1,0,146133.07,0
4,100004,Surname4,371,Illinois,Female,57,9,184587.82,1,1,1,53246.01,1


In [7]:
data = data.drop(['CustomerID', 'Surname'],axis=1)
data

Unnamed: 0,CreditScore,Location,Gender,Age,Tenure,Balance,NumOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Exited
0,402,California,Female,69,5,42349.12,4,1,1,127384.90,1
1,735,California,Male,66,4,155204.70,2,0,0,152807.11,1
2,570,New York,Male,54,2,179747.69,3,0,0,50660.01,1
3,406,Florida,Male,43,9,142380.32,3,1,0,146133.07,0
4,371,Illinois,Female,57,9,184587.82,1,1,1,53246.01,1
...,...,...,...,...,...,...,...,...,...,...,...
1995,407,Texas,Male,79,7,18665.41,1,0,0,139722.99,1
1996,666,Illinois,Male,70,2,228610.09,4,0,0,68415.42,0
1997,580,California,Male,21,6,5160.86,2,0,1,116285.10,1
1998,427,Texas,Female,42,1,240587.53,2,1,1,187219.13,0


In [8]:
##endoding the categorical variables 
label_encoder_gender = LabelEncoder()
data['Gender']=label_encoder_gender.fit_transform(data['Gender'])
data

Unnamed: 0,CreditScore,Location,Gender,Age,Tenure,Balance,NumOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Exited
0,402,California,0,69,5,42349.12,4,1,1,127384.90,1
1,735,California,1,66,4,155204.70,2,0,0,152807.11,1
2,570,New York,1,54,2,179747.69,3,0,0,50660.01,1
3,406,Florida,1,43,9,142380.32,3,1,0,146133.07,0
4,371,Illinois,0,57,9,184587.82,1,1,1,53246.01,1
...,...,...,...,...,...,...,...,...,...,...,...
1995,407,Texas,1,79,7,18665.41,1,0,0,139722.99,1
1996,666,Illinois,1,70,2,228610.09,4,0,0,68415.42,0
1997,580,California,1,21,6,5160.86,2,0,1,116285.10,1
1998,427,Texas,0,42,1,240587.53,2,1,1,187219.13,0


In [10]:
## one-hot encode the nominal data "Location"
from sklearn.preprocessing import OneHotEncoder
onehot_encoder_location=OneHotEncoder()
locaton_encoder = onehot_encoder_location.fit_transform(data[['Location']])
locaton_encoder.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]])

In [11]:
location_encoded_df=pd.DataFrame(locaton_encoder.toarray(),columns=onehot_encoder_location.get_feature_names_out(['Location']))
location_encoded_df

Unnamed: 0,Location_California,Location_Florida,Location_Illinois,Location_New York,Location_Texas
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,1.0
1996,0.0,0.0,1.0,0.0,0.0
1997,1.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,1.0


In [12]:
##combine onehotencoded columns
data=pd.concat([data.drop('Location',axis=1), location_encoded_df],axis=1)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Exited,Location_California,Location_Florida,Location_Illinois,Location_New York,Location_Texas
0,402,0,69,5,42349.12,4,1,1,127384.9,1,1.0,0.0,0.0,0.0,0.0
1,735,1,66,4,155204.7,2,0,0,152807.11,1,1.0,0.0,0.0,0.0,0.0
2,570,1,54,2,179747.69,3,0,0,50660.01,1,0.0,0.0,0.0,1.0,0.0
3,406,1,43,9,142380.32,3,1,0,146133.07,0,0.0,1.0,0.0,0.0,0.0
4,371,0,57,9,184587.82,1,1,1,53246.01,1,0.0,0.0,1.0,0.0,0.0


In [13]:
X = data.drop('EstimatedSalary',axis=1)
y = data['EstimatedSalary']


In [14]:
with open('label_encoder_gender.pkl','wb') as file:
    pickle.dump(label_encoder_gender,file)
    
with open('onehot_encoder_location.pkl','wb') as file:
    pickle.dump(onehot_encoder_location, file)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Scaling the features

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

In [16]:
with open('scalar.pkl','wb') as file:
    pickle.dump(scalar, file)

# ANN Regression Problem

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [18]:
data.shape[1]

15

In [20]:
##Build ANN model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  #hidden layer 1 connected to the input layer
    Dense(32, activation='relu'),
    Dense(1) # output layer  I am not specifing any activation function because default is linear activation function and for regression we use that 
])

## Compile the model
model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mae'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                960       
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3073 (12.00 KB)
Trainable params: 3073 (12.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard 
import datetime

log_dir = 'regression/fit' +  datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir,histogram_freq=1)

In [22]:
early_stopping_callback=EarlyStopping(monitor='val_loss',patience=10, restore_best_weights=True)
 

In [23]:
history = model.fit(
    X_train,y_train,
    validation_data=(X_test,y_test),
    epochs=100,
    callbacks=[early_stopping_callback, tensorboard_callback]
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


In [24]:
%load_ext tensorboard

In [27]:
%tensorboard --logdir regression/fit

In [28]:
##Evaluate model on the test data
test_loss, test_mae = model.evaluate(X_test,y_test)
print(f'Test MAE: {test_mae}')

Test MAE: 47092.9765625


In [29]:
model.save('regression_model.h5')

  saving_api.save_model(
