# Predict Discount Based on Age

In [19]:
from random import randrange, choice
import os
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split


## Create Data

In [20]:
sample_size =25000
data_file = f'../data/discount_data_{sample_size}.xlsx'

business_list = ['bar', 'restaurant', 'hotel', 'pharmacy', 'spa', 'airline']


def calculate_discount(age, sex, business_type, is_resident):
    business_discounts = dict()
    business_discounts['restaurant'] = 0.2
    business_discounts['pharmacy'] = 0.2
    business_discounts['hotel'] = 0.25
    
    if sex == 'M' and age >= 64 and is_resident:
        return business_discounts.get(business_type, 0.0)
    elif sex == 'F' and age >= 57 and is_resident:
        return business_discounts.get(business_type, 0.0)
    else:
        return 0.0
    
def build_dataframe(samples):

    data = dict()
    data['sex'] = list()
    data['age'] = list()
    data['business_type'] = list()
    data['discount'] = list()
    data['is_resident'] = list()
    for i in range(samples):
        age = randrange(18, 95, 1)
        sex = choice(['M', 'F'])
        business_type = choice(business_list)
        is_resident =  choice([True, False])
        discount = calculate_discount(age, sex, business_type, is_resident)
        #print(f'{sex} age {age} for {business_type}')
        data['sex'].append(sex)
        data['age'].append(age)
        data['business_type'].append(business_type)
        data['discount'].append(discount)
        data['is_resident'].append(is_resident)

        df = pd.DataFrame.from_dict(data)
    return df
    


In [27]:
if os.path.exists(data_file):
    df = pd.read_excel(data_file)
    print(f'Loaded from {data_file}')
else:
    df = build_dataframe(sample_size)
    df.to_excel(data_file, index=False)
    print(f'Saved to {data_file}')

Loaded from ../data/discount_data_25000.xlsx


In [28]:
print(f'Shape: {df.shape}')
df.head()

Shape: (25000, 5)


Unnamed: 0,sex,age,business_type,discount,is_resident
0,F,43,airline,0.0,True
1,M,89,spa,0.0,True
2,M,31,pharmacy,0.0,False
3,M,50,airline,0.0,True
4,F,61,airline,0.0,True


## Cleanup


### Separate dependent and independent variables

In [29]:
discounts = df['discount'].copy()
df.drop(columns=['discount',], inplace=True)

print(f'discount shape: {discounts.shape}')

df.head()

discount shape: (25000,)


Unnamed: 0,sex,age,business_type,is_resident
0,F,43,airline,True
1,M,89,spa,True
2,M,31,pharmacy,False
3,M,50,airline,True
4,F,61,airline,True


### Hot One Encode

In [30]:
df = pd.get_dummies(df, columns=['business_type'])


In [31]:
df['is_resident'].replace({False: 0, True: 1}, inplace=True)

df['sex'] = df['sex'].replace({'M': 1, 'F': 0})

df["sex"] = pd.to_numeric(df["sex"])

ValueError: Unable to parse string "F" at position 0

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   sex                       25000 non-null  object 
 1   age                       25000 non-null  float64
 2   is_resident               25000 non-null  int64  
 3   business_type_airline     25000 non-null  uint8  
 4   business_type_bar         25000 non-null  uint8  
 5   business_type_hotel       25000 non-null  uint8  
 6   business_type_pharmacy    25000 non-null  uint8  
 7   business_type_restaurant  25000 non-null  uint8  
 8   business_type_spa         25000 non-null  uint8  
dtypes: float64(1), int64(1), object(1), uint8(6)
memory usage: 732.5+ KB


### Scaling age

In [8]:
scaler = MinMaxScaler()
df['age'] = scaler.fit_transform(df['age'].values.reshape(-1, 1))


In [9]:
print(f'Shape: {df.shape}')
df.head()

Shape: (25000, 9)


Unnamed: 0,sex,age,is_resident,business_type_airline,business_type_bar,business_type_hotel,business_type_pharmacy,business_type_restaurant,business_type_spa
0,F,0.328947,1,1,0,0,0,0,0
1,M,0.934211,1,0,0,0,0,0,1
2,M,0.171053,0,0,0,0,1,0,0
3,M,0.421053,1,1,0,0,0,0,0
4,F,0.565789,1,1,0,0,0,0,0


### Split Training and Test Data

In [10]:

X_train, X_test, y_train, y_test = train_test_split(df.values, discounts.values,
                                                    test_size=0.2, random_state=25)

print('Train', X_train.shape, y_train.shape)
print('Test',X_test.shape, y_test.shape)

Train (20000, 9) (20000,)
Test (5000, 9) (5000,)


## Basic Neuronal Network


https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33

In [11]:
learning_rate = 0.001

#checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
#checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
#callbacks_list = [checkpoint]

In [12]:
model = tf.keras.models.Sequential()
print(X_train.shape[1])
model.add(Dense(X_train.shape[1], kernel_initializer='normal',
                input_dim = X_train.shape[1],
                activation='relu'))

model.add(Dense(256, kernel_initializer='normal',activation='relu'))
model.add(Dense(256, kernel_initializer='normal',activation='relu'))
model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
model.add(Dense(1, kernel_initializer='normal',activation='linear'))


9


In [13]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss='mean_absolute_error',
    metrics=['mean_absolute_error', 'accuracy']
)

In [14]:
history = model.fit(X_train, y_train, epochs=200, 
                    batch_size=32, validation_split = 0.2,
                   verbose=False)
                    #callbacks=callbacks_list))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
print(history.history.keys())

In [None]:
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.show()

In [None]:
plt.xlabel('Epoch')
plt.ylabel('accuracy')
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.show()

In [None]:
plt.xlabel('Epoch')
plt.ylabel('mean_absolute_error')
plt.plot(history.history['mean_absolute_error'])
plt.plot(history.history['val_mean_absolute_error'])
plt.show()

In [None]:
#df_values = df.values
#disc_value = discounts.values
#print(y_test)
c = 1
i = 0
for discount in y_test:
    if discount > 0.0:
        print(i, discount)
        c += 1
    if c == 10:
        break
    i += 1
        
print('>>',y_test[51])

In [None]:
t = 51
print(df.columns)
print(X_test[t])
print(y_test[t])

In [None]:
pred = model.predict(X_test) #.reshape(10,1))
#print(type(pred))
print(f'Estimated discount {pred[t]}')

In [None]:


print(f'Estimated discount {pred[3]}')

In [None]:

print(f'Estimated discount {pred[0]}')