# Choosing a model

In [14]:
import pandas as pd
import numpy as np

In [15]:
# Load the dataset produced by the earlier "log and preprocessing 1" step.
# NOTE(review): 'Amount' presumably holds log-transformed values, as the
# step name suggests — confirm against the preprocessing notebook.
df = pd.read_csv('pre1_pharma.csv')
# Bare last expression: render the loaded frame as the cell output.
df

Unnamed: 0,City,Day,Code,Amount
0,Thành phố Hà Nội,2018-10-04,G00821,3.401197
1,Tỉnh Thanh Hóa,2018-10-13,Z00200,5.298317
2,Thành phố Hồ Chí Minh,2018-10-16,Z00200,3.401197
3,Thành phố Hồ Chí Minh,2018-10-16,Z00200,5.703782
4,Thành phố Hồ Chí Minh,2018-10-17,Z00200,4.605170
...,...,...,...,...
56147,Thành phố Hà Nội,2022-07-11,P01364,1.945910
56148,Thành phố Hà Nội,2022-07-11,P01364,2.639057
56149,Thành phố Hà Nội,2022-07-11,P01364,1.945910
56150,Thành phố Hà Nội,2022-07-11,N01080,0.693147


In [16]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56152 entries, 0 to 56151
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   City    56152 non-null  object 
 1   Day     56152 non-null  object 
 2   Code    56152 non-null  object 
 3   Amount  56152 non-null  float64
dtypes: float64(1), object(3)
memory usage: 1.7+ MB


In [17]:
# Parse the 'Day' column into datetimes and use it as the index.
# The guard makes the cell idempotent: once 'Day' is the index the column no
# longer exists, so an unguarded re-run would raise KeyError: 'Day'.
# A non-inplace chain is used so the cell rebinds `df` in a single step.
if df.index.name != 'Day':
    df = df.assign(Day=pd.to_datetime(df['Day'])).set_index('Day')

In [18]:
# Render the frame as the cell output to inspect its current index and columns.
df

Unnamed: 0_level_0,City,Code,Amount
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-04,Thành phố Hà Nội,G00821,3.401197
2018-10-13,Tỉnh Thanh Hóa,Z00200,5.298317
2018-10-16,Thành phố Hồ Chí Minh,Z00200,3.401197
2018-10-16,Thành phố Hồ Chí Minh,Z00200,5.703782
2018-10-17,Thành phố Hồ Chí Minh,Z00200,4.605170
...,...,...,...
2022-07-11,Thành phố Hà Nội,P01364,1.945910
2022-07-11,Thành phố Hà Nội,P01364,2.639057
2022-07-11,Thành phố Hà Nội,P01364,1.945910
2022-07-11,Thành phố Hà Nội,N01080,0.693147


In [19]:
# Time-step feature: number the rows 0, 1, 2, ... in their current order.
# NOTE(review): this is a row counter rather than elapsed time — rows that
# share the same date still receive different values; confirm this is intended.
df['Time'] = np.arange(df.shape[0])

df.head()

Unnamed: 0_level_0,City,Code,Amount,Time
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-04,Thành phố Hà Nội,G00821,3.401197,0
2018-10-13,Tỉnh Thanh Hóa,Z00200,5.298317,1
2018-10-16,Thành phố Hồ Chí Minh,Z00200,3.401197,2
2018-10-16,Thành phố Hồ Chí Minh,Z00200,5.703782,3
2018-10-17,Thành phố Hồ Chí Minh,Z00200,4.60517,4


## Encoding

In [20]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each column so the dummies are not perfectly collinear.
# dtype=np.uint8 pins the dummy columns to 0/1 integers: since pandas 2.0 the
# default is bool, which yields a mixed bool/int frame that converts to an
# object array and cannot be fed to Keras.
df = pd.get_dummies(df, columns=['City', 'Code'], drop_first=True, dtype=np.uint8)
df

Unnamed: 0_level_0,Amount,Time,City_Thành phố Hải Phòng,City_Thành phố Hồ Chí Minh,City_Tỉnh Nghệ An,City_Tỉnh Thanh Hóa,City_Tỉnh Thái Bình,City_Tỉnh Thái Nguyên,Code_M01090,Code_M01225,Code_N01080,Code_P01146,Code_P01364,Code_P01481,Code_Z00200
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-10-04,3.401197,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-10-13,5.298317,1,0,0,0,1,0,0,0,0,0,0,0,0,1
2018-10-16,3.401197,2,0,1,0,0,0,0,0,0,0,0,0,0,1
2018-10-16,5.703782,3,0,1,0,0,0,0,0,0,0,0,0,0,1
2018-10-17,4.605170,4,0,1,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-11,1.945910,56147,0,0,0,0,0,0,0,0,0,0,1,0,0
2022-07-11,2.639057,56148,0,0,0,0,0,0,0,0,0,0,1,0,0
2022-07-11,1.945910,56149,0,0,0,0,0,0,0,0,0,0,1,0,0
2022-07-11,0.693147,56150,0,0,0,0,0,0,0,0,1,0,0,0,0


# Test-train split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:

# Target is the 'Amount' column; every remaining column is a feature.
y = df.loc[:, 'Amount']
X = df.drop(columns='Amount')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows — confirm this is acceptable for
# date-ordered data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# The 'neg_root_mean_squared_error' scorer reports the *negated* RMSE per fold,
# so the sign is flipped to obtain positive RMSE values.
model_linear = LinearRegression()
neg_rmse_per_fold = cross_val_score(
    model_linear,
    X,
    y,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
scores = -neg_rmse_per_fold

# Report the RMSE averaged over the 5 folds.
print("RMSE scores:\n", scores.mean())


RMSE scores:
 1.2315801115992453


# Random Forest

In [24]:
# Disabled experiment: the whole cell is commented out and does not run.
# NOTE(review): before re-enabling, two things look wrong:
#   * 'Amount' is a continuous target, so RandomForestClassifier would be
#     rejected by sklearn; RandomForestRegressor is presumably what was meant.
#   * the scoring below is RMSE, not MAE as the inner comment says.
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score

# # Multiply by -1 since sklearn calculates *negative* MAE
# model_random = RandomForestClassifier(n_estimators=18000, 
#                                 max_depth=50, 
#                                 max_features=14, 
#                                 min_samples_leaf=4, 
#                                 random_state=0)
# scores = -1 * cross_val_score(model_random, X, y,
#                             cv=5,
#                             scoring='neg_root_mean_squared_error')

# print("RMSE scores:\n", scores.mean())


# ANN

In [26]:
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError

# Define R-squared metric
def r_square(y_true, y_pred):
    """Coefficient of determination (R^2), usable as a Keras metric function.

    Computes ``1 - SS_res / SS_tot`` over all elements of the tensors passed
    in. When used as a Keras metric this is evaluated per batch and the
    per-batch values are averaged, so it approximates the dataset-level R^2.

    Args:
        y_true: Ground-truth target values.
        y_pred: Model predictions with the same number of elements as y_true.

    Returns:
        Scalar tensor holding the R^2 value for the given values.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Match dtypes so e.g. float64 labels can be subtracted from float32 outputs.
    y_true = tf.cast(y_true, y_pred.dtype)
    # Flatten both sides: a (batch,) target against a (batch, 1) prediction
    # would otherwise broadcast to (batch, batch) and silently distort the sums.
    y_true = tf.reshape(y_true, [-1])
    y_pred = tf.reshape(y_pred, [-1])

    residual = tf.reduce_sum(tf.square(y_true - y_pred))
    total = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true)))
    # Small constant avoids NaN/inf when every target in the batch is equal
    # (total sum of squares is zero), which would poison the running mean.
    r2 = 1.0 - residual / (total + 1e-7)
    return r2

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable.
tf.random.set_seed(42)

# Keras expects one homogeneous numeric array; converting explicitly avoids
# failures when the feature frame mixes integer and boolean dummy columns.
train_features = X_train.to_numpy(dtype='float32')
train_target = y_train.to_numpy(dtype='float32')
test_features = X_test.to_numpy(dtype='float32')
test_target = y_test.to_numpy(dtype='float32')

# Input width follows the data instead of a hard-coded 14, so the network
# still builds if the set of encoded columns changes.
input_nn = train_features.shape[1]

# Hidden layer widths, in order; every hidden layer uses ReLU and the same
# L2 weight penalty.
hidden_units = (300, 300, 150, 150, 75, 75, 50, 50, 25, 25)
l2_strength = 0.08

# Define the model architecture
hidden_layers = [
    tf.keras.layers.Dense(hidden_units[0], input_shape=(input_nn,), activation='relu',
                          kernel_regularizer=regularizers.l2(l2_strength))
]
hidden_layers += [
    tf.keras.layers.Dense(units, activation='relu',
                          kernel_regularizer=regularizers.l2(l2_strength))
    for units in hidden_units[1:]
]
model_tensor = tf.keras.Sequential(
    hidden_layers + [tf.keras.layers.Dense(1, activation='linear')]
)

# Compile the model
model_tensor.compile(optimizer=Adam(),
                loss='mse',
                metrics=[RootMeanSquaredError(), r_square])

# Stop once the validation loss has not improved for 20 epochs and roll back
# to the best weights, so the 1000-epoch budget is an upper bound rather than
# something that has to be interrupted by hand.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=20,
                                              restore_best_weights=True)

# Train the model. 10% of the training rows are held out for validation;
# verbose=2 logs one line per epoch instead of a per-batch progress bar.
model_tensor.fit(train_features, train_target,
                 batch_size=64,
                 epochs=1000,
                 validation_split=0.1,
                 callbacks=[early_stop],
                 verbose=2)

# Evaluate the model on the test data
# score is [loss, RMSE, R-squared], in the order given to compile().
score = model_tensor.evaluate(test_features, test_target, verbose=0)
print('Test loss:', score[0])
print('Test root mean squared error:', score[1])
print('Test R-squared:', score[2])


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000

KeyboardInterrupt: 