# Choosing a model

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset produced by the earlier "log + preprocessing 1" step
df = pd.read_csv(filepath_or_buffer='pre1_pharma.csv')
df

Unnamed: 0,City,Day,Code,Amount
0,Thành phố Hà Nội,2018-10-04,G00821,3.401197
1,Tỉnh Thanh Hóa,2018-10-13,Z00200,5.298317
2,Thành phố Hồ Chí Minh,2018-10-16,Z00200,3.401197
3,Thành phố Hồ Chí Minh,2018-10-16,Z00200,5.703782
4,Thành phố Hồ Chí Minh,2018-10-17,Z00200,4.605170
...,...,...,...,...
56147,Thành phố Hà Nội,2022-07-11,P01364,1.945910
56148,Thành phố Hà Nội,2022-07-11,P01364,2.639057
56149,Thành phố Hà Nội,2022-07-11,P01364,1.945910
56150,Thành phố Hà Nội,2022-07-11,N01080,0.693147


In [3]:
# Print column names, dtypes, non-null counts and memory usage of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56152 entries, 0 to 56151
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   City    56152 non-null  object 
 1   Day     56152 non-null  object 
 2   Code    56152 non-null  object 
 3   Amount  56152 non-null  float64
dtypes: float64(1), object(3)
memory usage: 1.7+ MB


In [4]:
# Parse the 'Day' column into datetimes and use it as the index.
# The guard makes the cell safe to re-run: once 'Day' has become the index it is
# no longer a column, and the unguarded version would raise KeyError: 'Day'.
# Reassigning (instead of inplace=True) keeps the transformation explicit.
if 'Day' in df.columns:
    df = df.assign(Day=pd.to_datetime(df['Day'])).set_index('Day')

In [5]:
# Display the frame to check the result of the datetime/index conversion
df

Unnamed: 0_level_0,City,Code,Amount
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-04,Thành phố Hà Nội,G00821,3.401197
2018-10-13,Tỉnh Thanh Hóa,Z00200,5.298317
2018-10-16,Thành phố Hồ Chí Minh,Z00200,3.401197
2018-10-16,Thành phố Hồ Chí Minh,Z00200,5.703782
2018-10-17,Thành phố Hồ Chí Minh,Z00200,4.605170
...,...,...,...
2022-07-11,Thành phố Hà Nội,P01364,1.945910
2022-07-11,Thành phố Hà Nội,P01364,2.639057
2022-07-11,Thành phố Hà Nội,P01364,1.945910
2022-07-11,Thành phố Hà Nội,N01080,0.693147


In [6]:
# Time-step feature: a 0-based running counter with one value per row,
# in the frame's current row order
n_rows = df.shape[0]
df['Time'] = np.arange(n_rows)

df.head()

Unnamed: 0_level_0,City,Code,Amount,Time
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-04,Thành phố Hà Nội,G00821,3.401197,0
2018-10-13,Tỉnh Thanh Hóa,Z00200,5.298317,1
2018-10-16,Thành phố Hồ Chí Minh,Z00200,3.401197,2
2018-10-16,Thành phố Hồ Chí Minh,Z00200,5.703782,3
2018-10-17,Thành phố Hồ Chí Minh,Z00200,4.60517,4


## Encoding

In [7]:
# One-hot encode the categorical columns; drop_first=True drops the first level
# of each so the dummy columns are not perfectly collinear
categorical_cols = ['City', 'Code']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df

Unnamed: 0_level_0,Amount,Time,City_Thành phố Hải Phòng,City_Thành phố Hồ Chí Minh,City_Tỉnh Nghệ An,City_Tỉnh Thanh Hóa,City_Tỉnh Thái Bình,City_Tỉnh Thái Nguyên,Code_M01090,Code_M01225,Code_N01080,Code_P01146,Code_P01364,Code_P01481,Code_Z00200
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-10-04,3.401197,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2018-10-13,5.298317,1,0,0,0,1,0,0,0,0,0,0,0,0,1
2018-10-16,3.401197,2,0,1,0,0,0,0,0,0,0,0,0,0,1
2018-10-16,5.703782,3,0,1,0,0,0,0,0,0,0,0,0,0,1
2018-10-17,4.605170,4,0,1,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-11,1.945910,56147,0,0,0,0,0,0,0,0,0,0,1,0,0
2022-07-11,2.639057,56148,0,0,0,0,0,0,0,0,0,0,1,0,0
2022-07-11,1.945910,56149,0,0,0,0,0,0,0,0,0,0,1,0,0
2022-07-11,0.693147,56150,0,0,0,0,0,0,0,0,1,0,0,0,0


# Test-train split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:

# The target is the 'Amount' column; every other column is used as a feature
target_col = 'Amount'
y = df[target_col]
X = df.drop(columns=target_col)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# The 'neg_root_mean_squared_error' scorer returns *negated* RMSE values,
# so flip the sign to obtain positive per-fold RMSE.
model_linear = LinearRegression()
neg_rmse_per_fold = cross_val_score(
    model_linear, X, y, cv=5, scoring='neg_root_mean_squared_error'
)
scores = -neg_rmse_per_fold

# Report the RMSE averaged over the 5 folds
print("RMSE scores:\n", scores.mean())


RMSE scores:
 1.2315801115992453


# Random Forest

In [19]:
# NOTE(review): this cell is left commented out. The recorded traceback below comes
# from an earlier version that used RandomForestClassifier, which rejects the target
# with "Unknown label type: 'continuous'" because `Amount` is a continuous value.
# The sketch below uses RandomForestRegressor instead. n_estimators=18000 across
# 5 CV folds is likely very slow; consider a much smaller forest before enabling.

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import cross_val_score

# # Multiply by -1 since this scorer returns *negative* RMSE
# model_random = RandomForestRegressor(n_estimators=18000,
#                                 max_depth=50,
#                                 max_features=14,
#                                 min_samples_leaf=4,
#                                 random_state=0)
# scores = -1 * cross_val_score(model_random, X, y,
#                             cv=5,
#                             scoring='neg_root_mean_squared_error')

# print("RMSE scores:\n", scores.mean())


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 385, in fit
    y, expanded_class_weight = self._validate_y_class_weight(y)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 746, in _validate_y_class_weight
    check_classification_targets(y)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\multiclass.py", line 218, in check_classification_targets
    raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'


# ANN

In [None]:
import tensorflow as tf

# Hidden-layer widths (input side first) and the L2 weight penalty shared by
# every hidden layer.
HIDDEN_UNITS = (300, 300, 150, 150, 75, 75, 50, 50, 25, 25)
L2_PENALTY = 0.08

# Keras needs a single homogeneous numeric array. The feature frame mixes the
# integer 'Time' column with dummy columns (uint8 or bool depending on the pandas
# version), so cast everything to float32 up front.
X_train_nn = X_train.astype('float32')
X_test_nn = X_test.astype('float32')

# Derive the input width from the data instead of hard-coding it, so the network
# always matches the number of feature columns actually produced by the encoding.
n_features = X_train_nn.shape[1]

# Define the model architecture: a stack of L2-regularised ReLU layers followed
# by a single linear unit for the regression output.
ann_layers = [tf.keras.Input(shape=(n_features,))]
ann_layers += [
    tf.keras.layers.Dense(
        units,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(L2_PENALTY),
    )
    for units in HIDDEN_UNITS
]
ann_layers.append(tf.keras.layers.Dense(1, activation='linear'))
model_tensor = tf.keras.Sequential(ann_layers)

# Compile the model. 'accuracy' is not meaningful for a continuous target, so
# track RMSE instead; it is also directly comparable with the cross-validated
# RMSE reported for the other models.
model_tensor.compile(optimizer=tf.keras.optimizers.Adam(),
                     loss='mse',
                     metrics=['mse', tf.keras.metrics.RootMeanSquaredError()])

# Train the model
model_tensor.fit(X_train_nn, y_train, batch_size=64, epochs=1000, verbose=1)

# Evaluate on the held-out test data.
# score[0] is the loss (MSE plus the L2 penalty terms); score[1] and score[2]
# follow the order of the `metrics` list above.
score = model_tensor.evaluate(X_test_nn, y_test, verbose=0)
print('Test loss:', score[0])
print('Test mean squared error:', score[1])
print('Test root mean squared error:', score[2])