In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import joblib

# Train a single-timestep LSTM regressor for 'Sourcing Cost', then persist both
# the network and the fitted feature scaler.
from tensorflow.keras import Input
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable from one run to the next.
set_random_seed(42)

# Load the training dataset
train_data = pd.read_csv('DS_ML Coding Challenge Dataset (1).xlsx - Training Dataset.csv')

# Convert 'Month of Sourcing' column to datetime format
train_data['Month of Sourcing'] = pd.to_datetime(train_data['Month of Sourcing'], format='%b-%y')

# Convert datetime to numeric representation (number of days since a reference date)
train_data['Month of Sourcing'] = (train_data['Month of Sourcing'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

# Extract features and target variable
X_train = train_data[['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month of Sourcing']]
y_train = train_data['Sourcing Cost']

# Convert categorical variables into dummy/indicator variables
X_train = pd.get_dummies(X_train, drop_first=True)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Persist the fitted scaler: it carries the training means/variances and the
# encoded column names, both of which are required to prepare inference data
# in a fresh kernel.
joblib.dump(scaler, 'lstm_scaler.pkl')

# Reshape the input for LSTM (samples, timesteps, features); every row is
# treated as a sequence of length 1.
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))

# Initialize the LSTM model
model_lstm = Sequential()

# Declare the input shape with an explicit Input layer; passing `input_shape`
# to the first layer is deprecated and emits a UserWarning in recent Keras.
model_lstm.add(Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))

# Add LSTM layer
model_lstm.add(LSTM(units=50))

# Add output layer (one linear unit: the predicted cost)
model_lstm.add(Dense(units=1))

# Compile the model
model_lstm.compile(optimizer='adam', loss='mse')

# Fit the LSTM model
model_lstm.fit(X_train_reshaped, y_train, epochs=25, batch_size=32)

# Save the trained model
model_lstm.save('lstm_model.h5')


  super().__init__(**kwargs)


Epoch 1/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 737us/step - loss: 16325.3945
Epoch 2/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 714us/step - loss: 7583.7495
Epoch 3/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 721us/step - loss: 6596.7236
Epoch 4/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 712us/step - loss: 7080.6699
Epoch 5/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 708us/step - loss: 5810.7559
Epoch 6/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 676us/step - loss: 5685.0605
Epoch 7/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 623us/step - loss: 7327.7935
Epoch 8/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 643us/step - loss: 7903.2041
Epoch 9/25
[1m17193/17193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 692us/step - loss: 6508.5591




In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
import joblib

# Score the test dataset with the saved LSTM model.
# NOTE: this cell relies on `scaler`, the StandardScaler fitted in the LSTM
# training cell, still being present in the kernel namespace.

# Load the test dataset
test_data = pd.read_csv('DS_ML Coding Challenge Dataset (1).xlsx - Test Dataset.csv')

test_data['Month of Sourcing'] = pd.to_datetime(test_data['Month of Sourcing'], format='%b-%y')

# Convert datetime to numeric representation (number of days since a reference date)
test_data['Month of Sourcing'] = (test_data['Month of Sourcing'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

# Extract features
X_test = test_data[['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month of Sourcing']]

# One-hot encode every category level present in the test data. `drop_first`
# is deliberately NOT used here: which level is "first" depends on the data,
# so dropping it independently on the test set can remove a different level
# than the one dropped during training.
X_test = pd.get_dummies(X_test)

# Project onto exactly the columns (and column order) the scaler was fitted
# on. Levels absent from the test set become all-zero columns; levels that
# were not encoded at training time (including the dropped baseline) are
# discarded.
X_test = X_test.reindex(columns=scaler.feature_names_in_, fill_value=0)

# Scale the features with the training-time statistics
X_test_scaled = scaler.transform(X_test)

# Reshape the input for LSTM (samples, timesteps, features)
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Load the trained LSTM model (compile=False: only inference is needed, so the
# optimizer/loss configuration is not restored)
model_lstm = load_model('lstm_model.h5', compile= False)

# Predict on the test set using the trained model
y_pred_test_lstm = model_lstm.predict(X_test_reshaped)

# Print the predicted sourcing costs
print("Predicted Sourcing Cost (LSTM):")
print(y_pred_test_lstm.flatten())


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step  
Predicted Sourcing Cost (LSTM):
[122.87993  136.90393  152.38939  121.44141  147.74292  160.7605
  43.57338   56.13659  145.04924  139.47333  115.48572   70.59926
  65.09178  103.95442  138.03671  141.28114  144.5171   141.98271
 122.16222  130.08424  140.49763  112.34224  120.3725     5.686247
  54.8114    26.557713  33.18408   38.351597  34.349365 199.05772
 183.3354    77.496925  39.918625  26.876389  45.902176 122.074066
 152.49416  135.62251  106.07156  192.70653   26.864319  13.109159
 144.18861  156.38403   95.28659   37.977676 162.19402  182.4193
  89.83034   74.88184   70.581436 127.791214  76.84285   43.071312
 173.3926   111.81937  123.728516 195.35573  165.59435  117.107925
  28.445017 119.95503  106.390945 139.46352  152.41185  125.62764
 117.17116  146.2581   124.20471  135.30272  125.138275  99.65614
  58.59089   24.909256  55.57221   44.439156  29.999046 113.18698
  75.23874    6.920702  66.37496  

In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib

# XGBoost: fit a regressor on the training export and pickle it to disk.

# Read the raw training rows.
train_data = pd.read_csv('DS_ML Coding Challenge Dataset (1).xlsx - Training Dataset.csv')

# Parse the abbreviated-month/two-digit-year strings, then store the month as
# an integer count of whole days elapsed since 1970-01-01.
parsed_month = pd.to_datetime(train_data['Month of Sourcing'], format='%b-%y')
train_data['Month of Sourcing'] = (parsed_month - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

# Target first, then the predictors with string columns expanded into
# indicator columns (first level of each dropped).
y_train = train_data['Sourcing Cost']
X_train = pd.get_dummies(
    train_data[[
        'ProductType',
        'Manufacturer',
        'Area Code',
        'Sourcing Channel',
        'Product Size',
        'Product Type',
        'Month of Sourcing',
    ]],
    drop_first=True,
)

# Default-hyperparameter XGBoost regressor.
model_xgb = xgb.XGBRegressor()
model_xgb.fit(X_train, y_train)

# Serialise the fitted estimator; the returned file list is the cell output.
joblib.dump(model_xgb, 'xgboost_model.pkl')


['xgboost_model.pkl']

In [7]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import joblib

# LightGBM: fit a regressor on the training export and pickle it to disk.

# Read the raw training rows.
train_data = pd.read_csv('DS_ML Coding Challenge Dataset (1).xlsx - Training Dataset.csv')

# Parse the abbreviated-month/two-digit-year strings, then store the month as
# an integer count of whole days elapsed since 1970-01-01.
parsed_month = pd.to_datetime(train_data['Month of Sourcing'], format='%b-%y')
train_data['Month of Sourcing'] = (parsed_month - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

# Target first, then the predictors with string columns expanded into
# indicator columns (first level of each dropped).
y_train = train_data['Sourcing Cost']
X_train = pd.get_dummies(
    train_data[[
        'ProductType',
        'Manufacturer',
        'Area Code',
        'Sourcing Channel',
        'Product Size',
        'Product Type',
        'Month of Sourcing',
    ]],
    drop_first=True,
)

# Default-hyperparameter LightGBM regressor.
model_lgb = lgb.LGBMRegressor()
model_lgb.fit(X_train, y_train)

# Serialise the fitted estimator; the returned file list is the cell output.
joblib.dump(model_lgb, 'lightgbm_model.pkl')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011519 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 550176, number of used features: 55
[LightGBM] [Info] Start training from score 108.817286


['lightgbm_model.pkl']

In [9]:
import pandas as pd
import xgboost as xgb
import joblib

# Score the test dataset with the pickled XGBoost model.

# Load the test dataset
test_data = pd.read_csv('DS_ML Coding Challenge Dataset (1).xlsx - Test Dataset.csv')

# Convert 'Month of Sourcing' column to datetime format
test_data['Month of Sourcing'] = pd.to_datetime(test_data['Month of Sourcing'], format='%b-%y')

# Convert datetime to numeric representation (number of days since a reference date)
test_data['Month of Sourcing'] = (test_data['Month of Sourcing'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

# Extract features
X_test = test_data[['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month of Sourcing']]

# Load the trained XGBoost model.
# NOTE: joblib.load unpickles arbitrary objects; only load files you created.
model_xgb = joblib.load('xgboost_model.pkl')

# Feature names recorded by the booster at fit time. They define the exact
# columns, and column order, the model expects.
train_columns = model_xgb.get_booster().feature_names
if train_columns is None:
    raise ValueError(
        "The loaded XGBoost model has no stored feature names, so the test "
        "features cannot be aligned with the training columns."
    )

# One-hot encode every category level present in the test data. `drop_first`
# is deliberately NOT used here: which level is "first" depends on the data,
# so dropping it independently on the test set can remove a different level
# than the one dropped during training.
X_test = pd.get_dummies(X_test)

# Project onto the training columns. Levels absent from the test set become
# all-zero columns; levels that were not encoded at training time (including
# the dropped baseline) are discarded.
X_test = X_test.reindex(columns=train_columns, fill_value=0)

# Predict on the test set using the trained model
y_pred_test_xgb = model_xgb.predict(X_test)

# Print the predicted sourcing costs
print("Predicted Sourcing Cost (XGBoost):")
print(y_pred_test_xgb)


Predicted Sourcing Cost (XGBoost):
[112.53496  153.59299  149.86044  145.43594  169.56429  173.7738
  48.40108   66.15772  150.14774  149.32771  149.61943  133.9744
  94.659904 141.45634  140.92635  151.98058  156.44974  142.05586
 142.71555  149.45415  150.64648  147.1959   147.82664   17.159203
  79.719734  32.06247   33.8921    30.179289  24.637184 216.41849
 189.73964  170.93263   32.944557  25.22425   69.16799  150.20833
 170.91917  145.28969  144.02716  183.52249   47.938313  35.7663
 170.78976  173.05342  102.269516  41.178024 211.37619  212.7497
 108.35295   72.27743   57.020947 149.30782   97.75054   26.040003
 179.89725  125.62097  173.38013  199.1317   185.20116  143.21678
 130.50778  148.262    148.03319  142.9592   143.99173  143.86136
 143.62859  143.0835   145.05606  152.28763  148.02864  114.191795
  72.61399   33.9718    63.71227   55.20187   25.908846 141.47995
  93.2865     6.495615  71.57892   40.87667  158.1807   108.0876
  70.43418  118.51166   67.01067   30.57753

In [10]:
import pandas as pd
import lightgbm as lgb
import joblib

# Score the test dataset with the pickled LightGBM model.

# Load the test dataset
test_data = pd.read_csv('DS_ML Coding Challenge Dataset (1).xlsx - Test Dataset.csv')

# Convert 'Month of Sourcing' column to datetime format
test_data['Month of Sourcing'] = pd.to_datetime(test_data['Month of Sourcing'], format='%b-%y')

# Convert datetime to numeric representation (number of days since a reference date)
test_data['Month of Sourcing'] = (test_data['Month of Sourcing'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1D')

# Extract features
X_test = test_data[['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month of Sourcing']]

# Load the trained LightGBM model.
# NOTE: joblib.load unpickles arbitrary objects; only load files you created.
model_lgb = joblib.load('lightgbm_model.pkl')

# Feature names recorded by the model at fit time, in training order.
train_columns = list(model_lgb.feature_name_)

# One-hot encode every category level present in the test data. `drop_first`
# is deliberately NOT used here: which level is "first" depends on the data,
# so dropping it independently on the test set can remove a different level
# than the one dropped during training.
X_test = pd.get_dummies(X_test)

# NOTE(review): LightGBM appears to store feature names with whitespace
# replaced by underscores - confirm for the installed version. Both sides are
# normalised the same way so the match works whether or not that happened.
X_test.columns = [str(name).replace(' ', '_') for name in X_test.columns]
normalised_train_columns = [name.replace(' ', '_') for name in train_columns]

# Project onto the training columns. Levels absent from the test set become
# all-zero columns; levels that were not encoded at training time (including
# the dropped baseline) are discarded.
X_test = X_test.reindex(columns=normalised_train_columns, fill_value=0)

# Restore the model's own spelling of the names so the frame matches it exactly.
X_test.columns = train_columns

# Predict on the test set using the trained model
y_pred_test_lgb = model_lgb.predict(X_test)

# Print the predicted sourcing costs
print("Predicted Sourcing Cost (LightGBM):")
print(y_pred_test_lgb)


Predicted Sourcing Cost (LightGBM):
[111.87988611 153.69420111 153.38247842 144.37636632 168.57736791
 171.05996081  53.90709126  61.67959004 146.19531602 146.19531602
 146.81551464 134.7860545   95.93801217 142.12272701 142.12272701
 149.20031568 150.31068258 144.41166615 143.19877539 150.12783321
 150.40349601 145.67641876 145.67641876  22.03150276  79.14193391
  32.90811574  23.42250899  30.79224679  25.30213589 219.20916952
 185.65915776 166.96875094  31.77966854  24.58481237  66.86403181
 150.19562697 160.77241912 145.8825077  144.09164729 181.37266234
  53.53490778  41.86695227 171.11442223 171.1653136  104.85864409
  40.77481057 210.87553401 211.72042837 107.01929289  70.70078709
  60.98523072 148.5359255  106.78083172  97.33139226 182.70311984
 134.29878018 165.4613988  202.90808825 196.4808534  144.54166503
 139.05716957 148.98515753 147.28913734 144.49077366 144.54166503
 146.73000725 144.71925046 145.92606548 145.59876408 151.52696144
 147.38462235 106.30793635  68.73408857 