In [8]:
!pip install pandas gdown
!pip install missforest



In [9]:
import os

import gdown
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import XGBRegressor

### Downloading the data from Google Drive

In [10]:
url = 'https://drive.google.com/uc?id=1sXMdspt9jkBVQ-gzeTZikraGxPXhnnZo' # gdrive url of the dataset
output = 'brewery_data_complete_extended.csv'

# The dataset is about 2.6 GB, so only fetch it when no local copy exists yet.
# Delete the local file to force a fresh download.
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)                             # download the data to the colab instance

df = pd.read_csv(output)                                                 # Load it as a pandas dataframe

Downloading...
From: https://drive.google.com/uc?id=1sXMdspt9jkBVQ-gzeTZikraGxPXhnnZo
To: /content/brewery_data_complete_extended.csv
100%|██████████| 2.62G/2.62G [00:26<00:00, 97.5MB/s]


In [11]:
# Report the number of rows and columns of the full dataset that was just loaded.
print(f'Shape of the dataset : Rows={df.shape[0]} , Columns={df.shape[1]}')

Shape of the dataset : Rows=10000000 , Columns=20


### Simple Random Sampling

In [12]:
# Draw a fixed-size simple random sample of rows; the seed keeps the sample reproducible.
df_sampled = df.sample(n=400000, random_state=10)

### Generating missing values at random

In [13]:
# Fix NumPy's global seed so the same cells are masked on every run.
np.random.seed(10)
# Probability that any individual cell is blanked out.
p=0.3
# Boolean array with the same shape as df_sampled; each entry is True with probability p.
mask = np.random.choice([True, False], size=df_sampled.shape, p=[p, 1-p])
# DataFrame.mask replaces every entry where `mask` is True with NaN.
df_masked = df_sampled.mask(mask)

In [14]:
# Remove the identifier-type columns before imputation.
df_masked = df_masked.drop(
    ['Batch_ID', 'Brew_Date', 'Location', 'Ingredient_Ratio', 'SKU'],
    axis='columns',
)

In [15]:
# Missing-value count per column. With p = 0.3 applied to 400,000 sampled rows,
# each count should be close to (not exactly) 120,000, i.e. roughly 30% missing.
df_masked.isnull().sum()

Beer_Style                      119798
Fermentation_Time               120188
Temperature                     120339
pH_Level                        119899
Gravity                         120188
Alcohol_Content                 119868
Bitterness                      119974
Color                           119545
Volume_Produced                 120248
Total_Sales                     120145
Quality_Score                   120222
Brewhouse_Efficiency            119595
Loss_During_Brewing             120083
Loss_During_Fermentation        119438
Loss_During_Bottling_Kegging    120779
dtype: int64

### Missforest Imputation

In [17]:
from missforest.missforest import MissForest
import pandas as pd
import numpy as np

In [18]:
# Create the baseline MissForest imputer with its default settings.
mf = MissForest()

In [19]:
%%time
imputed_data = mf.fit_transform(X=df_masked,categorical = ['Beer_Style']);


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2873
[LightGBM] [Info] Number of data points in the train set: 1922, number of used features: 14
[LightGBM] [Info] Start training from score -2.283007
[LightGBM] [Info] Start training from score -2.023787
[LightGBM] [Info] Start training from score -2.031693
[LightGBM] [Info] Start training from score -2.131776
[LightGBM] [Info] Start training from score -2.088851
[LightGBM] [Info] Start training from score -2.019858
[LightGBM] [Info] Start training from score -2.015944
[LightGBM] [Info] Start training from score -2.068060
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2870
[LightGBM] [Info] Nu

### ML Pipeline for Missforest

In [20]:
target = 'Fermentation_Time'

# Target vector, and feature matrix built from every other imputed column;
# get_dummies one-hot encodes whatever non-numeric columns remain.
y_ = imputed_data[target]
X_ = pd.get_dummies(imputed_data.drop(columns=[target]))

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, test_size=0.2, random_state=42
)

In [21]:
# Train a default XGBoost regressor on the training split and predict the held-out split.
mod = XGBRegressor()
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)

# mean_squared_error returns the plain MSE (no square root is applied here), so the
# label must say "Mean Squared Error"; the RMSE comparison later takes np.sqrt of this value.
mse_miss = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse_miss}")

Mean Squared Error: 5.883795732294364


### Modified Missforest

In [23]:
!pip install -i https://test.pypi.org/simple/ MissingValImputerDats6450
# from missingvaluehandler import *
# from errors import *
# from missforest import *

Looking in indexes: https://test.pypi.org/simple/


In [24]:
from MissingValImputerDats6450.MissingValImputerDats6450 import MissingValImputerDats6450

In [25]:
# Create the modified MissForest imputer with its default settings.
mvh = MissingValImputerDats6450()

In [26]:
%%time
# Impute with the modified MissForest. 'Fermentation_Time' is the same column that the
# evaluation cells use as the prediction target; presumably the imputer treats it as the
# target column — TODO confirm against the package documentation.
fitted = mvh.fit_transform(df_masked,'Fermentation_Time',categorical = ['Beer_Style'])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2626
[LightGBM] [Info] Number of data points in the train set: 32849, number of used features: 14
[LightGBM] [Info] Start training from score 79.985669
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2626
[LightGBM] [Info] Number of data points in the train set: 32849, number of used features: 14
[LightGBM] [Info] Start training from score 2.999074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2626
[LightGBM] [Info] Number of data points in the train se

### ML Pipeline for modified Missforest

In [27]:
# Build the target vector and the feature matrix from the modified imputer's output;
# get_dummies one-hot encodes whatever non-numeric columns remain.
y_ = fitted[target]
X_ = pd.get_dummies(fitted.drop(columns=[target]))

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, test_size=0.2, random_state=42
)

In [28]:
# Inspect the feature columns produced by the one-hot encoding step.
X_train.columns

Index(['Temperature', 'pH_Level', 'Gravity', 'Alcohol_Content', 'Bitterness',
       'Color', 'Volume_Produced', 'Total_Sales', 'Quality_Score',
       'Brewhouse_Efficiency', 'Loss_During_Brewing',
       'Loss_During_Fermentation', 'Loss_During_Bottling_Kegging',
       'Beer_Style_Ale', 'Beer_Style_IPA', 'Beer_Style_Lager',
       'Beer_Style_Pilsner', 'Beer_Style_Porter', 'Beer_Style_Sour',
       'Beer_Style_Stout', 'Beer_Style_Wheat Beer'],
      dtype='object')

In [29]:
# Train a default XGBoost regressor on the training split and predict the held-out split.
mod = XGBRegressor()
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)

# Score the predictions with the plain (un-rooted) mean squared error.
mse_mmiss = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse_mmiss}")

Mean Squared Error: 5.775307912661294


In [30]:
# Both variables hold MSE values, so take square roots first to compare the two
# imputation pipelines on the RMSE scale.
print(f'Difference in RMSE : {abs(np.sqrt(mse_mmiss)-np.sqrt(mse_miss))}')

Difference in RMSE : 0.022466638742875134
