In [37]:
import numpy as np
import pandas as pd

from dill import dump, load

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [38]:
# Load the raw CSV once and preview its first five rows.
df_original = pd.read_csv(filepath_or_buffer='house_synthetic_data.csv')
df_original.head(n=5)

Unnamed: 0,household_id,surface_type,room_size,dirtiness_level,cleaning_time,cleaning_efficiency,date_time
0,0,Tile,33,3,17,1.41,2023-07-23 21:01:08
1,1,Carpet,23,4,23,1.11,2023-07-21 16:05:03
2,2,Carpet,30,2,17,1.26,2023-07-21 10:44:45
3,3,Tile,27,2,26,1.31,2023-07-20 22:35:36
4,4,Carpet,40,1,15,1.58,2023-07-21 15:24:15


In [39]:
# Build the working frame without touching df_original:
# dropna() removes rows containing missing values, drop_duplicates() removes repeated rows.
df = df_original.dropna().drop_duplicates()
print(f'After removing the rows with Null values and the duplicate entries, {df.shape[0]} applications remained for further analysis.')

After removing the rows with Null values and the duplicate entries, 5000 applications remained for further analysis.


In [40]:
# Feature frame: every column except the two regression targets.
X = df.drop(columns=['cleaning_time', 'cleaning_efficiency'])

# One independent target Series per model.
y_time, y_efficiency = (
    df[target].copy() for target in ('cleaning_time', 'cleaning_efficiency')
)

tuple(part.shape for part in (X, y_time, y_efficiency))

((5000, 5), (5000,), (5000,))

In [41]:
# Hold out 20% of the rows for evaluating the cleaning-time model; the seed is fixed for repeatable splits.
X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(
    X,
    y_time,
    test_size=0.2,
    random_state=42,
)

tuple(part.shape for part in (X_train_time, X_test_time, y_train_time, y_test_time))

((4000, 5), (1000, 5), (4000,), (1000,))

In [42]:
# Hold out 20% of the rows for evaluating the cleaning-efficiency model; the seed is fixed for repeatable splits.
X_train_efficiency, X_test_efficiency, y_train_efficiency, y_test_efficiency = train_test_split(
    X,
    y_efficiency,
    test_size=0.2,
    random_state=42,
)

tuple(
    part.shape
    for part in (X_train_efficiency, X_test_efficiency, y_train_efficiency, y_test_efficiency)
)

((4000, 5), (1000, 5), (4000,), (1000,))

In [43]:
def data_cleaning(data):
    """Return a new frame holding only the model features, with categorical dtypes set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``surface_type`` and ``dirtiness_level``.
        ``household_id`` and ``date_time`` are removed when present and
        silently skipped when absent.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the identifier and
        timestamp columns, in which ``surface_type`` and ``dirtiness_level``
        have the ``category`` dtype.

    Raises
    ------
    KeyError
        If ``surface_type`` or ``dirtiness_level`` is missing from ``data``.
    """
    non_feature_columns = ['household_id', 'date_time']
    categorical_dtypes = {'surface_type': 'category', 'dirtiness_level': 'category'}

    # drop() already returns a new DataFrame, so the caller's frame is left untouched.
    features = data.drop(columns=non_feature_columns, errors='ignore')
    return features.astype(categorical_dtypes)

In [44]:
# Column-wise preprocessing:
#   surface_type    -> dense one-hot columns
#   room_size       -> StandardScaler
#   dirtiness_level -> MinMaxScaler
# Any column not listed here is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('onehot', OneHotEncoder(sparse_output=False), ['surface_type']),
        ('standardscaler', StandardScaler(), ['room_size']),
        ('minmaxscaler', MinMaxScaler(), ['dirtiness_level']),
    ],
    remainder='passthrough',
    verbose=True,
)

In [45]:
# Cleaning-time model: clean the raw frame, preprocess its columns, then fit a linear regression.
# The preprocessor is cloned because Pipeline stores the estimator object it is given rather
# than a copy; handing the same `preprocessor` instance to more than one pipeline would let
# each fit overwrite the transformer state the other pipeline relies on.
cleaning_time_pipeline = Pipeline(
    steps=[
        ('cleanor', FunctionTransformer(func=data_cleaning)),
        ('preprocessor', clone(preprocessor)),
        ('regressor', LinearRegression())
    ],
    verbose=True)

In [46]:
# Cleaning-efficiency model: clean the raw frame, preprocess its columns, then fit a linear regression.
# The preprocessor is cloned because Pipeline stores the estimator object it is given rather
# than a copy; handing the same `preprocessor` instance to more than one pipeline would let
# each fit overwrite the transformer state the other pipeline relies on.
cleaning_efficiency_pipeline = Pipeline(
    steps=[
        ('cleanor', FunctionTransformer(func=data_cleaning)),
        ('preprocessor', clone(preprocessor)),
        ('regressor', LinearRegression())
    ],
    verbose=True)

In [47]:
# Fit every step of the cleaning-time pipeline on the training split.
cleaning_time_pipeline.fit(X=X_train_time, y=y_train_time)

[Pipeline] ........... (step 1 of 3) Processing cleanor, total=   0.0s
[ColumnTransformer] ........ (1 of 3) Processing onehot, total=   0.0s
[ColumnTransformer]  (2 of 3) Processing standardscaler, total=   0.0s
[ColumnTransformer] .. (3 of 3) Processing minmaxscaler, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.0s


In [48]:
# Fit every step of the cleaning-efficiency pipeline on the training split.
cleaning_efficiency_pipeline.fit(X=X_train_efficiency, y=y_train_efficiency)

[Pipeline] ........... (step 1 of 3) Processing cleanor, total=   0.0s
[ColumnTransformer] ........ (1 of 3) Processing onehot, total=   0.0s
[ColumnTransformer]  (2 of 3) Processing standardscaler, total=   0.0s
[ColumnTransformer] .. (3 of 3) Processing minmaxscaler, total=   0.0s
[Pipeline] ...... (step 2 of 3) Processing preprocessor, total=   0.0s
[Pipeline] ......... (step 3 of 3) Processing regressor, total=   0.0s


In [49]:
# Score of the cleaning-time model on the rows it was trained on.
cleaning_time_pipeline.score(X=X_train_time, y=y_train_time)

0.7014655190183863

In [50]:
# Score of the cleaning-time model on the held-out rows.
cleaning_time_pipeline.score(X=X_test_time, y=y_test_time)

0.6976321315020912

In [51]:
# Score of the cleaning-efficiency model on the rows it was trained on.
cleaning_efficiency_pipeline.score(X=X_train_efficiency, y=y_train_efficiency)

0.7069236523342408

In [52]:
# Score of the cleaning-efficiency model on the held-out rows.
cleaning_efficiency_pipeline.score(X=X_test_efficiency, y=y_test_efficiency)

0.7179064733592557

In [53]:
# Persist the fitted cleaning-time pipeline to disk with dill.
with open('ct_lr_pipeline.dill.pkl', mode='wb') as pipeline_file:
    dump(cleaning_time_pipeline, pipeline_file)

In [54]:
# Persist the fitted cleaning-efficiency pipeline to disk with dill.
with open('ce_lr_pipeline.dill.pkl', mode='wb') as pipeline_file:
    dump(cleaning_efficiency_pipeline, pipeline_file)

**How to load the saved pipelines** (requires `from dill import load`):

```python
with open('ct_lr_pipeline.dill.pkl','rb') as file:
    ct_pipeline = load(file)
```

```python
with open('ce_lr_pipeline.dill.pkl','rb') as file:
    ce_pipeline = load(file)
```