In [1]:
import pandas as pd

# Location of the air-filter CSV under the Kaggle input directory.
DATA = '/kaggle/input/air-filter-data/air_filter_data.csv'

# Read the CSV, parsing the `timestamp` column as datetimes at load time.
df = pd.read_csv(DATA, parse_dates=['timestamp'])

# Preview the first five rows as a quick sanity check.
df.head(n=5)

Unnamed: 0,timestamp,filter_name,location,filter_class,filter_age_days,load_factor,pressure_drop_pa,efficiency,inlet_pm25,outlet_pm25,inlet_pm10,outlet_pm10,replacement_needed,hour
0,2022-08-28 03:00:00,Electrostatic_Filter_Mall,Shopping Mall,Electrostatic,23.54,0.408784,60.7,0.87,47.7,6.2,88.3,11.4,0,3
1,2023-06-08 17:00:00,HEPA_Filter_Hospital,Hospital Ventilation,HEPA,5.46,0.309243,95.0,0.981,71.3,1.4,75.8,2.0,0,17
2,2022-09-14 05:00:00,HEPA_Filter_Hospital,Hospital Ventilation,HEPA,23.5,0.312394,114.9,0.962,67.4,2.6,98.9,3.8,0,5
3,2021-01-16 09:00:00,Electrostatic_Filter_Mall,Shopping Mall,Electrostatic,4.04,0.42089,25.1,0.893,69.2,7.4,57.8,6.2,0,9
4,2021-06-01 14:00:00,HEPA_Filter_Hospital,Hospital Ventilation,HEPA,1.88,0.2,83.7,0.992,45.3,1.0,66.4,2.0,0,14


In [2]:
# Count distinct values per column, laid out as a single wide row for easy scanning.
df.nunique().to_frame().transpose()

Unnamed: 0,timestamp,filter_name,location,filter_class,filter_age_days,load_factor,pressure_drop_pa,efficiency,inlet_pm25,outlet_pm25,inlet_pm10,outlet_pm10,replacement_needed,hour
0,21915,5,5,4,1656,20191,1598,159,581,146,766,185,2,24


In [3]:
# Seed reused by the stochastic steps below (negative sampling, t-SNE, train/test split).
RANDOM_STATE = 2025

# Name of the binary label column.
TARGET = 'replacement_needed'

# Feature columns: every float64 column of the loaded frame, in original column order.
COLUMNS = df.select_dtypes(include=['float64']).columns.tolist()

Is our target class balanced?

In [4]:
# Tally rows per class and show the result as a plain dict to gauge label imbalance.
target_counts = df[TARGET].value_counts()
target_counts.to_dict()

{0: 21887, 1: 28}

No. Only 28 of the 21,915 rows (about 0.13%) are filters that need to be replaced. Let's work with a small part of our dataset: every positive plus a random subset of the negatives.

In [5]:
# Number of negatives kept for each positive when undersampling the majority class.
NEGATIVES_PER_POSITIVE = 40

# Keep every positive row.
target_1_df = df[df[TARGET] == 1]

# Draw a reproducible random subset of the negatives. The requested size is
# capped at the number of negatives available, because DataFrame.sample
# (without replacement) raises ValueError when asked for more rows than exist.
negatives_df = df[df[TARGET] == 0]
target_0_df = negatives_df.sample(
    n=min(NEGATIVES_PER_POSITIVE * len(target_1_df), len(negatives_df)),
    random_state=RANDOM_STATE,
)

# Stack the sampled negatives and all positives row-wise; original row labels are kept.
sample_df = pd.concat(objs=[target_0_df, target_1_df], axis='index')

Our sample now contains every positive plus a random subset of the negatives. Let's use dimensionality reduction and a scatter plot to see whether there's a signal in our data.

In [6]:
from sklearn.manifold import TSNE

# Embed the feature columns with t-SNE (seeded for reproducibility) and keep
# the two output coordinates as plotting columns `x` and `y`.
reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(X=sample_df[COLUMNS])
plot_df = pd.DataFrame(data=embedding, columns=['x', 'y'])

# Attach the label as a boolean. The values are passed as a plain list so they
# are assigned by position: plot_df has a fresh default index, whereas
# sample_df still carries the row labels of the original frame.
plot_df[TARGET] = sample_df[TARGET].eq(1).tolist()

In [7]:
from plotly import express
from plotly.offline import init_notebook_mode

# Initialise plotly's offline notebook mode before rendering any figure.
init_notebook_mode(connected=True)

# Scatter the 2-D embedding, coloured by whether the filter needs replacing.
fig = express.scatter(data_frame=plot_df, x='x', y='y', color=TARGET)
fig.show(renderer='iframe_connected')

Surprisingly, the t-SNE embedding separates filters that need to be replaced from those that don't fairly cleanly. Let's build a model.

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report

# Hold out 20% of the undersampled frame, stratified on the label so that the
# train and test splits keep the same class ratio.
features = sample_df[COLUMNS]
labels = sample_df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=labels,
)

# Fit quadratic discriminant analysis on the training split, then predict the hold-out.
qda = QuadraticDiscriminantAnalysis()
qda.fit(X=X_train, y=y_train)
y_pred = qda.predict(X=X_test)

# NOTE(review): the hold-out is drawn from the undersampled frame, so these
# metrics reflect its class ratio rather than the full dataset's — confirm on
# the full data before relying on them.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       224
           1       1.00      1.00      1.00         6

    accuracy                           1.00       230
   macro avg       1.00      1.00      1.00       230
weighted avg       1.00      1.00      1.00       230

