In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np

In [73]:
dataset = pd.read_csv('./rsc/AirQualityUCI.csv', sep=';')
print(dataset.info())
print(dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           9357 non-null   object 
 1   Time           9357 non-null   object 
 2   CO(GT)         9357 non-null   object 
 3   PT08.S1(CO)    9357 non-null   float64
 4   NMHC(GT)       9357 non-null   float64
 5   C6H6(GT)       9357 non-null   object 
 6   PT08.S2(NMHC)  9357 non-null   float64
 7   NOx(GT)        9357 non-null   float64
 8   PT08.S3(NOx)   9357 non-null   float64
 9   NO2(GT)        9357 non-null   float64
 10  PT08.S4(NO2)   9357 non-null   float64
 11  PT08.S5(O3)    9357 non-null   float64
 12  T              9357 non-null   object 
 13  RH             9357 non-null   object 
 14  AH             9357 non-null   object 
 15  Unnamed: 15    0 non-null      float64
 16  Unnamed: 16    0 non-null      float64
dtypes: float64(10), object(7)
memory usage: 1.2+ MB
None

In [74]:
# Select only Date, Time, NO2 columns
dataset = dataset[['Date', 'Time', 'NO2(GT)']].dropna()
print(dataset.info())
print(dataset)

<class 'pandas.core.frame.DataFrame'>
Index: 9357 entries, 0 to 9356
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     9357 non-null   object 
 1   Time     9357 non-null   object 
 2   NO2(GT)  9357 non-null   float64
dtypes: float64(1), object(2)
memory usage: 292.4+ KB
None
            Date      Time  NO2(GT)
0     10/03/2004  18.00.00    113.0
1     10/03/2004  19.00.00     92.0
2     10/03/2004  20.00.00    114.0
3     10/03/2004  21.00.00    122.0
4     10/03/2004  22.00.00    116.0
...          ...       ...      ...
9352  04/04/2005  10.00.00    190.0
9353  04/04/2005  11.00.00    179.0
9354  04/04/2005  12.00.00    175.0
9355  04/04/2005  13.00.00    156.0
9356  04/04/2005  14.00.00    168.0

[9357 rows x 3 columns]


In [75]:
# Create a unified Date column of type Date - specificando il formato
dataset['Date'] = pd.to_datetime(
    dataset['Date'] + ' ' + dataset['Time'], 
    format='%d/%m/%Y %H.%M.%S'
)
dataset = dataset[['Date', 'NO2(GT)']]
print(dataset)

                    Date  NO2(GT)
0    2004-03-10 18:00:00    113.0
1    2004-03-10 19:00:00     92.0
2    2004-03-10 20:00:00    114.0
3    2004-03-10 21:00:00    122.0
4    2004-03-10 22:00:00    116.0
...                  ...      ...
9352 2005-04-04 10:00:00    190.0
9353 2005-04-04 11:00:00    179.0
9354 2005-04-04 12:00:00    175.0
9355 2005-04-04 13:00:00    156.0
9356 2005-04-04 14:00:00    168.0

[9357 rows x 2 columns]


In [76]:
dataset['Global_Average'] = dataset['NO2(GT)'].mean()
dataset['Daily_Average'] = dataset.groupby(dataset['Date'].dt.date)['NO2(GT)'].transform('mean')
dataset['Weekly_Average'] = dataset.groupby(pd.Grouper(key='Date', freq='W'))['NO2(GT)'].transform('mean')

# Classification based on different averages
dataset['Quality_vs_Global'] = dataset['NO2(GT)'].apply(
    lambda x: 'good' if x <= dataset['Global_Average'].iloc[0] else 'poor'
)

dataset['Quality_vs_Daily'] = dataset.apply(
    lambda row: 'good' if row['NO2(GT)'] <= row['Daily_Average'] else 'poor', 
    axis=1
)

dataset['Quality_vs_Weekly'] = dataset.apply(
    lambda row: 'good' if row['NO2(GT)'] <= row['Weekly_Average'] else 'poor', 
    axis=1
)

print(dataset)

                    Date  NO2(GT)  Global_Average  Daily_Average  \
0    2004-03-10 18:00:00    113.0       58.148873     108.833333   
1    2004-03-10 19:00:00     92.0       58.148873     108.833333   
2    2004-03-10 20:00:00    114.0       58.148873     108.833333   
3    2004-03-10 21:00:00    122.0       58.148873     108.833333   
4    2004-03-10 22:00:00    116.0       58.148873     108.833333   
...                  ...      ...             ...            ...   
9352 2005-04-04 10:00:00    190.0       58.148873     122.000000   
9353 2005-04-04 11:00:00    179.0       58.148873     122.000000   
9354 2005-04-04 12:00:00    175.0       58.148873     122.000000   
9355 2005-04-04 13:00:00    156.0       58.148873     122.000000   
9356 2005-04-04 14:00:00    168.0       58.148873     122.000000   

      Weekly_Average Quality_vs_Global Quality_vs_Daily Quality_vs_Weekly  
0          95.892157              poor             poor              poor  
1          95.892157           

In [77]:
# Classification statistics
print(dataset['Quality_vs_Global'].value_counts(), end='\n\n')

print(dataset['Quality_vs_Daily'].value_counts(), end='\n\n')

print(dataset['Quality_vs_Weekly'].value_counts(), end='\n\n')

Quality_vs_Global
poor    6771
good    2586
Name: count, dtype: int64

Quality_vs_Daily
poor    5233
good    4124
Name: count, dtype: int64

Quality_vs_Weekly
poor    5415
good    3942
Name: count, dtype: int64



# Dataset splitting

In [None]:
# Preparazione delle feature
# Estraiamo componenti temporali dalla data
dataset['Hour'] = dataset['Date'].dt.hour
dataset['DayOfWeek'] = dataset['Date'].dt.dayofweek
dataset['Month'] = dataset['Date'].dt.month

# Features per il modello
X = dataset[['Hour', 'DayOfWeek', 'Month', 'NO2(GT)']]
# X = dataset[['Date', 'NO2(GT)']]
y = dataset['Quality_vs_Weekly']

# Encoding della variabile target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split dei dati
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 7485
Test set size: 1872


# DT

In [None]:
# Creazione e addestramento del modello Decision Tree
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

dt_model.fit(X_train, y_train)

# Predizioni e valutazione
dt_pred = dt_model.predict(X_test)
print(classification_report(y_test, dt_pred, target_names=le.classes_))

# Logistic Regression

In [79]:
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)
print(classification_report(y_test, lr_pred, target_names=le.classes_))

              precision    recall  f1-score   support

        good       0.84      0.73      0.78       789
        poor       0.82      0.90      0.86      1083

    accuracy                           0.82      1872
   macro avg       0.83      0.81      0.82      1872
weighted avg       0.83      0.82      0.82      1872



# Random Forest

In [80]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)
print(classification_report(y_test, rf_pred, target_names=le.classes_))

              precision    recall  f1-score   support

        good       0.86      0.86      0.86       789
        poor       0.90      0.90      0.90      1083

    accuracy                           0.88      1872
   macro avg       0.88      0.88      0.88      1872
weighted avg       0.88      0.88      0.88      1872



# MLP

In [81]:
# Standardizzazione delle feature (importante per MLP)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creazione e addestramento del modello MLP
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # Due layer nascosti con 100 e 50 neuroni
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=42
)

mlp_model.fit(X_train_scaled, y_train)

# Predizioni e valutazione
mlp_pred = mlp_model.predict(X_test_scaled)
print(classification_report(y_test, mlp_pred, target_names=le.classes_))

              precision    recall  f1-score   support

        good       0.86      0.89      0.87       789
        poor       0.92      0.89      0.91      1083

    accuracy                           0.89      1872
   macro avg       0.89      0.89      0.89      1872
weighted avg       0.89      0.89      0.89      1872

