# Data Mining and Discovery <br>
## Report Assignment
**Nazmul Hossain** | **23015862**

## Import necessary libraries

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.preprocessing import StandardScaler

## Choose and load a dataset from `datasets.txt`

Air Quality Data Set (UCI), regression - https://archive.ics.uci.edu/ml/datasets/Air+Quality

In [7]:
air_quality_df = pd.read_csv('datasets/AirQualityUCI.csv', sep=';')
air_quality_df.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,7578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,7255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,7502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,7867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,7888,,


## Dataset preprocessing

In [8]:
air_quality_df.shape

(9471, 17)

In [9]:
air_quality_df.columns

Index(['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)',
       'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)',
       'PT08.S5(O3)', 'T', 'RH', 'AH', 'Unnamed: 15', 'Unnamed: 16'],
      dtype='object')

In [10]:
air_quality_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           9357 non-null   object 
 1   Time           9357 non-null   object 
 2   CO(GT)         9357 non-null   object 
 3   PT08.S1(CO)    9357 non-null   float64
 4   NMHC(GT)       9357 non-null   float64
 5   C6H6(GT)       9357 non-null   object 
 6   PT08.S2(NMHC)  9357 non-null   float64
 7   NOx(GT)        9357 non-null   float64
 8   PT08.S3(NOx)   9357 non-null   float64
 9   NO2(GT)        9357 non-null   float64
 10  PT08.S4(NO2)   9357 non-null   float64
 11  PT08.S5(O3)    9357 non-null   float64
 12  T              9357 non-null   object 
 13  RH             9357 non-null   object 
 14  AH             9357 non-null   object 
 15  Unnamed: 15    0 non-null      float64
 16  Unnamed: 16    0 non-null      float64
dtypes: float64(10), object(7)
memory usage: 1.2+ MB


In [11]:
air_quality_df.describe()

Unnamed: 0,PT08.S1(CO),NMHC(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),Unnamed: 15,Unnamed: 16
count,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,0.0,0.0
mean,1048.990061,-159.090093,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032,,
std,329.83271,139.789093,342.333252,257.433866,321.993552,126.940455,467.210125,456.938184,,
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,,
25%,921.0,-200.0,711.0,50.0,637.0,53.0,1185.0,700.0,,
50%,1053.0,-200.0,895.0,141.0,794.0,96.0,1446.0,942.0,,
75%,1221.0,-200.0,1105.0,284.0,960.0,133.0,1662.0,1255.0,,
max,2040.0,1189.0,2214.0,1479.0,2683.0,340.0,2775.0,2523.0,,


In [12]:
air_quality_df.isnull().sum()

Unnamed: 0,0
Date,114
Time,114
CO(GT),114
PT08.S1(CO),114
NMHC(GT),114
C6H6(GT),114
PT08.S2(NMHC),114
NOx(GT),114
PT08.S3(NOx),114
NO2(GT),114


### Handling missing values

In [13]:
# Drop unnecessary columns
air_quality_df = air_quality_df.drop(columns=['Unnamed: 15', 'Unnamed: 16'])

In [15]:
# Fill missing numeric values with the column mean
air_quality_df = air_quality_df.fillna(air_quality_df.mean())

TypeError: can only concatenate str (not "int") to str

In [16]:
# Drop rows with missing values in key columns
air_quality_df = air_quality_df.dropna(subset=['CO(GT)', 'C6H6(GT)', 'T', 'RH'])

### Convert Numeric Columns

In [17]:
# Convert numeric columns with commas to floats
cols_to_convert = ['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']
for col in cols_to_convert:
    air_quality_df[col] = air_quality_df[col].str.replace(',', '.').astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  air_quality_df[col] = air_quality_df[col].str.replace(',', '.').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  air_quality_df[col] = air_quality_df[col].str.replace(',', '.').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  air_quality_df[col] = air_quality_df[col].str.rep

In [18]:
# Check data types and null values
print(air_quality_df.info())
print(air_quality_df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           9357 non-null   object 
 1   Time           9357 non-null   object 
 2   CO(GT)         9357 non-null   float64
 3   PT08.S1(CO)    9357 non-null   float64
 4   NMHC(GT)       9357 non-null   float64
 5   C6H6(GT)       9357 non-null   float64
 6   PT08.S2(NMHC)  9357 non-null   float64
 7   NOx(GT)        9357 non-null   float64
 8   PT08.S3(NOx)   9357 non-null   float64
 9   NO2(GT)        9357 non-null   float64
 10  PT08.S4(NO2)   9357 non-null   float64
 11  PT08.S5(O3)    9357 non-null   float64
 12  T              9357 non-null   float64
 13  RH             9357 non-null   float64
 14  AH             9357 non-null   float64
dtypes: float64(13), object(2)
memory usage: 1.1+ MB
None
Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)        

## Understand and apply subtopic 1 (Linear Regression) to the dataset

  ### Define target and feature variables
Target variable `y`: `CO(GT)` (Carbon Monoxide levels)

In [19]:
# Features (X) and target variable (y)
X = air_quality_df.drop(columns=['CO(GT)', 'Date', 'Time'], axis=True)
y = air_quality_df['CO(GT)']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (7485, 12)
Testing set shape: (1872, 12)


### `LinearRegression` Model Train and Evaluate

In [40]:
# Standardize the feature data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add polynomial features (degree=3)
poly = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Initialize and train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train_poly, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test_poly)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

Mean Squared Error (MSE): 2317.066565261518
R² Score: 0.6078166593017627


### `Ridge` Regression model training

In [31]:
ridge_model = Ridge(alpha=0.5)
ridge_model.fit(X_train_poly, y_train)
y_pred_ridge = ridge_model.predict(X_test_poly)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression MSE:", mse_ridge)
print("Ridge Regression R² Score:", r2_ridge)

Ridge Regression MSE: 2368.1754784521363
Ridge Regression R² Score: 0.5991660384628699


### `Lasso` Regression model training

In [35]:
lasso_model = Lasso(alpha=0.2)
lasso_model.fit(X_train_poly, y_train)
y_pred_lasso = lasso_model.predict(X_test_poly)

mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print("Lasso Regression MSE:", mse_lasso)
print("Lasso Regression R² Score:", r2_lasso)

Lasso Regression MSE: 2475.77054226085
Lasso Regression R² Score: 0.5809546533435224


  model = cd_fast.enet_coordinate_descent(


### `RandomForestRegressor` model training

In [41]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_poly, y_train)
y_pred_rf = rf_model.predict(X_test_poly)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest MSE:", mse_rf)
print("Random Forest R² Score:", r2_rf)

Random Forest MSE: 2064.590867879333
Random Forest R² Score: 0.6505503312337501


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_poly, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)


## Apply subtopic 2 (Classification)

### Binarize the target variable
Low CO vs High CO:
- 1 if CO >= 2
- 0 otherwise

In [22]:
y_class = (y >= 2).astype(int)

# Split the data
X_train, X_test, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.2, random_state=42)

### `LogisticRegression` model training

In [23]:
# Train a Logistic Regression model
clf = LogisticRegression(max_iter=5000)
clf.fit(X_train, y_train_class)

# Predict on test set
y_pred_class = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test_class, y_pred_class)
print("Classification Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test_class, y_pred_class))

Classification Accuracy: 0.9316239316239316

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.95      1166
           1       0.91      0.91      0.91       706

    accuracy                           0.93      1872
   macro avg       0.93      0.93      0.93      1872
weighted avg       0.93      0.93      0.93      1872



## Compare and contrast both results (subtopic 1 vs subtopic 2)