In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read the air-quality measurements from the comma-separated file.
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
df = pd.read_csv('AirQualityUCI.csv', sep=',', encoding='utf-8-sig')

# Remove every column whose header starts with "Unnamed"
df = df.drop(columns=df.columns[df.columns.str.contains('^Unnamed')])

# a. Data Cleaning
# Turn the -200 sentinel into NaN, then discard rows whose target 'CO(GT)'
# is missing (a row without a target cannot be used for training or scoring).
df = (
    df.replace(-200, np.nan)
      .dropna(subset=['CO(GT)'])
)

# Impute the remaining gaps in numeric columns with each column's mean
df = df.fillna(df.mean(numeric_only=True))

# Merge 'Date' and 'Time' into a single day-first 'Datetime' column and
# drop the two source columns
df = df.assign(
    Datetime=pd.to_datetime(df['Date'] + ' ' + df['Time'], dayfirst=True)
).drop(columns=['Date', 'Time'])

# b. Data Integration
# Everything already lives in one file, so the only step is to index the
# rows by their 'Datetime' value.
df = df.set_index('Datetime')

# c. Data Transformation
# Min-max normalize all remaining columns (features and target) for model input.
from sklearn.preprocessing import MinMaxScaler

# Choose which rows are train and which are test BEFORE fitting the scaler.
# Fitting the scaler on the full frame would let the test rows' minima and
# maxima influence how the training data is transformed (data leakage),
# making the reported error optimistic.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
scaled_columns = df.columns
# Learn each column's min/max from the training rows only ...
scaler.fit(df.iloc[train_pos][scaled_columns])
# ... then apply that same transformation to every row. Test rows may fall
# slightly outside [0, 1] when they exceed the range seen in training.
df[scaled_columns] = scaler.transform(df[scaled_columns])

# d. Data Model Building
# Predict 'CO(GT)' based on other features
X = df.drop('CO(GT)', axis=1)
y = df['CO(GT)']

# Train-test split: reuse the row positions chosen above so the scaler and
# the model are fitted on exactly the same training rows.
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate.
# NOTE: the target was scaled too, so this MSE is in scaled units, not in the
# original units of 'CO(GT)'.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.4f}')

# e. Error Correcting (basic)
# Clip predictions to 0–1, the range the scaled target spans in the training rows
y_pred_corrected = np.clip(y_pred, 0, 1)

# You could also reverse normalization for real-world interpretation if needed


Mean Squared Error: 0.0016
