In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Load data
data = pd.read_csv('C:\Aditya\Python\Khushi Project\AQI Files\IGIAirport_AQI_2018_2023.csv')

# Check for missing values and handle them
if data.isnull().any().any():
    # Option 1: Remove rows with missing values
    # data = data.dropna()

    # Option 2: Impute missing values, here using the mean of each column
    imputer = SimpleImputer(strategy='mean')
    data[['PM2.5', 'PM10', 'CO', 'NO2', 'SO2', 'Ozone']] = imputer.fit_transform(data[['PM2.5', 'PM10', 'CO', 'NO2', 'SO2', 'Ozone']])

# Feature scaling
scaler = StandardScaler()
features = scaler.fit_transform(data[['PM2.5', 'PM10', 'CO', 'NO2', 'SO2', 'Ozone']])

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(features, data['Overall AQI'], test_size=0.3, random_state=42)

# Initialize models
rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
svm_model = SVR(kernel='rbf')
knn_model = KNeighborsRegressor(n_neighbors=5)

# Voting Regressor setup
voting_regressor = VotingRegressor(estimators=[('rf', rf_model), ('svm', svm_model), ('knn', knn_model)])

# Fit models
rf_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
voting_regressor.fit(X_train, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = mse ** 0.5
    return rmse

rmse_rf = evaluate_model(rf_model, X_test, y_test)
rmse_svm = evaluate_model(svm_model, X_test, y_test)
rmse_knn = evaluate_model(knn_model, X_test, y_test)
rmse_voting = evaluate_model(voting_regressor, X_test, y_test)

# Determine the best model based on RMSE
best_model, best_rmse = min([
    ('Random Forest', rmse_rf),
    ('SVM', rmse_svm),
    ('KNN', rmse_knn),
    ('Voting Regressor', rmse_voting)
], key=lambda x: x[1])

print(f"Best Model: {best_model} with RMSE: {best_rmse}")

# Save the best model
models = {'Random Forest': rf_model, 'SVM': svm_model, 'KNN': knn_model, 'Voting Regressor': voting_regressor}
joblib.dump(models[best_model], f'{best_model}_best_model.pkl')




ValueError: Columns must be same length as key

In [27]:
#save the model
import joblib
joblib.dump(model, 'IGIAirport_model.pkl')


['IGIAirport_model.pkl']

In [28]:
#call the ML model
import joblib
model = joblib.load('C:\Aditya\Python\Khushi Project\IGIAirport_model.pkl')

In [29]:
#make prediction by takimg input from user
features = np.array([[157.1,198.5998738622873,0.18,40.68,21.96,23.370162676287308]])
scaled_features = scaler.transform(features)
prediction = model.predict(scaled_features)
print(f"Predicted AQI: {prediction[0]}")


Predicted AQI: 328.7104837209302




In [30]:
#make prediction by takimg input from user
features = np.array([[90.55,198.5998738622873,0.23,19.63,16.34,23.370162676287308]])
scaled_features = scaler.transform(features)
prediction = model.predict(scaled_features)
print(f"Predicted AQI: {prediction[0]}")


Predicted AQI: 301.0




In [None]:
#m