In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Define the file path to the dataset
file_path = r'C:\Users\Hiremath\OneDrive\Desktop\raw.githubusercontent.com_dsrscientist_dataset3_main_weatherAUS.csv'

# Load the dataset into a Pandas DataFrame
data = pd.read_csv(file_path)

# Data Preprocessing
# Drop rows with missing values in the target variables
data.dropna(subset=['RainTomorrow', 'Rainfall'], inplace=True)

# Drop columns that are not needed for modeling
data.drop(['Date', 'Location'], axis=1, inplace=True)

# Encode categorical variables
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Split the data into features (X) and the target variables (y_class for classification, y_reg for regression)
X = data.drop(['RainTomorrow', 'Rainfall'], axis=1)
y_class = data['RainTomorrow']
y_reg = data['Rainfall']

# Split the data into training and testing sets for both tasks
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(
    X, y_class, y_reg, test_size=0.2, random_state=42
)

# Impute missing values with the mean for numerical features
numerical_features = X_train.select_dtypes(include=['float64']).columns
imputer = SimpleImputer(strategy='mean')
X_train[numerical_features] = imputer.fit_transform(X_train[numerical_features])
X_test[numerical_features] = imputer.transform(X_test[numerical_features])

# Standardize the features for regression task
scaler = StandardScaler()
X_reg_train = scaler.fit_transform(X_train)
X_reg_test = scaler.transform(X_test)

# Classification Task: Predicting whether it will rain tomorrow
# Initialize a Random Forest classifier with fewer hyperparameter combinations
classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Fit the classifier
classifier.fit(X_train, y_class_train)

# Make predictions on the test set for classification
y_class_pred = classifier.predict(X_test)

# Evaluate the classification model
accuracy_class = accuracy_score(y_class_test, y_class_pred)
print("Classification Accuracy:", accuracy_class)
print("Classification Report:")
print(classification_report(y_class_test, y_class_pred))

# Regression Task: Predicting how much rainfall there will be
# Initialize a Random Forest regressor with fewer hyperparameter combinations
regressor = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Fit the regressor
regressor.fit(X_reg_train, y_reg_train)

# Make predictions on the test set for regression
y_reg_pred = regressor.predict(X_reg_test)

# Evaluate the regression model
mse = mean_squared_error(y_reg_test, y_reg_pred)
r2 = r2_score(y_reg_test, y_reg_pred)
print("\nRegression Mean Squared Error:", mse)
print("R-squared Score:", r2)


Classification Accuracy: 0.8935643564356436
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.93      1263
           1       0.88      0.60      0.71       353

    accuracy                           0.89      1616
   macro avg       0.89      0.79      0.82      1616
weighted avg       0.89      0.89      0.89      1616


Regression Mean Squared Error: 113.65808760321806
R-squared Score: 0.2809444938324135
