In [63]:
# Start
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [64]:
# Loading the datasets
df_train = pd.read_csv('Weather Training Data.csv')
df_test = pd.read_csv('Weather Test Data.csv')

In [65]:
# Removing 'row ID' as it's not a feature and handling missing values
df_train.drop(columns=['row ID', 'Sunshine', 'Evaporation'], inplace=True)
df_test.drop(columns=['row ID', 'Sunshine', 'Evaporation'], inplace=True) # Assuming df_test has similar columns

# define your features and labels
X = df_train.drop('RainTomorrow', axis=1)
y = df_train['RainTomorrow']

In [66]:
numeric_features = df_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df_train.select_dtypes(include=['object']).columns.difference(['RainTomorrow']).tolist()

# Creating preprocessing pipelines for both numerical and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [67]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop('RainTomorrow', axis=1), df_train['RainTomorrow'], test_size=0.2, random_state=42)

In [68]:
# Joining features and target to ensure the indices align for resampling
train_data = pd.concat([X_train, y_train], axis=1)

# Separating the minority and majority classes
majority_class = train_data[train_data.RainTomorrow == 0]
minority_class = train_data[train_data.RainTomorrow == 1]

# Upsampling the minority class
minority_class_upsampled = resample(minority_class,
                                    replace=True,
                                    n_samples=len(majority_class),
                                    random_state=123)

# Combine the majority class with the upsampled minority class
train_data_upsampled = pd.concat([majority_class, minority_class_upsampled])

In [69]:
# Splitting back into features and target
X_train_balanced = train_data_upsampled.drop('RainTomorrow', axis=1)
y_train_balanced = train_data_upsampled['RainTomorrow']

# Creating the Random Forest pipeline with the preprocessor and classifier
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [70]:
# Hyperparameter grid for Random Forest (simplified for illustration)
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
}

In [71]:
# Grid search with cross-validation for RandomForestClassifier
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_balanced, y_train_balanced)

# Training and evaluating the best Random Forest model from grid search
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print(f"Best Random Forest parameters: {grid_search_rf.best_params_}")
print(f"Random Forest Test Accuracy: {accuracy_score(y_test, y_pred_rf)}")

# Detailed classification report for the Random Forest model
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Creating the Logistic Regression pipeline with the preprocessor and classifier
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])
# Training and evaluating the Logistic Regression model
pipeline_lr.fit(X_train_balanced, y_train_balanced)
y_pred_lr = pipeline_lr.predict(X_test)
print(f"Logistic Regression Test Accuracy: {accuracy_score(y_test, y_pred_lr)}")
# Detailed classification report for the Logistic Regression model
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mazen\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'RainTomorrow'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\utils\__init__.py", line 447, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 'RainTomorrow'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\compose\_column_transformer.py", line 740, in fit_transform
    self._validate_column_callables(X)
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\compose\_column_transformer.py", line 448, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mazen\anaconda3\Lib\site-packages\sklearn\utils\__init__.py", line 455, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


In [58]:
# Detailed classification report for the Logistic Regression model
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

ValueError: A given column is not a column of the dataframe

In [59]:
# Predictions and evaluation
y_pred_rf = model_rf.predict(X_test)
y_pred_lr = model_lr.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'