In [24]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [26]:
# Load the dataset from its CSV file (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="data/weatherAUS.csv")

In [27]:
# Feature columns plus the target column ('RainTomorrow').
selected_columns = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow'
]

# Rows without a target label cannot be used for supervised training, so they
# are dropped before encoding; otherwise a missing value would be passed to the
# label encoder as if it were a label of its own. `.copy()` yields an
# independent frame so the column assignment below does not write into a slice
# of the original frame (pandas' SettingWithCopyWarning).
data = data[selected_columns].dropna(subset=['RainTomorrow']).copy()

# Encode the target labels as integers; the fitted encoder is kept so the
# integer codes can be mapped back to the original labels later.
label_encoder = LabelEncoder()
data['RainTomorrow'] = label_encoder.fit_transform(data['RainTomorrow'])

# Separate the features from the target.
X = data.drop(columns='RainTomorrow')
y = data['RainTomorrow']

# Hold out 30% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed, stratify=y)

# Build the preprocessing + model pipeline.

# Numerical features: fill missing values with the column mean, then standardize.
numerical_features = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'Cloud3pm', 'Temp9am', 'Temp3pm']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode (a two-category feature becomes a single 0/1 column).
categorical_features = ['RainToday']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='if_binary'))
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline: preprocessing followed by the classifier. The classifier
# reuses `seed` (defined with the train/test split) instead of repeating the
# literal, so the split and the model cannot silently drift apart.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=seed))
])

In [30]:
# Fit the whole pipeline (preprocessing + classifier) on the training split and
# predict labels for the held-out test split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the Random Forest model with pipeline: {accuracy:.2f}")

Accuracy of the Random Forest model with pipeline: 0.81


In [34]:
# Make sure the output directory exists so the dump does not fail with
# FileNotFoundError when the notebook runs in a checkout without `model/`.
Path('model').mkdir(parents=True, exist_ok=True)

# Persist the fitted pipeline (zlib compression, level 3). Kept as the last
# expression so the cell displays the value returned by `joblib.dump`.
joblib.dump(pipeline, 'model/aussie_rain_pipeline.joblib', compress=('zlib', 3))

['model/aussie_rain_pipeline.joblib']