In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

In [2]:
# Location of the raw weatherAUS CSV (hosted on GitHub).
url = "https://raw.githubusercontent.com/metropolisjenensis/data_for_prob_ML/refs/heads/main/weatherAUS.csv"

# Read the whole file into the raw frame; the cleaning cells derive `df_clean` from it.
df = pd.read_csv(filepath_or_buffer=url, encoding="utf-8")

In [3]:
# Overview of the raw frame: per-column dtype and non-null count,
# which shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
# Summary statistics for Sunshine; its `count` is well below the number of
# rows in the frame, i.e. a large share of the values is missing.
df["Sunshine"].describe()

count    75625.000000
mean         7.611178
std          3.785483
min          0.000000
25%          4.800000
50%          8.400000
75%         10.600000
max         14.500000
Name: Sunshine, dtype: float64

In [5]:
# Keep only the columns whose fraction of missing values is strictly below the cutoff.
threshold = 0.3  # maximum tolerated share of missing values per column (30%)
missing_share = df.isna().mean()  # per-column fraction of missing entries
df_clean = df.loc[:, missing_share < threshold]

In [6]:
# Rows without a RainTomorrow label cannot be used for supervised learning; discard them.
has_target = df_clean["RainTomorrow"].notna()
df_clean = df_clean.loc[has_target]

In [7]:
# Binary-encode the label: "No" -> 0, "Yes" -> 1.
# Gotcha: `map` turns any value that is not a key of the dict into NaN, so
# re-running this cell on an already-encoded column would wipe the labels.
target_codes = {"No": 0, "Yes": 1}
df_clean = df_clean.assign(RainTomorrow=df_clean["RainTomorrow"].map(target_codes))


In [8]:
# The calendar date is not used as a feature in this task, so remove the column.
df_clean = df_clean.drop("Date", axis=1)



In [9]:
# Fill missing numerical values with the column median.
#
# The previous form, `df_clean[col].fillna(..., inplace=True)`, mutates a
# temporary Series obtained by chained indexing. Recent pandas versions warn
# about it, and under Copy-on-Write it leaves `df_clean` unchanged, so NaNs
# would silently survive into the models. Passing a Series of per-column
# medians to `DataFrame.fillna` fills only those columns and returns a new frame.
#
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed in a later cell.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
column_medians = df_clean[numeric_cols].median()
df_clean = df_clean.fillna(column_medians)



In [10]:
# Replace every object-typed column with integer codes.
# Each column gets its own LabelEncoder; values are cast to str first, so a
# missing entry is encoded as its own "nan" category rather than dropped.
cat_cols = df_clean.select_dtypes(include=["object"]).columns
label_codes = {
    name: LabelEncoder().fit_transform(df_clean[name].astype(str))
    for name in cat_cols
}
df_clean = df_clean.assign(**label_codes)



In [11]:
# Separate the label column (y) from the predictor columns (X).
target_col = "RainTomorrow"
y = df_clean[target_col]
X = df_clean.drop(target_col, axis=1)


In [12]:

# Train-test split, then standardize features for logistic regression and autoencoder.
#
# The split is done BEFORE fitting the scaler: fitting StandardScaler on all
# rows would let the test set's mean/variance leak into the training features.
# The split indices depend only on `y`, the sample count and `random_state`,
# so the same rows land in train/test as when splitting pre-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # test rows reuse the training statistics

# Kept so that any later cell referring to `X_scaled` still works; it is the
# full feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

X_train.shape, X_test.shape

((113754, 17), (28439, 17))

In [13]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Train XGBoost Classifier
# `use_label_encoder` was dropped from the call: the installed XGBoost reports
# it as an unused parameter and only emits a warning for it.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

# Train Logistic Regression (max_iter raised to 1000 in the solver call)
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# Evaluate both models on the held-out test set.
# The *_report dicts are kept for later inspection; only the confusion
# matrices and accuracies are displayed by this cell.
xgb_cm = confusion_matrix(y_test, xgb_preds)
xgb_acc = accuracy_score(y_test, xgb_preds)
xgb_report = classification_report(y_test, xgb_preds, output_dict=True)

lr_cm = confusion_matrix(y_test, lr_preds)
lr_acc = accuracy_score(y_test, lr_preds)
lr_report = classification_report(y_test, lr_preds, output_dict=True)

xgb_cm, xgb_acc, lr_cm, lr_acc

Parameters: { "use_label_encoder" } are not used.



(array([[20913,  1151],
        [ 2871,  3504]], dtype=int64),
 0.8585744927740075,
 array([[20934,  1130],
        [ 3345,  3030]], dtype=int64),
 0.8426456626463659)

In [None]:

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers  # was misspelled `regularizerss`, which raises ImportError
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs.
set_random_seed(42)

# Autoencoder: one sparse bottleneck layer, linear reconstruction of the input.
input_dim = X_train.shape[1]
encoding_dim = 8

input_layer = Input(shape=(input_dim,))
# 10e-5 is the same value as 1e-4 (L1 penalty on the bottleneck activations).
encoded = Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l1(10e-5))(input_layer)
decoded = Dense(input_dim, activation='linear')(encoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train only on class-0 rows so that class-1 rows are expected to reconstruct
# poorly. np.asarray gives a positional boolean mask whether y_train is a
# pandas Series or an array.
X_train_0 = X_train[np.asarray(y_train) == 0]
autoencoder.fit(X_train_0, X_train_0, epochs=20, batch_size=64, shuffle=True, verbose=0)

# Decision threshold: 95th percentile of the reconstruction error on the
# class-0 TRAINING rows. Deriving it from `y_test` would tune the decision
# rule on the labels it is then evaluated against.
# NOTE: this rebinds the notebook-level name `threshold`, which the
# missing-value cell also assigns.
X_train_0_pred = autoencoder.predict(X_train_0, verbose=0)
train_0_error = np.mean(np.power(X_train_0 - X_train_0_pred, 2), axis=1)
threshold = np.percentile(train_0_error, 95)

# Predictions and evaluation: a test row is flagged as class 1 when its
# per-row mean squared reconstruction error exceeds the threshold.
X_test_pred = autoencoder.predict(X_test, verbose=0)
reconstruction_error = np.mean(np.power(X_test - X_test_pred, 2), axis=1)
autoencoder_preds = (reconstruction_error > threshold).astype(int)

ae_cm = confusion_matrix(y_test, autoencoder_preds)
ae_acc = accuracy_score(y_test, autoencoder_preds)
ae_report = classification_report(y_test, autoencoder_preds, output_dict=True)

ae_cm, ae_acc