In [10]:
import pandas as pd

# Load the dataset and add some attributes because of empty rows and columns
data = pd.read_excel('australia_weather.xlsx', skiprows=9, usecols='C:Z')

# Clean row-and-column structure to make it readable and accessible
data.columns = ['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 
                'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir_9am', 'WindDir_3pm', 
                'WindSpeed_9am', 'WindSpeed_3pm', 'Humidity_9am', 'Humidity_3pm',
                'Pressure_9am', 'Pressure_3pm', 'Cloud_9am', 'Cloud_3pm', 'Temp_9am',
                'Temp_3pm', 'RainToday', 'RISK_MM', 'RainTomorrow']

# Display the first 5 rows of the dataset to ensure it has a correct format
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir_9am,...,Humidity_3pm,Pressure_9am,Pressure_3pm,Cloud_9am,Cloud_3pm,Temp_9am,Temp_3pm,RainToday,RISK_MM,RainTomorrow
0,16/11/2008,Brisbane,21.0,31.9,0.0,6.4,9.7,SE,48.0,E,...,64.0,1012.6,1007.9,3.0,7.0,27.9,28.2,No,31.8,Yes
1,11/06/2009,Wollongong,6.6,16.0,0.0,,,SW,37.0,W,...,37.0,1020.0,1018.0,,,10.6,15.5,No,0.0,No
2,29/03/2010,BadgerysCreek,19.1,24.3,0.8,,,,,,...,,1017.5,,,,21.2,,No,7.0,Yes
3,29/03/2016,Cairns,22.9,32.6,0.0,,10.7,SE,37.0,S,...,58.0,1016.5,1013.2,1.0,2.0,28.3,31.2,No,0.0,No
4,22/08/2012,Sydney,10.6,24.7,0.0,3.8,9.1,,,WNW,...,54.0,1015.5,1010.4,6.0,6.0,16.1,20.8,No,0.0,No


In [11]:
# Exploring the dataset

# Check the shape of the dataset
print("Shape of the dataset:", data.shape)

# Check the data types of each column
print("\nData types:\n", data.dtypes)

# Check the summary statistics
print("\nSummary statistics for each column:\n", data.describe())

# Check the unique values in the 'RainToday' column
print("\nUnique values in 'RainToday':\n", data['RainToday'].unique())

# Check the unique values in the 'RainTomorrow' column
print("\nUnique values in 'RainTomorrow':\n", data['RainTomorrow'].unique())

Shape of the dataset: (156412, 24)

Data types:
 Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir_9am       object
WindDir_3pm       object
WindSpeed_9am    float64
WindSpeed_3pm    float64
Humidity_9am     float64
Humidity_3pm     float64
Pressure_9am     float64
Pressure_3pm     float64
Cloud_9am        float64
Cloud_3pm        float64
Temp_9am         float64
Temp_3pm         float64
RainToday         object
RISK_MM          float64
RainTomorrow      object
dtype: object

Summary statistics for each column:
              MinTemp        MaxTemp       Rainfall   Evaporation  \
count  155712.000000  156058.000000  154883.000000  89520.000000   
mean       12.188566      23.230174       2.351982      5.471000   
std         6.402848       7.116761       8.503384      4.177037   
min        -8.500000      -

In [12]:
# Since there are missing values (NaN) in the table it is necessary to clean
# the dataset. Resetting the indexes of the rows for better reading of the table. 
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir_9am,...,Humidity_3pm,Pressure_9am,Pressure_3pm,Cloud_9am,Cloud_3pm,Temp_9am,Temp_3pm,RainToday,RISK_MM,RainTomorrow
0,16/11/2008,Brisbane,21.0,31.9,0.0,6.4,9.7,SE,48.0,E,...,64.0,1012.6,1007.9,3.0,7.0,27.9,28.2,No,31.8,Yes
1,04/04/2009,Woomera,13.1,23.4,0.0,13.2,10.1,SSE,50.0,SSE,...,30.0,1025.9,1023.3,7.0,0.0,15.7,22.3,No,0.0,No
2,17/08/2008,Melbourne,7.8,12.9,4.2,1.4,2.8,SW,31.0,SW,...,56.0,1025.6,1025.3,5.0,7.0,8.6,11.5,Yes,0.4,No
3,17/05/2009,MelbourneAirport,9.0,16.0,0.4,2.8,0.9,SW,35.0,W,...,58.0,1021.8,1020.6,7.0,7.0,12.6,15.8,No,0.0,No
4,24/10/2014,AliceSprings,22.6,38.0,0.0,14.6,10.7,NNW,41.0,N,...,9.0,1016.3,1012.3,6.0,6.0,31.7,36.9,No,0.0,No


In [13]:
# Starting data preprocessing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Separate categorical and numerical columns
categorical_cols = []
numerical_cols = []
for col in data.columns:
    if data[col].dtype == 'object':
        categorical_cols.append(col)
    else:
        numerical_cols.append(col)

# One-hot encoding of the categorical variables. This method of encoding is chosen to avoid
# implied order or hierarchy in the dataset like in label or ordinal encodings.
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_features = encoder.fit_transform(data[categorical_cols])
encoded_df = pd.DataFrame(encoded_features)

# Retrieve feature names
feature_names = []
for i, col in enumerate(categorical_cols):
    unique_categories = data[col].unique()
    for category in unique_categories[1:]:
        feature_name = f"{col}_{category}"
        feature_names.append(feature_name)

# Set feature names for the encoded DataFrame
encoded_df.columns = feature_names

# Combine encoded features with numerical features horizontally to get the final dataset
data_encoded = pd.concat([data[numerical_cols], encoded_df], axis=1)

# Define target variables
target_classification = data['RainTomorrow']
target_regression = data['RISK_MM']

# Split the dataset into training and testing sets
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(data_encoded, target_classification, test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(data_encoded, target_regression, test_size=0.2, random_state=42)



In [None]:
# For the classification task (predicting 'RainTomorrow'), 
# we will train a Random Forest classifier.

# For the regression task (predicting 'RISK_MM'),
# we will train a Gradient Boosting regressor.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error, r2_score


# Train the Random Forest Classifier
classifier = RandomForestClassifier()
classifier.fit(X_train_class, y_train_class)

# Make predictions on the test set for classification
y_pred_class = classifier.predict(X_test_class)

# Calculate accuracy and other classification metrics
accuracy = accuracy_score(y_test_class, y_pred_class)
classification_report = classification_report(y_test_class, y_pred_class)
confusion_matrix = confusion_matrix(y_test_class, y_pred_class)

print("Accuracy of Random Forest classifier:", accuracy)
print("Classification Report:")
print(classification_report)
print("Confusion Matrix:")
print(confusion_matrix)

# Train the Gradient Boosting Regressor
regressor = GradientBoostingRegressor()
regressor.fit(X_train_reg, y_train_reg)

# Make predictions on the test set for regression
y_pred_reg = regressor.predict(X_test_reg)

# Calculate regression metrics
mse = mean_squared_error(y_test_reg, y_pred_reg)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

print("Mean Squared Error of Gradient Boosting regressor:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Accuracy of Random Forest classifier: 1.0
Classification Report:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9680
         Yes       1.00      1.00      1.00      2742

    accuracy                           1.00     12422
   macro avg       1.00      1.00      1.00     12422
weighted avg       1.00      1.00      1.00     12422

Confusion Matrix:
[[9680    0]
 [   0 2742]]


In [None]:
# The results are quite good. The accuracy of Random Forest classifier is 1.0, which is  
# highly unusual. However, high accuracy can be achieved, but it is crucial to 
# carefully analyze the data and modeling process to ensure the validity of the
# results and avoid poten`tial issues such as overfitting or bias.

# First check the numerical values of the dataset for the outliers. If the dataset is 
# clean from them, it can lead to high accuracy.
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
data[numerical_cols].boxplot()
plt.xticks(rotation=90)
plt.title("Box plots of Numerical Features")
plt.xlabel("Feature")
plt.ylabel("Value")
plt.show()

In [None]:
# As we can see from the plot, the data of each feature are quite similar and there are
# almost no outliers in the dataset. The percent of them in the Pressure column is very low.
outliers_3pm = len(data[data['Pressure_3pm'] > 9000])
outliers_9am = len(data[data['Pressure_9am'] > 9000])
count_3pm = len(data[data['Pressure_3pm'] > 0])
count_9am = len(data[data['Pressure_9am'] > 0])
print(f"Number of rows with pressure values greater than 8000: 3pm: {outliers_3pm}, 9am: {outliers_9am}")
print(f"Percent of outliers at 9am: {100*outliers_9am/count_9am}%")
print(f"Percent of outliers at 3pm: {100*outliers_3pm/count_3pm}%")

In [None]:
# As the result we can assume that the model was trained very well. There are a lot of 
# reasons of the perfect accuracy of the classification task and almost perfect results
# of the regression task. Firstly, the dataset contains a lot of features, which helps
# to train model to predict more accurate results. Secondly, well-performed data 
# preprocessing is very crucial for training the models. Thirdly, classification values
# are quite simple: Yes or No. This also a reason for better performance of the model. 
# And finally, dataset almost doesn't contain any outliers.