In [1]:
import pandas as pd

TARGET = 'Weather Type'
WEATHER = '/kaggle/input/weather-type-classification/weather_classification_data.csv'

df = pd.read_csv(filepath_or_buffer=WEATHER)
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


Let's do some dimension reduction using just the numerical data and see if our data has a strong signal.

In [2]:
import arrow
from umap import UMAP

COLUMNS = [column for column, dtype in df.dtypes.items() if str(dtype) in {'int64', 'float64'}]
TARGET = 'Weather Type'
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-21 20:04:56.910640: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 20:04:56.910799: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 20:04:57.075762: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sun Jul 21 20:05:08 2024 Construct fuzzy simplicial set
Sun Jul 21 20:05:08 2024 Finding Nearest Neighbors
Sun Jul 21 20:05:08 2024 Building RP forest with 11 trees
Sun Jul 21 20:05:14 2024 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	Stopping threshold met -- exiting after 3 iterations
Sun Jul 21 20:05:34 2024 Finished Nearest Neighbor Search
Sun Jul 21 20:05:38 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sun Jul 21 20:06:01 2024 Finished embedding
done with UMAP in 0:00:52.187727


In [3]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=df.sample(n=1000, random_state=2024), x='x', y='y', color=TARGET, )

This is encouraging; we have some tough cases but most of our data clusters nicely.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df[TARGET], test_size=0.2, random_state=2024, stratify=df[TARGET])
model = LogisticRegression(max_iter=10000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 692 iterations
accuracy: 0.8561


Our accuracy is good but not great; is our model getting anything systematically wrong?

In [5]:
from sklearn.metrics import classification_report

print(classification_report(zero_division=0 , y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

      Cloudy       0.86      0.79      0.82       660
       Rainy       0.83      0.90      0.86       660
       Snowy       0.90      0.90      0.90       660
       Sunny       0.84      0.84      0.84       660

    accuracy                           0.86      2640
   macro avg       0.86      0.86      0.86      2640
weighted avg       0.86      0.86      0.86      2640



No not really; the model does well for snowy weather and equally poorly on the other three classes.

In [6]:
from plotly import express

express.histogram(x=COLUMNS, y=model.coef_[0], title='Regression coefficients')

Would you have guessed up front that visibility is the most important feature?