In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("weather.csv")

In [None]:
df.head() #looking good

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [4]:
df.info() #too good, no missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [4]:
for col in df.select_dtypes(include="object"):
    col = df[col]
    print(f"{col.name}: unique: {col.unique()}")

Cloud Cover: unique: ['partly cloudy' 'clear' 'overcast' 'cloudy']
Season: unique: ['Winter' 'Spring' 'Summer' 'Autumn']
Location: unique: ['inland' 'mountain' 'coastal']
Weather Type: unique: ['Rainy' 'Cloudy' 'Sunny' 'Snowy']


In [5]:
df.select_dtypes(exclude="object").describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [6]:
# check inbalance
df["Weather Type"].value_counts()

Weather Type
Rainy     3300
Cloudy    3300
Sunny     3300
Snowy     3300
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
numerical_cols = ["Temperature","Humidity","Wind Speed","Precipitation (%)","Atmospheric Pressure","UV Index","Visibility (km)"]
categorical_cols = ["Cloud Cover","Season","Location"]

numerical_pipeline = Pipeline([("scaler",StandardScaler(with_mean=False))])
categorical_pipeline = Pipeline([("encoder",OneHotEncoder())])

preprocessor = ColumnTransformer([("numerical pipeline", numerical_pipeline, numerical_cols), ("categorical",categorical_pipeline, categorical_cols)])

In [8]:
from sklearn.preprocessing import LabelEncoder

train_output_series = df["Weather Type"]
# test_output_series = test_df["Weather Type"]
            
all_classes = train_output_series.value_counts().to_list()
            
le = LabelEncoder()
le.fit(all_classes)
train_output_series = le.transform(train_output_series)
# test_output_series = le.transform(test_output_series)

ValueError: invalid literal for int() with base 10: 'Rainy'

In [10]:
X = df.drop("Weather Type", axis=1)
y = df["Weather Type"]

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_input_train_arr = preprocessor.fit_transform(X_train)

In [13]:
X_input_test_arr = preprocessor.transform(X_test)

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_output_train_arr = le.fit_transform(y_train)
y_output_test_arr = le.transform(y_test)

In [15]:
from tensorflow import keras
import tensorflow as tf

In [16]:
model = keras.Sequential([
    keras.layers.Dense(32, activation="relu", input_shape=(X_input_train_arr.shape[1],)),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(4, activation="softmax")
])

model.compile(optimizer="adam", loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=["sparse_categorical_accuracy"])

history = model.fit(X_input_train_arr, y_output_train_arr, epochs=100, batch_size=32)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1.2568 - sparse_categorical_accuracy: 0.4732
Epoch 2/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.5472 - sparse_categorical_accuracy: 0.8537
Epoch 3/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.4417 - sparse_categorical_accuracy: 0.8688
Epoch 4/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.3742 - sparse_categorical_accuracy: 0.8831
Epoch 5/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.3529 - sparse_categorical_accuracy: 0.8869
Epoch 6/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3144 - sparse_categorical_accuracy: 0.8975
Epoch 7/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3039 - sparse_categorical_accuracy: 0.8993
Epoch 8/100
[1m330/

In [17]:
X_input_test_arr[0:1]

array([[ 1.32090861,  1.87744562,  0.65191991,  0.1878331 , 27.40841499,
         2.33481081,  2.97388887,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ]])

In [18]:
preds = model.predict(X_input_test_arr)

[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [19]:
preds_label = np.argmax(preds,axis=1)

In [20]:
y_output_test_arr.shape

(2640,)

In [21]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(y_output_test_arr,preds_label ))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       651
           1       0.93      0.88      0.90       647
           2       0.94      0.94      0.94       701
           3       0.93      0.88      0.90       641

    accuracy                           0.90      2640
   macro avg       0.90      0.90      0.90      2640
weighted avg       0.90      0.90      0.90      2640

