In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names. Prefer the new name when this
# installation provides it and fall back to the legacy name otherwise.
whitegrid_style = "seaborn-v0_8-whitegrid"
if whitegrid_style not in plt.style.available:
    whitegrid_style = "seaborn-whitegrid"
plt.style.use(whitegrid_style)

# Seed NumPy's global random number generator for reproducibility.
np.random.seed(0)

import os

# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [2]:
# Read the weather history CSV into a DataFrame and preview its first rows.
csv_path = '../input/szeged-weather/weatherHistory.csv'
train = pd.read_csv(csv_path)
train.head()

In [3]:
# Number of missing values per column.
train.isna().sum()

In [4]:
# Descriptive statistics of `train` (pandas' default `describe()` output).
train.describe()


In [5]:
# One histogram per column that DataFrame.hist can plot; the trailing semicolon
# keeps the notebook from echoing the returned array of Axes below the figure.
train.hist(figsize=(18, 12));

In [6]:
# Count the rows whose pressure reading is exactly zero.
(train['Pressure (millibars)'] == 0).sum()

In [7]:
# Column labels grouped by dtype: object columns vs. int64/float64 columns.
ColunaCat = train.select_dtypes(include="object").columns
ColunaNum = train.select_dtypes(include=["int64", "float64"]).columns

In [8]:
# Show how often each distinct value occurs in every object-typed column.
for cat_col in ColunaCat:
    counts = train[cat_col].value_counts()
    print(cat_col, '\n', counts, '\n\n')

In [9]:
# Keep only the text before the first '+' of "Formatted Date" (presumably this
# strips a UTC offset - confirm against the raw data), parse it as a datetime
# into a new "Date" column, and remove the original text column.
date_text = train["Formatted Date"].str.split('+').str[0]
train = (
    train
    .assign(Date=pd.to_datetime(date_text))
    .drop(columns="Formatted Date")
)

In [10]:
# Break the parsed timestamp into calendar components and drop the "Date"
# column they were derived from.
date_parts = train["Date"].dt
train = train.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
    Hour=date_parts.hour,
).drop(columns="Date")
train.head()

In [11]:
# Remove the 'Loud Cover' column, then discard every row whose "Summary" is one
# of the listed values (presumably classes too rare to model - confirm against
# the value counts printed earlier).
excluded_summaries = [
    "Breezy and Dry",
    "Windy and Dry",
    "Dangerously Windy and Partly Cloudy",
]
train = train.drop(columns='Loud Cover')
excluded_rows = train.index[train["Summary"].isin(excluded_summaries)]
train = train.drop(index=excluded_rows)


In [12]:
# Remove the labels that are no longer categorical features: "Formatted Date"
# was dropped from `train` and "Daily Summary" is used as the target.
# errors="ignore" makes the cell safe to re-run; without it a second execution
# raises KeyError because the labels are already gone from the Index.
ColunaCat = ColunaCat.drop(["Formatted Date", "Daily Summary"], errors="ignore")

In [13]:
# Preview the first rows of `train` after the clean-up steps in the cells above.
train.head()

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Pressure readings equal to 0 are treated as missing and replaced using
# SimpleImputer's default strategy (the column mean).
imputer = SimpleImputer(missing_values=0)

# Categorical columns: fill missing entries with the literal "none", then map
# each category to an integer code. Categories that only show up at transform
# time (for example in the test split) are encoded as -1 instead of raising a
# ValueError; codes for categories seen during fit are unchanged.
cat_transf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="none")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value",
                               unknown_value=-1)),
])

# Apply the pressure imputer and the categorical pipeline to their columns and
# pass every remaining column through untouched.
transformer = ColumnTransformer(
    transformers=[
        ("press", imputer, ["Pressure (millibars)"]),
        ("categorical", cat_transf, ColunaCat),
    ],
    remainder="passthrough",
)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Target is the daily summary text; every other column is a feature.
y = train['Daily Summary']
x = train.drop(columns='Daily Summary')

# Stratified 70/30 split. random_state is pinned so the split does not depend
# on how much of NumPy's global random stream earlier cells (or a re-run of
# this cell) have already consumed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=0
)

# Fit the preprocessing on the training split only and reuse it for the test split.
x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

# Encode the class labels as integers, again fitting on the training split only.
label = LabelEncoder()
y_train = label.fit_transform(y_train)
y_test = label.transform(y_test)

In [16]:
from tensorflow import keras
from tensorflow.keras import layers

# Width of the network input: read it from the transformed training matrix that
# is actually fed to the model, rather than from the raw feature frame whose
# column count only matches while every transformer step maps one column to one.
input_size = x_train.shape[1]
# Number of output units: one per class known to the fitted label encoder.
out_size = len(label.classes_)

In [17]:
# Fully connected classifier: two 256-unit ReLU layers, each followed by batch
# normalisation and 30% dropout, then a softmax layer with one unit per class.
model = keras.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=[input_size]))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(out_size, activation='softmax'))

In [18]:
# One-hot encode the integer class labels for categorical cross-entropy.
# num_classes is fixed to the size of the output layer so both splits get the
# same width even if one of them lacks the highest class index. The ndim check
# makes the cell safe to re-run: labels that are already one-hot (2-D) are left
# alone instead of being encoded a second time.
if y_train.ndim == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes=out_size)
if y_test.ndim == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes=out_size)


In [19]:
# Adam optimiser with categorical cross-entropy; accuracy is tracked as a metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["categorical_accuracy"],
)

# Stop training after 10 epochs without an improvement of at least 0.001 in the
# monitored validation loss, and roll back to the best weights seen.
early_ = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

In [20]:
# Train for up to 100 epochs in batches of 3000 rows with early stopping.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics also drive early stopping and are not an independent
# estimate - consider carving a separate validation split from the training data.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=3000,
    epochs=100,
    callbacks=[early_],
    validation_data=(x_test, y_test),
)

In [24]:
# Turn the Keras History object into a DataFrame with one row per epoch. The
# isinstance guard keeps the cell re-runnable: once `history` is already a
# DataFrame, a second `history.history` lookup would raise AttributeError.
if not isinstance(history, pd.DataFrame):
    history = pd.DataFrame(history.history)

fig, axs = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: training vs. validation loss per epoch.
axs[0].set_title("Cross-Entropy")
sns.lineplot(data=history.loc[:, ["loss", "val_loss"]], ax=axs[0])
axs[0].set(xlabel="Epoch", ylabel="Loss")

# Right panel: training vs. validation accuracy per epoch.
axs[1].set_title("Accuracy")
sns.lineplot(data=history.loc[:, ["categorical_accuracy", "val_categorical_accuracy"]], ax=axs[1])
axs[1].set(xlabel="Epoch", ylabel="Accuracy");