In [215]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Szeged weather-history CSV into the working frame `wH`.
WEATHER_CSV = '../input/szeged-weather/weatherHistory.csv'
wH = pd.read_csv(filepath_or_buffer=WEATHER_CSV)

# First print the per-column dtype / non-null overview, then leave the
# numeric summary statistics as the cell's displayed result.
wH.info()
wH.describe()

In [216]:
# Show the rows whose 'Precip Type' entry is missing.
precip_missing = wH['Precip Type'].isna()
print(wH[precip_missing])
print('\nÉ possível que valores faltantes representem dias sem precipitação.\n')
# Candidate fix, left disabled: tag the missing entries with an explicit
# 'noprec' category.
#wH['Precip Type'] = wH['Precip Type'].fillna('noprec')
wH.info()

In [217]:
# Inspect the rows that report a visibility of exactly zero.
visibility = wH['Visibility (km)']
print(wH[visibility == 0])
print('\nValores 0 na visibilidade serão tratados como NaN\n')
# Turn those zero readings into NaN so they are handled as missing values.
wH['Visibility (km)'] = visibility.mask(visibility == 0)
print('\nAgora o valor mínimo é 16 m, um valor já possível.')
# Summary statistics after the replacement are the cell's displayed result.
wH.describe()

In [218]:
# Inspect the rows that report a pressure of exactly zero.
pressure = wH['Pressure (millibars)']
print(wH[pressure == 0])
print('\nValores 0 na pressão serão tratados como NaN\n')
# Keep every non-zero reading; zero readings become NaN (missing).
wH['Pressure (millibars)'] = pressure.where(pressure != 0)
print('\nAgora o valor mínimo é 973.78 mbar, um valor já possível.')
# Summary statistics after the replacement are the cell's displayed result.
wH.describe()

In [219]:
# Histogram of each numeric column, one subplot per column.
# DataFrame.hist documents `ax` as a Matplotlib Axes, so a Figure must not be
# passed there, and a single Axes cannot hold one subplot per column anyway.
# Let pandas create the subplot grid itself at the desired figure size.
# The trailing ';' suppresses the repr of the returned array of Axes.
wH.hist(figsize=(15, 15));

In [220]:
# Names of the numeric columns.
num = wH.select_dtypes(include=['number']).columns.tolist()
print('Numéricas:\n',num)

# Names of the object-typed (string) columns.
cat = wH.select_dtypes(include=['object']).columns.tolist()
print('\nCategóricas:\n',cat)

print('\nA data não deveria ser categórica.')

# Frequency table of every object-typed column, each preceded by a blank line.
for col in cat:
    print('', wH[col].value_counts(), sep='\n')

In [221]:
# Keep only the text before the first '+' of 'Formatted Date' (presumably this
# drops a UTC-offset suffix -- confirm against the raw data) and parse it as a
# timestamp.
stamp_text = wH['Formatted Date'].str.split('+').str[0]
wH['Date'] = pd.to_datetime(stamp_text)

# Expand the timestamp into separate numeric calendar features.
for part in ('hour', 'day', 'month', 'year'):
    wH[part] = getattr(wH['Date'].dt, part)

# The raw and parsed date columns are no longer needed once the calendar
# features exist; 'Loud Cover' is dropped together with them.
wH = wH.drop(columns=['Formatted Date', 'Date', 'Loud Cover'])

# Keep only the rows whose 'Summary' value occurs more than once in the frame.
summary_repeats = wH.duplicated(subset=['Summary'], keep=False)
wH = wH[summary_repeats]
# Earlier manual alternative, left disabled:
#weatherHistory.drop(weatherHistory.loc[weatherHistory["Summary"].isin(["Breezy and Dry","Windy and Dry","Dangerously Windy and Partly Cloudy"])].index, inplace=True)

wH.describe()

In [222]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the
# 'Daily Summary' label.
X = wH.drop(['Daily Summary'], axis = 1)
y = wH['Daily Summary']

# Stratified 70/30 split with a fixed seed so the split is reproducible.
Xtrain, Xval, ytrain, yval = train_test_split(X,y,test_size=0.3,stratify=y,random_state = 0)

# Column names by dtype, used to route columns through the transformers below.
num = list(X.select_dtypes(include=['number']))
cat = list(X.select_dtypes(include=['object']))

# Numeric columns: fill missing values with the column mean.
numT = SimpleImputer(strategy = 'mean')

# Categorical columns: fill missing values with the most frequent category,
# then map categories to integer codes. A category that occurs only in the
# validation split would make a default OrdinalEncoder raise at transform
# time; encode such unseen categories as -1 instead. Codes of categories seen
# during fit are unchanged.
catT = Pipeline(steps=[('imputer', SimpleImputer(strategy = 'most_frequent')),
                       ('encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value',
                                                  unknown_value = -1))])

colTrans = ColumnTransformer(transformers=[('num', numT, num),
                                           ('cat', catT, cat)])

In [223]:
# Fit the column transformer on the training split only, then apply the fitted
# transformer to both splits.
colTrans.fit(Xtrain)
Xtrain = colTrans.transform(Xtrain)
Xval = colTrans.transform(Xval)

# Encode the target labels as integers, with the mapping learned from the
# training labels.
label = LabelEncoder()
ytrain = label.fit_transform(ytrain)
yval = label.transform(yval)

In [224]:
from tensorflow import keras

# Input width is taken from the transformed matrix that is actually fed to the
# network (the ColumnTransformer only emits the columns listed in its
# transformers). Output width is the number of distinct target labels.
nEnt = Xtrain.shape[1]
nSai = y.unique().shape[0]

# Two hidden blocks (Dense -> BatchNorm -> Dropout) followed by a softmax
# layer with one unit per class.
model = keras.Sequential([keras.layers.Dense(256, activation='relu',
                                             input_shape = [nEnt]),
                          keras.layers.BatchNormalization(),
                          keras.layers.Dropout(rate=0.3),
                          keras.layers.Dense(256, activation='relu'),
                          keras.layers.BatchNormalization(),
                          keras.layers.Dropout(rate=0.3),
                          keras.layers.Dense(nSai, activation='softmax')])

# One-hot encode the integer labels. The width is pinned to nSai so both
# splits match the softmax layer even when the highest label id is absent
# from one of them (otherwise to_categorical infers the width per array).
ytrain = keras.utils.to_categorical(ytrain, num_classes=nSai)
yval = keras.utils.to_categorical(yval, num_classes=nSai)

# Keras documents `metrics` as a list, so the metric name is wrapped in one
# rather than passed as a bare string.
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["categorical_accuracy"])

# Stop once the monitored quantity has not improved by at least 0.001 for 10
# epochs, and roll back to the best weights seen.
ES = keras.callbacks.EarlyStopping(patience=10,min_delta=0.001,restore_best_weights=True)

history = model.fit(Xtrain, ytrain,
                    validation_data = (Xval, yval),
                    batch_size = 3000,epochs = 100,
                    callbacks = [ES])

# Per-epoch metrics as a DataFrame (one row per epoch) for plotting.
h = pd.DataFrame(history.history)

# Training vs. validation curves: loss on top, accuracy below.
fig, (ax1,ax2) = plt.subplots(2, 1, figsize = (15,6))
ax1.set_title('Perda')
sns.lineplot(data = h.loc[:,['loss', 'val_loss']], ax=ax1)
ax2.set_title('Acurácia')
sns.lineplot(data = h.loc[:,['categorical_accuracy', 'val_categorical_accuracy']], ax=ax2)