In [517]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline


# Load the weather history CSV into the raw working frame.
csv_path = '../input/szeged-weather/weatherHistory.csv'
data = pd.read_csv(csv_path)

**1 - Exploração do banco de dados**

In [518]:
# Number of missing values in each column.
print(data.isnull().sum())
print()
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() appended a stray "None".
data.info()
# Last expression of the cell: rendered as the cell output.
data.head()

In [519]:
# Forward-fill missing precipitation types from the previous row.
# The result is assigned back explicitly: `fillna(method=..., inplace=True)`
# on a column selection relies on the deprecated `method=` argument and on a
# chained in-place update that does not reach `data` under Copy-on-Write.
data['Precip Type'] = data['Precip Type'].ffill()

# Summary statistics of the numeric columns (cell output).
data.describe()

In [520]:
# One histogram per numeric column; the trailing semicolon keeps the cell
# from echoing the repr of the returned array of axes under the figure.
data.hist(figsize=(20, 10));

In [521]:
# Count the rows whose pressure reading is exactly 0, selecting rows and
# column in a single .loc call.
pressure_is_zero = data['Pressure (millibars)'] == 0
data.loc[pressure_is_zero, 'Pressure (millibars)'].count()

In [522]:
# Partition the column labels by dtype: object columns are handled as
# categorical, int64/float64 columns as numerical.
categorical = data.select_dtypes(include='object').columns
numerical = data.select_dtypes(include=['int64', 'float64']).columns

# One-element tuples make %-formatting treat each Index as a single argument.
print('Variaveis Numericas: %s' % (numerical,))

print('\nVariaveis Categoricas: %s' % (categorical,))

In [523]:
# Print the frequency table of every categorical column.
for column_name in categorical:
    frequencies = data[column_name].value_counts()
    print('%s: %s' % (column_name, frequencies))

# Continue on a copy so the frame loaded from disk is left as it is.
train_new = data.copy()


**2 - Limpeza do banco de dados**

In [524]:
# Keep only the text before the first '+' of each timestamp string, parse it
# as a datetime, then expose the column under the shorter name 'Date'.
stamp_text = train_new['Formatted Date'].str.split('+').str[0]
train_new['Formatted Date'] = pd.to_datetime(stamp_text)

train_new = train_new.rename(columns={'Formatted Date': 'Date'})

In [525]:
# Break the timestamp into calendar/clock components, appended in the order
# Day, Month, Year, Hour, then discard the source timestamp together with
# the 'Loud Cover' column.
date_parts = train_new['Date'].dt
train_new = train_new.assign(
    Day=date_parts.day,
    Month=date_parts.month,
    Year=date_parts.year,
    Hour=date_parts.hour,
).drop(columns=['Date', 'Loud Cover'])

In [526]:
# Remove every row whose 'Summary' is one of the listed values.
# NOTE(review): presumably these categories are too infrequent to be split
# reliably between train and test -- confirm against the value counts.
excluded_summaries = [
    'Windy and Dry',
    'Dangerously Windy and Partly Cloudy',
    'Breezy and Dry',
]
rows_to_remove = train_new.index[train_new['Summary'].isin(excluded_summaries)]
train_new = train_new.drop(index=rows_to_remove)

# 'Formatted Date' no longer exists in train_new and 'Daily Summary' is the
# prediction target, so neither belongs in the categorical feature list.
non_feature_columns = ['Formatted Date', 'Daily Summary']
categorical = categorical.drop(non_feature_columns)

In [527]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fixed seed so the train/test partition (and everything fitted on it) is
# the same on every Restart & Run All.
SPLIT_SEED = 42

# Pressure readings equal to 0 are treated as missing and replaced using
# SimpleImputer's default strategy.
simple_imputer = SimpleImputer(missing_values=0)

# Categorical columns: fill missing entries with the literal 'none', then map
# each category to an integer code. Categories that appear only at transform
# time are encoded as -1 instead of raising, so an unlucky split cannot crash
# the transform of the test set.
pipecat = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='none')),
                          ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                                                     unknown_value=-1))])

# Columns not named here are passed through unchanged, after the transformed ones.
transformer = ColumnTransformer(transformers=[('press', simple_imputer, ['Pressure (millibars)']),
                                              ('categorical', pipecat, list(categorical))],
                                remainder='passthrough')

# Target is the daily summary text; every other column is a feature.
Y = train_new['Daily Summary']
X = train_new.drop(columns=['Daily Summary'])

# Stratified 70/30 split so both partitions keep the class proportions of Y.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y,
                                                    random_state=SPLIT_SEED)

# Fit the feature transformer on the training rows only, then apply it to both.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

# Fit the label encoder on the complete target so the integer codes span
# every class in Y, whichever partition a class landed in.
label_encoder = LabelEncoder().fit(Y)
Y_train = label_encoder.transform(Y_train)
Y_test = label_encoder.transform(Y_test)

**3 - Treinamento e Avaliação da rede neural**

In [528]:
from tensorflow import keras
from tensorflow.keras import layers

# Size the input layer from the matrix that is actually fed to the network.
# Reading the width from the pre-transform frame X only matches while the
# transformer happens to emit exactly one output column per input column.
columns_count = X_train.shape[1]
# One softmax unit per distinct target value.
y_outline = Y.unique().shape[0]

# Two hidden blocks of Dense(256, relu) -> BatchNormalization -> Dropout(0.3),
# followed by a softmax classification layer.
model = keras.Sequential([layers.Dense(256, activation='relu', input_shape=[columns_count]),
                          layers.BatchNormalization(),
                          layers.Dropout(0.3),
                          layers.Dense(256, activation='relu'),
                          layers.BatchNormalization(),
                          layers.Dropout(0.3),
                          layers.Dense(y_outline, activation='softmax'),])

In [529]:
# One-hot encode the integer labels. The width is pinned to the number of
# output units of the network: without num_classes it is inferred from the
# largest label present in each array, so the train and test matrices could
# end up with different widths from each other and from the softmax layer.
Y_train = keras.utils.to_categorical(Y_train, num_classes=y_outline)
Y_test = keras.utils.to_categorical(Y_test, num_classes=y_outline)

In [530]:
# Configure training: Adam optimizer, categorical cross-entropy loss on the
# one-hot targets, categorical accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Early-stopping callback: patience of 10 epochs, minimum improvement of
# 0.001, and the best weights restored when training stops.
early_stopping_options = {
    'patience': 10,
    'min_delta': 0.001,
    'restore_best_weights': True,
}
early_ = keras.callbacks.EarlyStopping(**early_stopping_options)

In [531]:
# Train for at most 100 epochs in batches of 3000 rows. The held-out split is
# passed as validation data and the early-stopping callback is attached.
validation_pair = (X_test, Y_test)
history = model.fit(
    X_train,
    Y_train,
    validation_data=validation_pair,
    batch_size=3000,
    epochs=100,
    callbacks=[early_],
)

In [540]:
# Per-epoch metrics recorded by model.fit, one row per epoch.
history_dataframe = pd.DataFrame(history.history)
fig, pl = plt.subplots(1, 2, figsize=(28, 14))

# Left panel: training vs. validation loss.
sns.lineplot(data=history_dataframe.loc[:, ["loss", "val_loss"]], ax=pl[0])
pl[0].set_title("Cross-Entropy")
pl[0].set_xlabel("Época")
pl[0].set_ylabel("Cross-Entropy")

# Right panel: training vs. validation categorical accuracy. The panel is
# titled "Acurácia" (accuracy) because that is the metric being plotted;
# "Precisão" (precision) names a different metric.
sns.lineplot(data=history_dataframe.loc[:, ["categorical_accuracy", "val_categorical_accuracy"]], ax=pl[1])
pl[1].set_title("Acurácia")
pl[1].set_xlabel("Época")
pl[1].set_ylabel("Acurácia")

# Render the figure without echoing the repr of the last Axes call.
plt.show()