In [27]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.preprocessing import StandardScaler, LabelEncoder  
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.figure_factory as ff

In [28]:
df = pd.read_csv('weatherAUS.csv.zip', compression='zip')
df = df.drop(['RainToday', 'Date'], axis=1)

In [29]:
df.sample(5)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
91350,GoldCoast,19.5,26.1,0.2,,,E,80.0,SSW,SE,...,33.0,79.0,78.0,1015.2,1013.9,,,21.9,22.9,Yes
19405,NorahHead,10.1,16.3,5.0,,,SSW,26.0,WSW,SSE,...,13.0,73.0,65.0,1026.7,1023.0,,,12.6,15.7,No
34045,SydneyAirport,9.8,15.5,0.0,3.8,7.2,W,65.0,W,SSW,...,24.0,47.0,39.0,1017.8,1017.8,1.0,1.0,12.0,15.4,No
133150,Launceston,13.1,19.7,0.0,12.0,,SSE,41.0,SSE,S,...,26.0,72.0,55.0,1029.5,1029.8,,,14.5,19.0,No
64635,MelbourneAirport,14.5,24.0,0.0,5.4,8.2,SW,54.0,W,SW,...,31.0,57.0,53.0,1019.9,1020.2,7.0,7.0,17.3,20.4,No


In [30]:
df.RainTomorrow.value_counts()

No     110316
Yes     31877
Name: RainTomorrow, dtype: int64

In [31]:
# Задание 1: провести EDA (Explanatory Data Analysis)

# Желательно построить графики по некоторым, на ваш взгляд, ключевым признакам и целевой переменной,
# чтобы оценить данные, с которыми мы работаем. Это задание полностью творческое и будет оценено субъективно
# с точки зрения степени раскрытия инсайтов и "красивости" графиков и статистических расчетов

In [32]:
df = df.dropna()

df['RainTomorrow_int'] = np.where(df['RainTomorrow'] == 'Yes', 1, 0)

df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow,RainTomorrow_int
6049,Cobar,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,SW,...,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,0
6050,Cobar,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,SSE,...,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No,0
6052,Cobar,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,NNW,...,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No,0
6053,Cobar,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,WSW,...,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No,0
6054,Cobar,24.2,41.0,0.0,11.2,8.4,WNW,35.0,NW,WNW,...,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No,0


In [33]:
df_yes = df[df['RainTomorrow_int'] == 1]
df_yes.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow_int
count,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0,12427.0
mean,14.520286,22.285129,5.487302,4.599026,4.529597,46.727368,16.980204,21.209624,75.314959,66.905931,1013.926909,1011.816834,5.946729,6.26193,17.980309,20.348869,1.0
std,6.475014,6.866919,11.553907,3.155133,3.390385,15.453586,9.031452,9.284455,15.746792,18.449353,7.127063,7.163316,2.16376,1.840983,6.599997,6.712269,0.0
min,-4.7,7.0,0.0,0.0,0.0,11.0,2.0,2.0,5.0,1.0,980.5,977.1,0.0,0.0,-0.1,4.3,1.0
25%,9.2,16.8,0.0,2.2,1.4,35.0,11.0,15.0,66.0,55.0,1009.3,1007.0,5.0,6.0,12.6,14.9,1.0
50%,14.0,21.4,0.8,4.0,4.3,44.0,15.0,20.0,77.0,68.0,1013.8,1011.6,7.0,7.0,17.3,19.6,1.0
75%,19.9,27.8,5.8,6.4,7.2,56.0,22.0,28.0,88.0,81.0,1018.6,1016.5,7.0,7.0,23.2,25.4,1.0
max,29.8,46.8,206.2,43.0,13.9,122.0,65.0,65.0,100.0,100.0,1039.5,1036.0,8.0,8.0,36.4,46.1,1.0


In [34]:
df_no = df[df['RainTomorrow_int'] == 0]
df_no.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow_int
count,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0,43993.0
mean,13.166611,24.765538,1.182149,5.758525,8.641254,39.224877,15.296343,19.384857,63.207306,44.714023,1018.175237,1015.637006,3.760075,3.779806,18.26842,23.377392,0.0
std,6.368581,6.902305,4.617274,3.79657,3.338099,12.174093,8.065273,8.234055,18.363871,17.847462,6.549944,6.544964,2.767432,2.583256,6.557603,6.722703,0.0
min,-6.7,4.1,0.0,0.0,0.0,9.0,2.0,2.0,0.0,0.0,982.9,983.2,0.0,0.0,-0.7,3.7,0.0
25%,8.4,19.3,0.0,3.0,6.7,31.0,9.0,13.0,53.0,32.0,1013.7,1011.0,1.0,1.0,13.3,18.1,0.0
50%,13.0,24.6,0.0,5.2,9.5,37.0,15.0,19.0,64.0,46.0,1018.0,1015.5,3.0,3.0,17.9,23.1,0.0
75%,18.0,30.1,0.2,7.8,11.0,46.0,20.0,24.0,76.0,58.0,1022.5,1020.1,7.0,6.0,23.3,28.5,0.0
max,31.4,48.1,182.6,81.2,14.5,124.0,67.0,76.0,100.0,100.0,1040.4,1038.9,8.0,9.0,39.4,46.1,0.0


In [35]:
# Если смотреть по mean, то сильные различия есть в столбцах Rainfall, WindGustSpeed, Humidity9am, Humidity3pm

In [36]:
corrs = df.corr()

figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)

figure.show()

# Судя по всему, влияние на RainTomorrow оказывают Rainfall, Sunshine, Humidity9am, Humidity3pm, Cloud3pm, Temp3pm

In [None]:
# Задание 2:

# Данные, которые мы видим, представлены в табличном формате и имеют несколько проблем, требующих решения:

# 1) Есть пропущенные значения (вероятно, некоторые колонки или строчки следовательно необходимо убрать)
# 2) Некоторые признаки представлены не в виде числовых данных (нужно закодировать с помощью Label Encoder)
# 3) Дизбаланс данных (не каждый метод оценки подойдет)
# 4) Данные имеют разный разброс. Вероятно, требуется скейлинг

In [38]:
df = df.drop(columns=['RainTomorrow']).rename(columns={'RainTomorrow_int': 'RainTomorrow'}).reset_index()

In [39]:
le = preprocessing.LabelEncoder()
df['Location'] = le.fit_transform(df['Location'])
df['WindGustDir'] = le.fit_transform(df['WindGustDir'])
df['WindDir9am'] = le.fit_transform(df['WindDir9am'])
df['WindDir3pm'] = le.fit_transform(df['WindDir3pm'])

df.sample(10)

Unnamed: 0,level_0,index,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
52421,52421,137575,0,11.3,30.3,0.0,11.0,11.3,2,46.0,...,9.0,24.0,30.0,13.0,1018.1,1013.6,0.0,2.0,21.4,29.1
18610,18610,65620,9,8.5,28.7,0.0,6.4,13.1,10,33.0,...,11.0,11.0,55.0,29.0,1015.0,1012.2,1.0,3.0,16.2,25.8
1170,1170,9761,5,19.4,23.4,4.0,2.6,0.0,5,22.0,...,11.0,11.0,94.0,93.0,1012.4,1011.3,8.0,8.0,19.8,20.0
23420,23420,71864,10,11.8,31.5,0.0,6.8,10.8,8,43.0,...,6.0,13.0,47.0,14.0,1022.5,1019.2,0.0,1.0,17.9,30.8
38195,38195,100732,12,5.2,15.9,2.4,1.6,6.1,11,52.0,...,13.0,24.0,92.0,88.0,1025.3,1023.0,7.0,3.0,12.2,12.3
52556,52556,137754,0,13.5,38.4,0.0,11.6,11.9,2,57.0,...,26.0,26.0,8.0,6.0,1015.6,1010.9,0.0,0.0,28.8,36.1
46834,46834,121346,15,6.4,17.8,0.0,3.0,9.1,0,31.0,...,11.0,11.0,57.0,36.0,1030.6,1026.5,0.0,0.0,11.6,17.3
31110,31110,86025,1,17.8,27.0,0.0,6.2,5.5,10,26.0,...,9.0,9.0,59.0,58.0,1019.9,1017.0,3.0,6.0,23.1,25.3
53349,53349,138675,0,12.0,30.8,0.0,7.4,11.2,8,39.0,...,2.0,17.0,24.0,12.0,1020.2,1016.2,0.0,1.0,22.6,29.3
44329,44329,118741,16,17.6,29.7,0.0,8.6,12.2,15,41.0,...,17.0,28.0,50.0,52.0,1013.3,1012.6,1.0,6.0,23.8,27.7


In [43]:
scaler = StandardScaler()
df[['Humidity9am', 'Humidity3pm']] = scaler.fit_transform(df[['Humidity9am', 'Humidity3pm']])
df[['MinTemp', 'MaxTemp']] = scaler.fit_transform(df[['MinTemp', 'MaxTemp']])
df[['Rainfall', 'WindGustSpeed']] = scaler.fit_transform(df[['Rainfall', 'WindGustSpeed']])
df[['Sunshine', 'Evaporation']] = scaler.fit_transform(df[['Sunshine', 'Evaporation']])
df[['WindSpeed9am', 'WindSpeed3pm']] = scaler.fit_transform(df[['WindSpeed9am', 'WindSpeed3pm']])
df[['Pressure9am', 'Pressure3pm']] = scaler.fit_transform(df[['Pressure9am', 'Pressure3pm']])
df[['Cloud9am', 'Cloud3pm']] = scaler.fit_transform(df[['Cloud9am', 'Cloud3pm']])
df[['Temp9am', 'Temp3pm']] = scaler.fit_transform(df[['Temp9am', 'Temp3pm']])

In [44]:
df.sample(5)

Unnamed: 0,level_0,index,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
23848,23848,72333,10,-1.677636,-0.906549,-0.303702,-1.001863,-1.206887,5,-0.740704,...,-1.042119,0.025055,0.979083,-0.326882,0.703472,0.306283,0.986113,1.009919,-2.01052,-0.849901
14806,14806,45966,3,0.130166,1.245341,-0.303702,0.729622,0.54931,6,1.508996,...,-0.320699,1.200129,-1.667688,-1.41616,-0.483335,-0.770734,0.986113,0.632165,0.851872,1.34421
39830,39830,102989,14,0.17692,0.714542,-0.303702,0.729622,0.895228,1,-0.290764,...,-0.080225,0.025055,-1.073515,-1.119084,1.861332,1.732603,-0.44392,0.632165,0.212402,0.773741
38132,38132,100661,12,-0.275031,0.341547,-0.303702,-0.244338,0.948446,11,-0.290764,...,-0.080225,-0.797496,0.384909,-0.079319,0.110068,0.131632,-0.801428,0.254411,-0.000755,0.247154
40079,40079,103279,14,-0.462045,0.556736,-0.303702,0.567295,1.507236,9,-0.290764,...,0.400721,-1.502541,-1.0195,-1.515185,-0.092557,-0.363214,-0.086412,-0.87885,-0.640226,0.393428


In [42]:
df_train, df_test = train_test_split(df, shuffle=True, random_state=42, test_size=0.25)

In [None]:
# Задание 3: Построить свой DataLoader

class DataLoader:

    def __init__(self, X_data, y_data, batch_size=64, shuffle=True):
        self.X_data = X_data
        self.y_data = y_data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = list(self.X_data.index)

    def __getitem__(self, index):
        X_elem, y_elem = self.X_data[index], self.y_data[index]
        return (X_elem, y_elem)

    def __len__(self):
        # количество батчей
        pass

    def __iter__(self):
        return self

    def __next__(self):
        if self.shuffle == True:
            self.indexes = np.random.shuffle(self.indexes)

        for i in self.indexes:
            batch = []
            n = 0
            while n < self.batch_size:
                batch.append((self.X_data[i], self.y_data[i])
            yield batch

In [None]:
train_loader, test_loader = None, None

In [20]:
# Задание 4: Построить двухслойный перцептрон

class ShallowPerceptron(nn.Module):
    
    def __init__(self):
        super(ShallowPerceptron, self).__init__()
        
        # First nn.Linear shape: (n_features, hidden_size)
        self.layer_1 = None
        
        # Second nn.Linear shape: (hidden_size, hidden_size)
        self.layer_2 = None
        
        # Last nn.Linear shape: (hidden_size, 1)
        self.layer_out = None
        
        # Read about ReLU: https://www.kaggle.com/dansbecker/rectified-linear-units-relu-in-deep-learning
        self.relu = nn.ReLU()
        
        # Read about Dropout: https://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf
        self.dropout = nn.Dropout(p=0.1)

        
    def forward(self, inputs):
        # Каждый слой по сути является функцией, в которую передаются некоторые input
        # Очередность: layer_1 --> relu --> layer_2 --> relu --> dropout --> layer_out
        return x

In [23]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001


model = ShallowPerceptron()
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)


In [26]:
# Задание 5: написать цикл обучения

# Переводим модель в режим "тренировки"
model.train()


# Одна эпоха - один полный проход по данным 
for epoch in range(EPOCHS):
    
    # На каждой эпохе необходимо отслеживать суммарный лосс и считать метрику качества
    # В качестве метрики можно использовать F1 score
    epoch_loss = 0
    epoch_f1 = 0
    for X_batch, y_batch in train_loader:
        
        # Зануляем градиенты с предыдущего шага
        optimizer.zero_grad()
        
        # Делаем forward
        y_pred = model(X_batch)
        
        # Считаем лосс с помощью нашего criterion
        loss = None
        
        # Считаем метрику
        f1 = None
        
        # Делаем backpropagation
        loss.backward()
        
        # Изменяем веса
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_f1 += f1
        
    
    # Тут нужно вывести: Номер текущей эпохи | Лосс, усредненный на количество батчей | усредненный F1

In [None]:
# Задание 6: Провести оценку на тестовых данных


model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        y_test_pred = model(X_batch)
        # Нужно сохранять результаты

# Здесь считаем метрику на полученных предсказаниях
# Радуемся!