In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Remote CSV with the Australian weather observations used in this notebook.
rain_url = "https://raw.githubusercontent.com/mini-pw/2021L-WUM/main/Prace_domowe/Praca_domowa3/australia.csv"
rain_df = pd.read_csv(rain_url)

# Preview the first rows.
rain_df.head()

In [None]:
# One histogram per column, drawn on a single large figure.
hist_figsize = (18, 12)
rain_df.hist(figsize = hist_figsize)
plt.show()

Widać dużą dysproporcję między dniami deszczowymi i bezdeszczowymi; uwzględnimy ją przy dzieleniu danych na zbiory.

### Podział na zbiór treningowy i testowy

In [None]:
# Features: every column except the target.
X = rain_df.drop(columns = 'RainTomorrow')

# Target as a 1-D Series. Selecting with a list (rain_df[['RainTomorrow']])
# yields a one-column DataFrame, which scikit-learn classifiers accept only
# with a DataConversionWarning about a column-vector y.
y = rain_df['RainTomorrow']

In [None]:
# Hold out 20% of the rows; stratify on the target so both parts keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    stratify = y,
    random_state = 29,
)

# Report the size of every part of the split.
for label, part in (
    ('X_train shape: ', X_train),
    ('X_test shape:  ', X_test),
    ('y_train shape: ', y_train),
    ('y_test shape:  ', y_test),
):
    print(f'{label}{part.shape}')

## Uczenie modeli

In [None]:
# Stratified folds keep the class proportions of the imbalanced target equal
# in every fold; a plain KFold can produce folds with very few positive
# samples, which makes per-fold precision and recall noisy.
split = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 29)

def results_to_df(results, model, measures = ('accuracy', 'precision', 'roc_auc', 'recall')):
    """Summarise cross-validation scores as a two-column table.

    Parameters
    ----------
    results : mapping
        For every requested measure it must hold an array-like of scores
        under the key ``'test_<measure>'``. Other keys are ignored.
    model : str
        Header used for the column with the averaged scores.
    measures : sequence of str, optional
        Names of the measures to report, in output row order. Defaults to
        accuracy, precision, roc_auc and recall.

    Returns
    -------
    pandas.DataFrame
        One row per measure with columns ``['measure', model]``; the second
        column holds the mean of the corresponding scores.

    Raises
    ------
    KeyError
        If ``results`` lacks a ``'test_<measure>'`` entry for a requested measure.
    """
    # Each row is labelled with the measure name itself, so the label always
    # matches the key it was read from (previously ROC AUC was shown as 'roc_aux').
    data = [[name, np.mean(results[f'test_{name}'])] for name in measures]
    return pd.DataFrame(data, columns = ['measure', model])

### Regresja logistyczna

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression fitted with the 'saga' solver.
lr = LogisticRegression(
    penalty = 'l1',
    solver = 'saga',
    random_state = 29,
    max_iter = 1000,
)

# Standardise the features, then fit the classifier.
lr_steps = [('standardscaler', StandardScaler()), ('lr', lr)]
lr_pipe = Pipeline(steps = lr_steps)

# Cross-validate on the training part only, collecting four metrics.
lr_scoring = ['accuracy', 'precision', 'roc_auc', 'recall']
lr_results = cross_validate(
    lr_pipe, X_train, y_train,
    cv = split,
    scoring = lr_scoring,
    n_jobs = -1,
)

lr_df = results_to_df(lr_results, 'LogisticRegression')

In [None]:
lr_df

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 300 trees, each limited to depth 10.
randomForest = RandomForestClassifier(
    n_estimators = 300,
    max_depth = 10,
    random_state = 29,
)

# Standardise the features, then fit the classifier.
randomForest_steps = [('standardscaler', StandardScaler()), ('rf', randomForest)]
randomForest_pipe = Pipeline(steps = randomForest_steps)

# Cross-validate on the training part only, collecting four metrics.
randomForest_scoring = ['accuracy', 'precision', 'roc_auc', 'recall']
randomForest_results = cross_validate(
    randomForest_pipe, X_train, y_train,
    cv = split,
    scoring = randomForest_scoring,
    n_jobs = -1,
)

randomForest_df = results_to_df(randomForest_results, 'RandomForest')

In [None]:
randomForest_df

### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 10-nearest-neighbours classifier using a ball tree with leaf size 40.
knn = KNeighborsClassifier(
    n_neighbors = 10,
    algorithm = 'ball_tree',
    leaf_size = 40,
)

# Standardise the features, then fit the classifier.
knn_steps = [('standardscaler', StandardScaler()), ('knn', knn)]
knn_pipe = Pipeline(steps = knn_steps)

# Cross-validate on the training part only, collecting four metrics.
knn_scoring = ['accuracy', 'precision', 'roc_auc', 'recall']
knn_results = cross_validate(
    knn_pipe, X_train, y_train,
    cv = split,
    scoring = knn_scoring,
    n_jobs = -1,
)

knn_df = results_to_df(knn_results, 'KNeighbours')

In [None]:
knn_df

## Ocena modeli

In [None]:
# Join the per-model tables on the shared 'measure' column:
# one row per metric, one column per model.
all_results = (
    lr_df
    .merge(randomForest_df, on = 'measure')
    .merge(knn_df, on = 'measure')
)

all_results

In [None]:
# Reshape to long format: one row per (measure, model) pair, with the model
# name in 'variable' and the mean score in 'value'.
model_columns = ['LogisticRegression', 'RandomForest', 'KNeighbours']
all_results_melt = all_results.melt(id_vars = ['measure'], value_vars = model_columns)

# Grouped bars: one group per model, one bar per measure.
plt.figure(figsize = (9, 6))
results_ax = sns.barplot(data = all_results_melt, x = 'variable', y = 'value', hue = 'measure')
results_ax.set(xlabel = '', ylabel = 'mean value')

# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor = (1.01, 1), borderaxespad = 0)
plt.show()

Dla każdego z modeli średnie wartości miar są na podobnym poziomie, więc ciężko wybrać najlepszy spośród nich.

Accuracy we wszystkich modelach jest bardzo wysokie, jednak sprawdza ono tylko poprawność odpowiedzi, która należy do zbioru {0, 1}, przy czym 0 jest znacznie więcej niż 1. Nietrudno więc trafić w większość poprawnych odpowiedzi. ROC AUC daje równie wysokie wyniki.

Porównując Precision i Recall, można zauważyć dysproporcję: dni bez deszczu są częściej uznawane za deszczowe niż dni deszczowe za dni bez deszczu. Te dwie miary razem z accuracy dają dobry obraz działania modelu.