Podstawowa część zadania polega na wytrenowaniu dowolnych 3 klasyfikatorów i sprawdzeniu ich skuteczności. Raport powinien zawierać:

    podział danych na zbiór treningowy i testowy
    nauczenie 3 dowolnych klasyfikatorów
    w każdym klasyfikatorze należy wybrać minimum jeden hiperparametr (nie trzeba go stroić - wystarczy się zapoznać z parametrami modelu)
    wykorzystanie przynajmniej 3 miar oceny jakości klasyfikatorów i wybór najlepszego z nich.


In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score


import category_encoders as ce

from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

random.seed(4)

In [2]:
csv = pd.read_csv('australia.csv')
df = csv.copy()
df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,17.9,35.2,0.0,12.0,12.3,48.0,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,0,0
1,18.4,28.9,0.0,14.8,13.0,37.0,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,0,0
2,19.4,37.6,0.0,10.8,10.6,46.0,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,0,0
3,21.9,38.4,0.0,11.4,12.2,31.0,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,0,0
4,24.2,41.0,0.0,11.2,8.4,35.0,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56415,19.3,33.4,0.0,6.0,11.0,35.0,9.0,20.0,63.0,32.0,1013.9,1010.5,0.0,1.0,24.5,32.3,0,0
56416,21.2,32.6,0.0,7.6,8.6,37.0,13.0,11.0,56.0,28.0,1014.6,1011.2,7.0,0.0,24.8,32.0,0,0
56417,20.7,32.8,0.0,5.6,11.0,33.0,17.0,11.0,46.0,23.0,1015.3,1011.8,0.0,0.0,24.8,32.1,0,0
56418,19.5,31.8,0.0,6.2,10.6,26.0,9.0,17.0,62.0,58.0,1014.9,1010.7,1.0,1.0,24.8,29.2,0,0


In [3]:
df['RainTomorrow'].value_counts()

0    43993
1    12427
Name: RainTomorrow, dtype: int64

Niezbalansowany target

In [4]:
df[df.RainTomorrow==0] = resample(df[df.RainTomorrow==0], 
                                 replace=False,    
                                 n_samples= df.shape[0]//4,     
                                 random_state=4)

In [5]:
df = pd.concat([df[df.RainTomorrow==1],df[df.RainTomorrow==0]])

In [6]:
df['RainTomorrow'].value_counts()

0.0    14105
1.0    12427
Name: RainTomorrow, dtype: int64

In [7]:
df.shape

(26532, 18)

# podział danych na zbiór treningowy i testowy

In [8]:
y = np.array(df['RainTomorrow'])
X = df.drop(['RainTomorrow'],axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# nauczenie 3 dowolnych klasyfikatorów
w każdym klasyfikatorze należy wybrać minimum jeden hiperparametr

In [10]:
lr = LogisticRegression(max_iter=2004)

lr.fit(X_train,y_train)
y_hat = lr.predict(X_test)
print('y:     ' + str(y_test[0:20]) + '\ny_hat: ' + str(y_hat[0:20]))

y:     [1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1.]
y_hat: [1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]


In [11]:
tree1 = DecisionTreeClassifier(max_features=.4, max_depth=2)

tree1.fit(X_train,y_train)
y_hat = lr.predict(X_test)
print('y:     ' + str(y_test[0:20]) + '\ny_hat: ' + str(y_hat[0:20]))

y:     [1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1.]
y_hat: [1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]


In [12]:
nb = GaussianNB(var_smoothing=1e-5)

nb.fit(X_train,y_train)
y_hat = lr.predict(X_test)
print('y:     ' + str(y_test[0:20]) + '\ny_hat: ' + str(y_hat[0:20]))

y:     [1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1.]
y_hat: [1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]


# wykorzystanie przynajmniej 3 miar oceny jakości klasyfikatorów i wybór najlepszego z nich.

Regresja Logistyczna

In [13]:
acc = cross_val_score(lr, X_train, y_train, scoring='accuracy', cv = 4, n_jobs=2)
acc

array([0.79431402, 0.78505277, 0.80185225, 0.79582167])

In [14]:
f1 = cross_val_score(lr, X_train, y_train, scoring='f1', cv = 4, n_jobs=2)
f1

array([0.77545262, 0.76649509, 0.78474497, 0.77578051])

In [15]:
auc = cross_val_score(lr, X_train, y_train, scoring='roc_auc', cv = 4, n_jobs=2)
auc

array([0.87818551, 0.8716716 , 0.87688966, 0.88604221])

In [16]:
print("Acc: ",acc.mean())
print("f1: ",f1.mean())
print("AUC: ",auc.mean())

Acc:  0.7942601766099504
f1:  0.7756182971780374
AUC:  0.8781972465955028


In [19]:
d = {'regresja': [acc.mean(), f1.mean(),auc.mean()], 'drzewa': [0, 0, 0], 'Bayes': [0, 0, 0]}
sc = pd.DataFrame(d, index=['acc', 'f1', 'AUC'])

Drzewa decyzyjne

In [20]:
acc = cross_val_score(tree1, X_train, y_train, scoring='accuracy', cv = 4, n_jobs=2)
acc

array([0.73702348, 0.72668533, 0.6956709 , 0.73939263])

In [21]:
f1 = cross_val_score(tree1, X_train, y_train, scoring='f1', cv = 4, n_jobs=2)
f1

array([0.73857024, 0.73004597, 0.59011164, 0.58260625])

In [22]:
auc = cross_val_score(tree1, X_train, y_train, scoring='roc_auc', cv = 4, n_jobs=2)
auc

array([0.80769081, 0.76803741, 0.78858446, 0.79360141])

In [23]:
print("Acc: ",acc.mean())
print("f1: ",f1.mean())
print("AUC: ",auc.mean())

Acc:  0.7246930863665734
f1:  0.6603335248632706
AUC:  0.789478520464241


In [24]:
sc['drzewa']= [acc.mean(), f1.mean(),auc.mean()]

Klasyfikator Bayesowski

In [25]:
acc = cross_val_score(nb, X_train, y_train, scoring='accuracy', cv = 4, n_jobs=2)
acc

array([0.77191471, 0.7622227 , 0.76825328, 0.76502261])

In [26]:
f1 = cross_val_score(nb, X_train, y_train, scoring='f1', cv = 4, n_jobs=2)
f1

array([0.75400697, 0.74609016, 0.75150115, 0.74455631])

In [27]:
auc = cross_val_score(nb, X_train, y_train, scoring='roc_auc', cv = 4, n_jobs=2)
auc

array([0.84814359, 0.83819725, 0.84833194, 0.85240839])

In [28]:
print("Acc: ",acc.mean())
print("f1: ",f1.mean())
print("AUC: ",auc.mean())

Acc:  0.7668533275899203
f1:  0.7490386474417323
AUC:  0.8467702919204902


In [29]:
sc['Bayes']= [acc.mean(), f1.mean(),auc.mean()]

In [30]:
sc

Unnamed: 0,regresja,drzewa,Bayes
acc,0.79426,0.724693,0.766853
f1,0.775618,0.660334,0.749039
AUC,0.878197,0.789479,0.84677


Najlepiej poradziła sobie regresja liniowa