In [1]:
import os
import time
import librosa
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import trange,tqdm

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearnex import patch_sklearn, config_context

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('weather_training.csv')
df.set_index('date', inplace=True)

In [3]:
df.head()

Unnamed: 0_level_0,province,max,min,wind,wind_d,rain,humidi,cloud,pressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-01,Bac Lieu,27,22,17,NNE,6.9,90,71,1010
2010-01-01,Bac Lieu,31,25,20,ENE,0.0,64,24,1010
2011-01-01,Bac Lieu,29,24,14,E,0.0,75,45,1008
2012-01-01,Bac Lieu,30,24,30,E,0.0,79,52,1012
2013-01-01,Bac Lieu,31,25,20,ENE,0.0,70,24,1010


In [4]:
num = ['max', 'min', 'wind' ,'humidi', 'cloud', 'pressure']
cat = ['province','wind_d']

In [5]:
X = df.drop(['rain'], axis=1)
y = df['rain'].astype('str')
X

Unnamed: 0_level_0,province,max,min,wind,wind_d,humidi,cloud,pressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009-01-01,Bac Lieu,27,22,17,NNE,90,71,1010
2010-01-01,Bac Lieu,31,25,20,ENE,64,24,1010
2011-01-01,Bac Lieu,29,24,14,E,75,45,1008
2012-01-01,Bac Lieu,30,24,30,E,79,52,1012
2013-01-01,Bac Lieu,31,25,20,ENE,70,24,1010
...,...,...,...,...,...,...,...,...
2016-12-28,Soc Trang,28,23,8,SSW,75,50,1011
2017-12-28,Soc Trang,30,24,21,ENE,81,50,1011
2018-12-28,Soc Trang,26,24,9,ENE,91,75,1009
2019-12-28,Soc Trang,30,23,11,E,74,6,1012


In [6]:
y

date
2009-01-01    6.9
2010-01-01    0.0
2011-01-01    0.0
2012-01-01    0.0
2013-01-01    0.0
             ... 
2016-12-28    0.0
2017-12-28    7.2
2018-12-28    1.3
2019-12-28    0.0
2020-12-28    0.7
Name: rain, Length: 181960, dtype: object

In [7]:
for index in range(len(y)):
    if y[index] != '0.0':
        y[index] = 'Rain'
    else:
        y[index] = 'No Rain'

In [8]:
min_max_scaler = MinMaxScaler()
X[num] = min_max_scaler.fit_transform(X[num])

label_encoder = LabelEncoder()
for column in cat:
    X[column] = label_encoder.fit_transform(X[column])

X.head()

Unnamed: 0_level_0,province,max,min,wind,wind_d,humidi,cloud,pressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009-01-01,0,0.547619,0.666667,0.301887,5,0.87013,0.71,0.44
2010-01-01,0,0.642857,0.766667,0.358491,1,0.532468,0.24,0.44
2011-01-01,0,0.595238,0.733333,0.245283,0,0.675325,0.45,0.4
2012-01-01,0,0.619048,0.733333,0.54717,0,0.727273,0.52,0.48
2013-01-01,0,0.642857,0.766667,0.358491,1,0.61039,0.24,0.44


In [9]:
y = label_encoder.fit_transform(y)
y

array([1, 0, 0, ..., 1, 0, 1])

In [10]:
X

Unnamed: 0_level_0,province,max,min,wind,wind_d,humidi,cloud,pressure
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009-01-01,0,0.547619,0.666667,0.301887,5,0.870130,0.71,0.44
2010-01-01,0,0.642857,0.766667,0.358491,1,0.532468,0.24,0.44
2011-01-01,0,0.595238,0.733333,0.245283,0,0.675325,0.45,0.40
2012-01-01,0,0.619048,0.733333,0.547170,0,0.727273,0.52,0.48
2013-01-01,0,0.642857,0.766667,0.358491,1,0.610390,0.24,0.44
...,...,...,...,...,...,...,...,...
2016-12-28,27,0.571429,0.700000,0.132075,11,0.675325,0.50,0.46
2017-12-28,27,0.619048,0.733333,0.377358,1,0.753247,0.50,0.46
2018-12-28,27,0.523810,0.733333,0.150943,1,0.883117,0.75,0.42
2019-12-28,27,0.619048,0.700000,0.188679,0,0.662338,0.06,0.48


In [11]:
X = np.array(X)
y = np.array(y)
y = y.reshape(-1,1)
X = X.astype('float32')
y = y.astype('float32')

print("Shape of X: {}".format(X.shape))
print("Shape of y: {}".format(y.shape))

Shape of X: (181960, 8)
Shape of y: (181960, 1)


In [12]:
SEED = 1337

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

print("Shape of X_train: {}".format(X_train.shape))
print("Shape of y_train: {}".format(Y_train.shape))
print("Shape of X_test: {}".format(X_test.shape))
print("Shape of y_test: {}".format(Y_test.shape))

Shape of X_train: (127372, 8)
Shape of y_train: (127372, 1)
Shape of X_test: (54588, 8)
Shape of y_test: (54588, 1)


In [15]:
n_estimators = [100, 200, 500, 1000, 2000] # [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)] # Nhiều quá ->
max_features = ['sqrt', 'log2', None] #Trong bai cua co la 2, 4, 5 ,9, 10, 15, 18
min_samples_split = [2, 5, 10] # Trong bai la 5, 11 #Nên lấy số lẻ

max_depth = [2, 5, 8, 10, None]
# [int(x) for x in np.linspace(5, 110, num = 22)] #-> 2, 5, 8, 10, None
# max_depth.append(None)
criterion = ['gini', 'entropy', 'log_loss']

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]


Random_forest_search = {'n_estimators': n_estimators,
                       'max_features': max_features,
                       'max_depth': max_depth,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'bootstrap': bootstrap,
                       'criterion': criterion}
print(Random_forest_search)

Extra_trees_search={'n_estimators': [int(x) for x in np.arange(50, 126, 5)],
                    'max_features': [int(x) for x in np.arange(50, 401, 50)],
                    'min_samples_leaf':  [int(x) for x in np.arange(20, 51, 5)],
                    'min_samples_split': [int(x) for x in np.arange(15, 36, 5)],
                    }
print(Extra_trees_search)

XGBoost_search = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
print(XGBoost_search)

{'n_estimators': [100, 200, 500, 1000, 2000], 'max_features': ['sqrt', 'log2', None], 'max_depth': [2, 5, 8, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy', 'log_loss']}
{'n_estimators': [50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125], 'max_features': [50, 100, 150, 200, 250, 300, 350, 400], 'min_samples_leaf': [20, 25, 30, 35, 40, 45, 50], 'min_samples_split': [15, 20, 25, 30, 35]}
{'min_child_weight': [1, 5, 10], 'gamma': [0.5, 1, 1.5, 2, 5], 'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [3, 4, 5]}


In [16]:
classifiers = [['ExtraTreesClassifier', ExtraTreesClassifier(random_state=SEED), Extra_trees_search],
              ['RandomForestClassifier', RandomForestClassifier(random_state=SEED), Random_forest_search],
                ['XGBClassifier', XGBClassifier(random_state=SEED), XGBoost_search]]

In [17]:
def search_for_best_models(model, parameter_search, scoring='accuracy', verbose=5):
    search = RandomizedSearchCV(model, parameter_search, n_iter=10, scoring=scoring, verbose = verbose, n_jobs=1, cv=None, random_state=SEED)
    search.fit(X_train, Y_train) 
    best_model = search.best_estimator_
    optimal_params = search.best_params_
    Y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)

    print("Model: {}".format(model.__class__.__name__))
    print("Best parameters: {}".format(search.best_params_))
    print("Best cross-validation score: {:.2f}".format(search.best_score_))
    print("Test set score: {:.2f}".format(accuracy))
    print("-"*50)

    return best_model, search, accuracy, optimal_params

def plot_roc_curve(Y_test, Y_pred, name, title):
    fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)

    auc = roc_auc_score(Y_test, Y_pred)

    plt.plot(fpr, tpr, label="{0} (a = {1:.2f})".format(name.replace("Classifier", ""), auc))
    plt.plot([0, 1], [0, 1], linestyle="dashed")
    plt.title(title)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.savefig("ML__RandomSearchCV_ROC_Curve.png")
    # plt.show()

def plot_confusion_matrix(cm, title):
    df_cm = pd.DataFrame(cm, index = [i for i in ['Queen', 'Queenless']],
                    columns = [i for i in ['Queen', 'Queenless']])
    df_cm_normed = df_cm/df_cm.sum()
    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=True, fmt="", )
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [18]:
# global cv
# cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=SEED)
# global X_train, Y_train, X_test, Y_test
# print(X_train[:5])

[[17.          0.5         0.53333336  0.1509434   9.          0.74025977
   0.2         0.62      ]
 [26.          0.64285713  0.8333333   0.18867925  9.          0.6883117
   0.49        0.44      ]
 [15.          0.6904762   0.6333333   0.0754717   9.          0.64935064
   0.55        0.4       ]
 [19.          0.71428573  0.76666665  0.24528302  2.          0.53246754
   0.14        0.46      ]
 [20.          0.5         0.56666666  0.13207547  0.          0.7532467
   0.65        0.56      ]]


In [21]:
global X_train, Y_train, X_test, Y_test

In [22]:
patch_sklearn()

def evaluate_classifiers(classifiers):
    models = []
    Accuracy_set = pd.DataFrame(index=None, columns=['Model','Accuracy(Train)','Accuracy(Test)','F1(Train)','F1(Test)', 'Precision(Train)','Precision(Test)', 'Recall(Train)','Recall(Test)', 'Log_loss(Train)','Log_loss(Test)', 'Train_Time(s)', 'Confusion_Matrix(Test)', 'Optimal_Params'])
    for i in tqdm(range(len(classifiers))):
        name = classifiers[i][0]
        model = classifiers[i][1]
        params = classifiers[i][2]
        time_start = time.time()
        best_model, search, accuracy, optimal_params = search_for_best_models(model, params)
        time_end = time.time()

        Y_train_predicted = best_model.predict(X_train)
        Y_test_predicited = best_model.predict(X_test)
        
        plot_roc_curve(Y_test, Y_test_predicited, name, "Machine Learning Algorithms Roc-Curve")

        accuracy_train = accuracy_score(Y_train, Y_train_predicted)
        accuracy_test = accuracy_score(Y_test, Y_test_predicited)

        f1_Score_train = f1_score(Y_train, Y_train_predicted,average='micro')
        f1_Score_test = f1_score(Y_test, Y_test_predicited,average='micro')

        precision_score_train = precision_score(Y_train, Y_train_predicted,average='micro')
        precision_score_test = precision_score(Y_test, Y_test_predicited,average='micro')

        recall_score_train = recall_score(Y_train, Y_train_predicted,average='micro')
        recall_score_test = recall_score(Y_test, Y_test_predicited,average='micro')

        log_loss_train = log_loss(Y_train, best_model.predict_proba(X_train))
        log_loss_test = log_loss(Y_test, best_model.predict_proba(X_test))
        
        cf_matrix = confusion_matrix(Y_test, Y_test_predicited)

        train_time = time_end - time_start
        
        # store the models
        models.append((name,accuracy_test,best_model))

        Accuracy_set = Accuracy_set.append(pd.Series({'Model':name, 'Accuracy(Train)':accuracy_train,'Accuracy(Test)':accuracy_test,'F1(Train)':f1_Score_train,'F1(Test)':f1_Score_test,'Precision(Train)':precision_score_train,'Precision(Test)':precision_score_test,'Recall(Train)':recall_score_train,'Recall(Test)':recall_score_test,'Log_loss(Train)':log_loss_train,'Log_loss(Test)':log_loss_test,'Train_Time(s)':train_time, 'Confusion_Matrix(Test)':cf_matrix, 'Optimal_Params':optimal_params}),ignore_index=True)
        time.sleep(0.1)
    return Accuracy_set, models

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [23]:
Accuracy_set, models = evaluate_classifiers(classifiers)

  0%|          | 0/3 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_features=300, min_samples_leaf=45, min_samples_split=20, n_estimators=85;, score=0.893 total time=   9.9s
[CV 2/5] END max_features=300, min_samples_leaf=45, min_samples_split=20, n_estimators=85;, score=0.889 total time=  10.4s
[CV 3/5] END max_features=300, min_samples_leaf=45, min_samples_split=20, n_estimators=85;, score=0.891 total time=   9.0s
[CV 4/5] END max_features=300, min_samples_leaf=45, min_samples_split=20, n_estimators=85;, score=0.890 total time=   8.1s
[CV 5/5] END max_features=300, min_samples_leaf=45, min_samples_split=20, n_estimators=85;, score=0.889 total time=   7.8s
[CV 1/5] END max_features=50, min_samples_leaf=30, min_samples_split=20, n_estimators=115;, score=0.894 total time=  11.0s
[CV 2/5] END max_features=50, min_samples_leaf=30, min_samples_split=20, n_estimators=115;, score=0.892 total time=  11.1s
[CV 3/5] END max_features=50, min_samples_leaf=30, min_samples_split=20, n_est

In [None]:
Accuracy_set.sort_values(by='Accuracy(Test)').style.background_gradient(cmap= plt.cm.Blues)

In [None]:
for index in range(len(Accuracy_set[0])):
    print("Accuracy: ", Accuracy_set[2][index], "Optimal_Params: ", Accuracy_set["Optimal_Params"][index])

In [None]:
for index in range(len(Accuracy_set['Accuracy(Test)'])):
    plot_confusion_matrix(Accuracy_set['Confusion_Matrix(Test)'][index], "{} Confusion matrix".format(Accuracy_set['Model'][index]))