In [241]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

Do đây là phần cải tiến nên mình sẽ bắt đầu từ step3

## 3. Tiền xử lý dữ liệu

### Load dữ liệu

In [242]:

df = pd.read_csv('weatherAUS.csv')
df.shape

(142193, 24)

In [243]:
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


### Tách dữ liệu

In [244]:
df.drop(['RISK_MM'], axis=1, inplace=True)

In [245]:
X = df.drop(['RainTomorrow'], axis=1)
y = df.RainTomorrow

In [246]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((113754, 22), (28439, 22))

### Xử lý dữ liệu bị khuyết

In [247]:
(X_train.isnull().sum()/X_train.shape[0]).sort_values(ascending=False)

Sunshine         0.477469
Evaporation      0.428275
Cloud3pm         0.402342
Cloud9am         0.379213
Pressure9am      0.098564
Pressure3pm      0.098335
WindDir9am       0.070134
WindGustDir      0.065114
WindGustSpeed    0.064763
WindDir3pm       0.026443
Humidity3pm      0.025406
Temp3pm          0.019085
WindSpeed3pm     0.018408
Humidity9am      0.012738
RainToday        0.010013
Rainfall         0.010013
WindSpeed9am     0.009547
Temp9am          0.006505
MinTemp          0.004351
MaxTemp          0.002321
Location         0.000000
Date             0.000000
dtype: float64

#### Xử lý date
Tách Date ra ngày tháng năm trước khi xử lý

In [248]:
X_train['Date'] = pd.to_datetime(X_train['Date'])
X_train['day'] = X_train['Date'].dt.day
X_train['month'] = X_train['Date'].dt.month
X_train['year'] = X_train['Date'].dt.year

X_test['Date'] = pd.to_datetime(X_test['Date'])
X_test['day'] = X_test['Date'].dt.day
X_test['month'] = X_test['Date'].dt.month
X_test['year'] = X_test['Date'].dt.year

X_train.drop(['Date'], axis=1, inplace=True)
X_test.drop(['Date'], axis=1, inplace=True)

#### Fill NA cho các biến số bằng Median

In [249]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.to_list()
num_cols

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'day',
 'month',
 'year']

In [250]:
from feature_engine.imputation import MeanMedianImputer

imputer = MeanMedianImputer(imputation_method='median')
imputer.fit(X_train[num_cols])
X_train[num_cols] = imputer.transform(X_train[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])

#### Fill NA cho các biến Categories bằng Mode

In [251]:
cate_cols = X_train.select_dtypes(include=['object']).columns.to_list()
cate_cols

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [252]:
from feature_engine.imputation import CategoricalImputer

imputer = CategoricalImputer(imputation_method='frequent')
imputer.fit(X_train[cate_cols])
X_train[cate_cols] = imputer.transform(X_train[cate_cols])
X_test[cate_cols] = imputer.transform(X_test[cate_cols])


In [253]:
assert X_train.isnull().sum().sum() == 0
assert X_test.isnull().sum().sum() == 0

### Xử lý outlier cho biến số

In [254]:
def max_value(df3, variable, top):
    return np.where(df3[variable]>top, top, df3[variable])

for df3 in [X_train, X_test]:
    df3['Rainfall'] = max_value(df3, 'Rainfall', 3.2)
    df3['Evaporation'] = max_value(df3, 'Evaporation', 21.8)
    df3['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 55)
    df3['WindSpeed3pm'] = max_value(df3, 'WindSpeed3pm', 57)

In [255]:
for var in ['Rainfall', 'Evaporation', 'WindSpeed9am', 'WindSpeed3pm']:
    print(var, X_train[var].max(), X_test[var].max())

Rainfall 3.2 3.2
Evaporation 21.8 21.8
WindSpeed9am 55.0 55.0
WindSpeed3pm 57.0 57.0


### Mã hoá biến hạng mục

#### Mã hoá target

In [256]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

#### Mã hoá 'RainToday'

In [257]:
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['RainToday'])
X_train_target = encoder.fit_transform(X_train)
X_test_target = encoder.transform(X_test)
X_train_target.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday_0,RainToday_1,day,month,year
110803,Witchcliffe,13.9,22.6,0.2,4.8,8.5,S,41.0,SSE,S,...,1013.4,5.0,5.0,18.8,20.4,0,1,25,4,2014
87289,Cairns,22.4,29.4,2.0,6.0,6.3,ENE,33.0,SSE,SE,...,1013.1,7.0,5.0,26.4,27.5,1,0,2,11,2015
134949,AliceSprings,9.7,36.2,0.0,11.4,12.3,E,31.0,NE,N,...,1013.6,1.0,1.0,28.5,35.0,0,1,19,10,2014
85553,Cairns,20.5,30.1,0.0,8.8,11.1,ESE,37.0,SSE,E,...,1010.8,2.0,3.0,27.3,29.4,0,1,30,10,2010
16110,Newcastle,16.8,29.2,0.0,4.8,8.5,W,39.0,N,SE,...,1015.2,5.0,8.0,22.2,27.0,0,1,8,11,2012


In [258]:
X_train_target.shape, X_test_target.shape

((113754, 25), (28439, 25))

In [259]:
cate_cols = X_train_target.select_dtypes(include=['object']).columns.to_list()
cate_cols

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

#### One-hot các biến category còn lại

In [260]:
from feature_engine.encoding import OneHotEncoder

ohe_enc = OneHotEncoder(
    top_categories=None,
    variables=cate_cols,
    drop_last=False)
ohe_enc.fit(X_train_target)
X_train_encode = ohe_enc.transform(X_train_target)
X_test_endcode = ohe_enc.transform(X_test_target)

In [261]:
X_train_encode.shape, X_test_endcode.shape

((113754, 118), (28439, 118))

In [262]:
X_train_encode.select_dtypes(include=['object']).columns.to_list()

[]

In [263]:
X_test_endcode.select_dtypes(include=['object']).columns.to_list()

[]

In [264]:
X_train_encode.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_ESE,WindDir3pm_SSW,WindDir3pm_ENE,WindDir3pm_SW,WindDir3pm_NNE,WindDir3pm_WNW,WindDir3pm_SSE,WindDir3pm_NE,WindDir3pm_NW,WindDir3pm_W
110803,13.9,22.6,0.2,4.8,8.5,41.0,20.0,28.0,65.0,55.0,...,0,0,0,0,0,0,0,0,0,0
87289,22.4,29.4,2.0,6.0,6.3,33.0,7.0,19.0,71.0,59.0,...,0,0,0,0,0,0,0,0,0,0
134949,9.7,36.2,0.0,11.4,12.3,31.0,15.0,11.0,6.0,2.0,...,0,0,0,0,0,0,0,0,0,0
85553,20.5,30.1,0.0,8.8,11.1,37.0,22.0,19.0,59.0,53.0,...,0,0,0,0,0,0,0,0,0,0
16110,16.8,29.2,0.0,4.8,8.5,39.0,0.0,7.0,72.0,53.0,...,0,0,0,0,0,0,0,0,0,0


#### Rời rạc hoá các biến số

In [265]:
from feature_engine.discretisation import EqualFrequencyDiscretiser

disc = EqualFrequencyDiscretiser(q=10, variables=num_cols)
disc.fit(X_train_encode[num_cols])
X_train_encode[num_cols] = disc.transform(X_train_encode[num_cols])
X_test_endcode[num_cols] = disc.transform(X_test_endcode[num_cols])

In [266]:
X_train_encode.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_ESE,WindDir3pm_SSW,WindDir3pm_ENE,WindDir3pm_SW,WindDir3pm_NNE,WindDir3pm_WNW,WindDir3pm_SSE,WindDir3pm_NE,WindDir3pm_NW,WindDir3pm_W
110803,6,4,0,2,2,5,7,8,3,5,...,0,0,0,0,0,0,0,0,0,0
87289,9,7,2,3,1,2,1,4,5,6,...,0,0,0,0,0,0,0,0,0,0
134949,3,9,0,5,5,2,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
85553,8,8,0,5,4,4,8,4,2,5,...,0,0,0,0,0,0,0,0,0,0
16110,7,7,0,2,2,4,0,0,5,5,...,0,0,0,0,0,0,0,0,0,0


In [267]:
X_train_encode.shape, X_test_endcode.shape

((113754, 118), (28439, 118))

#### Scale dữ liệu số

In [268]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_encode[num_cols] = scaler.fit_transform(X_train_encode[num_cols])
X_test_endcode[num_cols] = scaler.transform(X_test_endcode[num_cols])

## 4. Feature Selection

In [269]:
# Split data
X_train = X_train_encode.copy()

X_test, X_valid, y_test, y_valid = train_test_split(
    X_test_endcode, y_test, test_size=0.5, random_state=0)

#### Lựa chọn đặc trưng bằng Random Forest

In [270]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

sel_ = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=10))
sel_.fit(X_train, y_train)

random_forest_X_train_set = sel_.transform(X_train)
random_forest_X_valid_set = sel_.transform(X_valid)
random_forest_X_test_set = sel_.transform(X_test)

#### Lựa chọn đặc trưng bằng ROC-AUC

In [271]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

roc_values = []

for feature in X_train.columns:

    # train a decision tree classifier
    clf = DecisionTreeClassifier()
    clf.fit(X_train[feature].values.reshape(-1, 1), y_train)

    # obtain the predictions
    y_scored = clf.predict_proba(X_valid[feature].values.reshape(-1, 1))

    # calculate and store the roc-auc
    roc_values.append(roc_auc_score(y_valid, y_scored[:, 1]))

roc_values = pd.Series(roc_values)
roc_values.index = X_train.columns
selected_feat_roc = roc_values[roc_values > 0.51].index
selected_feat_roc

roc_X_train_set = X_train[selected_feat_roc]
roc_X_valid_set = X_valid[selected_feat_roc]
roc_X_test_set = X_test[selected_feat_roc]

#### Lựa chọn đặc trưng bằng đệ quy

- Loại bỏ đặc trưng bằng đệ quy

In [272]:
from feature_engine.selection import RecursiveFeatureElimination
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
sel_ = RecursiveFeatureElimination(
    variables=None, 
    estimator = model, 
    scoring = 'roc_auc', # the metric we want to evalute
    threshold = 0.0005, # the maximum performance drop allowed to remove a feature
    cv=2, # cross-validation
)
sel_.fit(X_train, y_train)

recursive_ellimination_selected_X_train_set = sel_.transform(X_train)
recursive_ellimination_selected_X_valid_set = sel_.transform(X_valid)
recursive_ellimination_selected_X_test_set = sel_.transform(X_test)

- Thêm đặc trưng bằng đệ quy

In [273]:
from feature_engine.selection import RecursiveFeatureAddition

model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
sel_ = RecursiveFeatureAddition(
    variables=None,
    estimator=model,
    scoring='roc_auc',
    threshold=0.0005,
    cv=3,)
sel_.fit(X_train, y_train)

recursive_addition_selected_X_train_set = sel_.transform(X_train)
recursive_addition_selected_X_valid_set = sel_.transform(X_valid)
recursive_addition_selected_X_test_set = sel_.transform(X_test)

## 5. Training Model

In [274]:
from sklearn.metrics import f1_score
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [275]:
def print_analysis_result(model):
    
    print("Model: ", model)
    
    print()
    print("All features:")
    print('='*40)
    print("Train")
    model.fit(X_train, y_train)
    print("F1 score:", f1_score(y_train, model.predict(X_train)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(X_valid)))
    
    print()
    print("Select features with Random Forest:")
    print('='*40)
    print("Train")
    model.fit(random_forest_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(random_forest_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(random_forest_X_valid_set)))

    print()
    print("Select features with ROC-AUC:")
    print('='*40)
    print("Train")
    model.fit(roc_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(roc_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(roc_X_valid_set)))
    
    print()
    print("Select features with RecursiveFeatureElimination:")
    print('='*40)
    print("Train")
    model.fit(recursive_ellimination_selected_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(recursive_ellimination_selected_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(recursive_ellimination_selected_X_valid_set)))
    
    print()
    print("Select features with RecursiveFeatureAddition:")
    print('='*40)
    print("Train")
    model.fit(recursive_addition_selected_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(recursive_addition_selected_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(recursive_addition_selected_X_valid_set)))
    
    print('='*40)

#### LogisticRegression

In [276]:
model = LogisticRegression()
print_analysis_result(model)

Model:  LogisticRegression()

All features:
Train
F1 score: 0.5885321100917431
Validation
F1 score: 0.5966126388635951

Select features with Random Forest:
Train
F1 score: 0.5709930715935335
Validation
F1 score: 0.5865595325054785

Select features with ROC-AUC:
Train
F1 score: 0.5728128460686601
Validation
F1 score: 0.5885578504843723

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5604469480956025
Validation
F1 score: 0.5777206826940723

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5582065280520385
Validation
F1 score: 0.5746914717259163


#### DecisionTreeClassifier

In [277]:
model = tree.DecisionTreeClassifier()
print_analysis_result(model)

Model:  DecisionTreeClassifier()

All features:
Train
F1 score: 0.9999803956164598
Validation
F1 score: 0.5439754412893324

Select features with Random Forest:
Train
F1 score: 0.999960790464241
Validation
F1 score: 0.5262841694202463

Select features with ROC-AUC:
Train
F1 score: 0.999960790464241
Validation
F1 score: 0.5295987887963665

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.6916912117268306
Validation
F1 score: 0.5380782918149466

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.6225577883610176
Validation
F1 score: 0.564149560117302


#### AdaBoostClassifier

In [278]:
model = AdaBoostClassifier()
print_analysis_result(model)

Model:  AdaBoostClassifier()

All features:
Train
F1 score: 0.5850865696350969
Validation
F1 score: 0.6003286470695636

Select features with Random Forest:
Train
F1 score: 0.5796778535951352
Validation
F1 score: 0.5903213889915034

Select features with ROC-AUC:
Train
F1 score: 0.5781933203088659
Validation
F1 score: 0.5925106068990961

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5723081196185539
Validation
F1 score: 0.5860036832412523

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5716477233107873
Validation
F1 score: 0.5855275271589027


#### RandomForestClassifier

In [279]:
model = RandomForestClassifier(n_estimators=100, random_state=10)
print_analysis_result(model)

Model:  RandomForestClassifier(random_state=10)

All features:
Train
F1 score: 0.999960790464241
Validation
F1 score: 0.607211354046797

Select features with Random Forest:
Train
F1 score: 0.9999215840031367
Validation
F1 score: 0.6105461393596987

Select features with ROC-AUC:
Train
F1 score: 0.9999607920015683
Validation
F1 score: 0.6046247156937073

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.7055488934917693
Validation
F1 score: 0.5630898287312128

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.6339243472187802
Validation
F1 score: 0.5744870651204281


#### MLPClassifier

In [280]:
model = model = MLPClassifier(hidden_layer_sizes=(100, 50, ), max_iter=300)
print_analysis_result(model)

Model:  MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300)

All features:
Train
F1 score: 0.8802248838914691
Validation
F1 score: 0.5828843106180666

Select features with Random Forest:
Train
F1 score: 0.7012763326414344
Validation
F1 score: 0.5933639251676668

Select features with ROC-AUC:
Train
F1 score: 0.7459525463693973
Validation
F1 score: 0.6075610572097692

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5870286435213308
Validation
F1 score: 0.5872071402008182

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5799607439947658
Validation
F1 score: 0.5893254262416605


Sau khi phân tích thì model được lựa chọn:

- RandomForestClassifier
- F1 score on Validation set: 0.6105461393596987
- List feature được chọn: `random_forest_X_train_set` / `random_forest_X_valid_set` / `random_forest_X_test_set` 

## 6. Hyperparameter turning

In [281]:
from sklearn.model_selection import GridSearchCV

# Define Parameters
max_depth=[2, 8, 16]
n_estimators = [50, 100, 150, 200, 250]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv = 5)
grid_results = grid.fit(random_forest_X_train_set, y_train)

# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [0.80793648 0.80712767 0.80734744 0.80745292 0.80834962 0.84074407
 0.84063858 0.84050671 0.84072649 0.84072649 0.84860315 0.84891963
 0.84890204 0.84948223 0.8502031 ], using {'max_depth': 16, 'n_estimators': 250}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.043501,0.026553,0.055621,0.000909,2,50,"{'max_depth': 2, 'n_estimators': 50}",0.815613,0.80225,0.802602,0.804976,0.814242,0.807936,0.005801,12
1,1.985526,0.044851,0.112472,0.014777,2,100,"{'max_depth': 2, 'n_estimators': 100}",0.806118,0.809942,0.805107,0.80669,0.80778,0.807128,0.001652,15
2,2.880723,0.014911,0.155927,0.000638,2,150,"{'max_depth': 2, 'n_estimators': 150}",0.809327,0.804844,0.805459,0.809635,0.807473,0.807347,0.001949,14
3,3.88754,0.04867,0.206863,0.002099,2,200,"{'max_depth': 2, 'n_estimators': 200}",0.804404,0.804624,0.808756,0.81214,0.807341,0.807453,0.002863,13
4,4.836889,0.027866,0.259415,0.003208,2,250,"{'max_depth': 2, 'n_estimators': 250}",0.804844,0.81047,0.80669,0.808448,0.811297,0.80835,0.002376,11
5,2.692323,0.012443,0.09683,0.001032,8,50,"{'max_depth': 8, 'n_estimators': 50}",0.840842,0.837985,0.839787,0.84304,0.842066,0.840744,0.001764,6
6,5.444556,0.080375,0.189565,0.000587,8,100,"{'max_depth': 8, 'n_estimators': 100}",0.841194,0.838029,0.839831,0.8426,0.841538,0.840639,0.001577,9
7,8.218706,0.080674,0.287471,0.007074,8,150,"{'max_depth': 8, 'n_estimators': 150}",0.841018,0.837721,0.840007,0.842556,0.841231,0.840507,0.001612,10
8,10.782605,0.060251,0.375975,0.002538,8,200,"{'max_depth': 8, 'n_estimators': 200}",0.841326,0.838205,0.839875,0.842381,0.841846,0.840726,0.001512,8
9,13.518085,0.068775,0.473326,0.004954,8,250,"{'max_depth': 8, 'n_estimators': 250}",0.840271,0.838029,0.840051,0.843172,0.84211,0.840726,0.001779,7


In [282]:
model = RandomForestClassifier(n_estimators=250, max_depth=16)
model.fit(random_forest_X_train_set, y_train)
y_pred = model.predict(random_forest_X_test_set)
print('F1 score:', f1_score(y_test, y_pred))

F1 score: 0.6016949152542374
