In [None]:
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


## Setup

In [None]:
!pip install catboost

In [None]:
import warnings
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

## Load and show some data

In [None]:
df = pd.read_csv('/content/drive/MyDrive/ProjectBigData/Data/comb_extraSNS_ReINTEL.csv')
df['post_message']=df['post_message'].fillna('none')

In [None]:
print(df.info())
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7609 entries, 0 to 7608
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  7609 non-null   object
 1   label         7609 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.0+ KB
None


Unnamed: 0,post_message,label
0,Ngày 5 / 5 / 2021 toà_án tỉnh Hoà_Bình sẽ xử s...,1
1,VÀI CHIÊU GÂY SỐT ĐẤT Thái Hạo Giá đất tăng độ...,1
2,BÀN VỀ VỤ TẤT THÀNH CANG BỊ KHAI_TRỪ KHỎI ĐẢNG...,1
3,THÊM 1 ĐẠI_ÚY CÔNG_AN TỐ_CÁO SAI_PHẠM CỦA LÃNH...,1
4,"Thư_giãn , CHUYỆN LẠ “ QUÁI_THAI ” Ở XỨ TA Nhi...",1


## Split data to train and test set

In [None]:
train, test = train_test_split(df, test_size=0.15, random_state=123)
print(train.info())
train.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6467 entries, 3176 to 3582
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  6467 non-null   object
 1   label         6467 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 151.6+ KB
None


Unnamed: 0,post_message,label
3176,Chị_gái chặn xe cứu_hoả vì làm đứt dây_điện Vụ...,0
2493,Share & tag bạn_bè nhé các em !TP. HCM chí...,0
706,ĐẢNG : THẤY NGƯỜI SANG BẮT QUÀNG LÀM HỌ Một_số...,1
6836,「SAO VIỆT ĐẦU_TIÊN QUYÊN_GÓP PHÒNG_DỊCH COVID-...,0
7286,Ý_tưởng hay quá Mỗi copy . chia_sẻ lan_toả và ...,0


In [None]:
print(test.info())
test.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1142 entries, 1129 to 790
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  1142 non-null   object
 1   label         1142 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 26.8+ KB
None


Unnamed: 0,post_message,label
1129,CƯỚI HỎI THỜI COVID - 19 Chiều 28 Tháng_Giêng ...,1
7071,< URL > Lê_Đức_Nhân . Giám_đốc Bệnh_viện Đà_Nẵ...,0
7431,Sốc với những loại “ bùa ngải ” có thật dấu_hi...,1
6254,Uỷ_ban Thường_vụ Quốc_hội đã ban_hành Nghị_quy...,0
2357,Toang rồi Nghệ_An ơi ...UBND xã Nghĩa Thuận . ...,1


In [None]:
print('train size: ', train.shape, '\ntest size: ', test.shape)

train size:  (6467, 2) 
test size:  (1142, 2)


## Function

In [None]:
def get_metrics(y_test, y_pred_proba):
  print('ACCURACY_SCORE: ', round(accuracy_score(y_test, y_pred_proba>=0.5), 4), '\n')
  print('ROC_AUC_SCORE: ', round(roc_auc_score(y_test, y_pred_proba), 4), '\n')
  print('F1_SCORE: ', round(f1_score(y_test, y_pred_proba>=0.5, average='macro'), 4), '\n')
  print('CONFUSION_MATRIX:\n', confusion_matrix(y_test, y_pred_proba>=0.5),'\n')

In [None]:
def get_text_data(x):
    return [record[0] for record in x]
transformer_text = FunctionTransformer(get_text_data)

In [None]:
pipeline = Pipeline([
    ('features', FeatureUnion([
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('tfidf', TfidfVectorizer(max_features=100000, ngram_range=(1,2))),
            ]))
    ])),
])

## Data Pre-train

In [None]:
X_train = train.drop(['label'], axis=1).to_numpy()
X_test = test.drop(['label'], axis=1).to_numpy()

In [None]:
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
y_train = train['label'].values
y_test = test['label'].values

print(X_train.shape)
print(X_test.shape)

(6467, 100000)
(1142, 100000)


In [None]:
import joblib

# Lưu pipeline đã train
pipeline_path = "/content/drive/MyDrive/ProjectBigData/code/save_models_hdf5/pipeline.pkl"
joblib.dump(pipeline, pipeline_path)

['/content/drive/MyDrive/ProjectBigData/code/save_models_hdf5/pipeline.pkl']

## Train and evaluate

In [None]:
model = LGBMClassifier()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test) [:,1]
get_metrics(y_test, y_pred_proba)

ACCURACY_SCORE:  0.9046 

ROC_AUC_SCORE:  0.9426 

F1_SCORE:  0.8787 

CONFUSION_MATRIX:
 [[780  27]
 [ 82 253]] 



## Table score

In [None]:
list_model = [LogisticRegression(), DecisionTreeClassifier(),
              RandomForestClassifier(), CatBoostClassifier(verbose=200),
              LGBMClassifier(), XGBClassifier(),
              SVC(kernel='linear', probability=True)]

list_model_name, list_accuracy_score, list_f1_score, list_roc_auc = [], [], [], []

In [None]:
path = '/content/drive/MyDrive/ProjectBigData/code/save_models_hdf5/'
for i, model in enumerate(list_model):
  print(f"{type(model).__name__}:")
  model.fit(X_train, y_train)
  y_pred_proba = model.predict_proba(X_test) [:,1]
  list_model_name.append(type(model).__name__);
  list_accuracy_score.append(accuracy_score(y_test, y_pred_proba>=0.5))
  list_f1_score.append(f1_score(y_test, y_pred_proba>=0.5, average='macro'))
  list_roc_auc.append(roc_auc_score(y_test, y_pred_proba))

  print('save model: ', list_model_name[i], '\n')
  model_file = f"{list_model_name[i]}.hdf5"
  if isinstance(model, CatBoostClassifier):
    # Lưu mô hình CatBoost bằng cách sử dụng phương thức save_model
    model.save_model(path + model_file)
  else:
    # Lưu các mô hình khác bằng cách sử dụng h5py
    joblib.dump(model, path + model_file)


LogisticRegression:
save model:  LogisticRegression 

DecisionTreeClassifier:
save model:  DecisionTreeClassifier 

RandomForestClassifier:
save model:  RandomForestClassifier 

CatBoostClassifier:
Learning rate set to 0.022861
0:	learn: 0.6796309	total: 1.36s	remaining: 22m 42s
200:	learn: 0.3239973	total: 3m 49s	remaining: 15m 11s
400:	learn: 0.2755264	total: 7m 36s	remaining: 11m 21s
600:	learn: 0.2339344	total: 11m 21s	remaining: 7m 32s
800:	learn: 0.2035052	total: 15m 7s	remaining: 3m 45s
999:	learn: 0.1810970	total: 18m 53s	remaining: 0us
save model:  CatBoostClassifier 

LGBMClassifier:
save model:  LGBMClassifier 

XGBClassifier:
save model:  XGBClassifier 

SVC:
save model:  SVC 



In [None]:
table_cols = {'Model name': list_model_name,
              'Accuracy score': list_accuracy_score,
              'Macro-F1 score': list_f1_score,
              'ROC-AUC score': list_roc_auc}

table = pd.DataFrame(table_cols)
table = table.sort_values(by=['Accuracy score'], ascending=False).reset_index(drop=True)
table

Unnamed: 0,Model name,Accuracy score,Macro-F1 score,ROC-AUC score
0,LGBMClassifier,0.904553,0.878728,0.942555
1,SVC,0.900175,0.878538,0.951695
2,XGBClassifier,0.894046,0.864537,0.941822
3,CatBoostClassifier,0.888792,0.855366,0.936851
4,RandomForestClassifier,0.872154,0.831298,0.935762
5,LogisticRegression,0.864273,0.819452,0.934395
6,DecisionTreeClassifier,0.844133,0.812995,0.813527


In [None]:
import joblib

path = '/content/drive/MyDrive/ProjectBigData/code/save_models_hdf5/'

lgbm = joblib.load(path + 'LGBMClassifier.hdf5')
lr_model = joblib.load(path + 'LogisticRegression.hdf5')
svm_model = joblib.load(path + 'SVC.hdf5')
xgb = joblib.load(path + 'XGBClassifier.hdf5')
rf = joblib.load(path + 'RandomForestClassifier.hdf5')
dt = joblib.load(path + 'DecisionTreeClassifier.hdf5')

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
lgbm_predict = lgbm.predict_proba(X_test)[:, 1]
lr_model_predict = lr_model.predict_proba(X_test)[:, 1]
svm_model_predict = svm_model.predict_proba(X_test)[:, 1]
xgb_predict = xgb.predict_proba(X_test)[:, 1]
rf_predict = rf.predict_proba(X_test)[:, 1]
dt_predict = dt.predict_proba(X_test)[:, 1]

In [None]:
lgbm_predict = np.round(lgbm_predict)
lr_model_predict = np.round(lr_model_predict)
svm_model_predict = np.round(svm_model_predict)
xgb_predict = np.round(xgb_predict)
rf_predict = np.round(rf_predict)
dt_predict = np.round(dt_predict)

In [None]:
lgbm_accuracy = accuracy_score(y_test, lgbm_predict)
lgbm_f1_score = f1_score(y_test, lgbm_predict, average='macro')
lgbm_roc_auc = roc_auc_score(y_test, lgbm_predict)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', lgbm_accuracy)
print('F1 Score:', lgbm_f1_score)
print('ROC AUC Score:', lgbm_roc_auc)

Ensemble Model:
Accuracy Score: 0.9045534150612959
F1 Score: 0.8787278298187415
ROC AUC Score: 0.8608833157631914


In [None]:
accuracy = accuracy_score(y_test, lr_model_predict)
f1_score = f1_score(y_test, lr_model_predict, average='macro')
roc_auc = roc_auc_score(y_test, lr_model_predict)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', accuracy)
print('F1 Score:', f1_score)
print('ROC AUC Score:', roc_auc)

Ensemble Model:
Accuracy Score: 0.8642732049036778
F1 Score: 0.8194517968631267
ROC AUC Score: 0.7939725165991603


In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

accuracy = accuracy_score(y_test, svm_model_predict)
f1_score_value = f1_score(y_test, svm_model_predict, average='macro')
roc_auc = roc_auc_score(y_test, svm_model_predict)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', accuracy)
print('F1 Score:', f1_score_value)
print('ROC AUC Score:', roc_auc)

Ensemble Model:
Accuracy Score: 0.9001751313485113
F1 Score: 0.8780974710852758
ROC AUC Score: 0.8734986776156393


In [None]:
accuracy = accuracy_score(y_test, xgb_predict)
f1_score_value = f1_score(y_test, xgb_predict, average='macro')
roc_auc = roc_auc_score(y_test, xgb_predict)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', accuracy)
print('F1 Score:', f1_score_value)
print('ROC AUC Score:', roc_auc)

Ensemble Model:
Accuracy Score: 0.8940455341506129
F1 Score: 0.8645374114648432
ROC AUC Score: 0.8455917438828164


In [None]:
accuracy = accuracy_score(y_test, rf_predict)
f1_score_value = f1_score(y_test, rf_predict, average='macro')
roc_auc = roc_auc_score(y_test, rf_predict)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', accuracy)
print('F1 Score:', f1_score_value)
print('ROC AUC Score:', roc_auc)

Ensemble Model:
Accuracy Score: 0.8677758318739054
F1 Score: 0.8232877484410787
ROC AUC Score: 0.7964508313451331


In [None]:
accuracy = accuracy_score(y_test, dt_predict)
f1_score_value = f1_score(y_test, dt_predict, average='macro')
roc_auc = roc_auc_score(y_test, dt_predict)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', accuracy)
print('F1 Score:', f1_score_value)
print('ROC AUC Score:', roc_auc)

Ensemble Model:
Accuracy Score: 0.8441330998248686
F1 Score: 0.8129949365598046
ROC AUC Score: 0.8146405518874031


## Model weighted ensemble

In [None]:
# Combine the predictions using the defined weights
ensemble_pred = (svm_model_predict * 0.4)  + (lgbm_predict * 0.6)
ensemble_pred = np.round(ensemble_pred)

In [None]:
# Calculate evaluation metrics for the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
ensemble_f1_score = f1_score(y_test, ensemble_pred, average='macro')
ensemble_roc_auc = roc_auc_score(y_test, ensemble_pred)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', ensemble_accuracy)
print('F1 Score:', ensemble_f1_score)
print('ROC AUC Score:', ensemble_roc_auc)

Ensemble Model:
Accuracy Score: 0.9124343257443083
F1 Score: 0.8904051745647876
ROC AUC Score: 0.8769350274649058


In [None]:
# Combine the predictions using the defined weights
ensemble_pred = ( xgb_predict* 0.5)  + (svm_model_predict * 0.5)
ensemble_pred = np.round(ensemble_pred)

# Calculate evaluation metrics for the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
ensemble_f1_score = f1_score(y_test, ensemble_pred, average='macro')
ensemble_roc_auc = roc_auc_score(y_test, ensemble_pred)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', ensemble_accuracy)
print('F1 Score:', ensemble_f1_score)
print('ROC AUC Score:', ensemble_roc_auc)

Ensemble Model:
Accuracy Score: 0.9063047285464098
F1 Score: 0.8833041260988526
ROC AUC Score: 0.871725018032514


In [None]:
# Combine the predictions using the defined weights
ensemble_pred = ( xgb_predict* 0.4)  + (lgbm_predict * 0.6)
ensemble_pred = np.round(ensemble_pred)

# Calculate evaluation metrics for the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
ensemble_f1_score = f1_score(y_test, ensemble_pred, average='macro')
ensemble_roc_auc = roc_auc_score(y_test, ensemble_pred)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', ensemble_accuracy)
print('F1 Score:', ensemble_f1_score)
print('ROC AUC Score:', ensemble_roc_auc)

Ensemble Model:
Accuracy Score: 0.9036777583187391
F1 Score: 0.8774893210322028
ROC AUC Score: 0.8593907784497586


### Weight 3 model

In [None]:
three_ensemble_pred = (svm_model_predict * 0.4)  + (lgbm_predict * 0.4) + (xgb_predict * 0.2 )
three_ensemble_pred = np.round(three_ensemble_pred)
# Calculate evaluation metrics for the ensemble model
ensemble_accuracy = accuracy_score(y_test, three_ensemble_pred)
ensemble_f1_score = f1_score(y_test, three_ensemble_pred, average='macro')
ensemble_roc_auc = roc_auc_score(y_test, three_ensemble_pred)

# Print the evaluation metrics
print('Ensemble Model:')
print('Accuracy Score:', ensemble_accuracy)
print('F1 Score:', ensemble_f1_score)
print('ROC AUC Score:', ensemble_roc_auc)

Ensemble Model:
Accuracy Score: 0.9133099824868651
F1 Score: 0.8913942910388618
ROC AUC Score: 0.877554606151399


In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Train the individual models
svm_model = SVC(kernel='linear', probability=True)
lr_model = LogisticRegression()
rf_model = RandomForestClassifier()

svm_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
svm_pred = svm_model.predict_proba(X_test)[:, 1]
lr_pred = lr_model.predict_proba(X_test)[:, 1]
rf_pred = rf_model.predict_proba(X_test)[:, 1]

# Calculate the weights based on the AUC scores of individual models
svm_auc = roc_auc_score(y_test, svm_pred)
lr_auc = roc_auc_score(y_test, lr_pred)
rf_auc = roc_auc_score(y_test, rf_pred)

total_auc = svm_auc + lr_auc + rf_auc
svm_weight = svm_auc / total_auc
lr_weight = lr_auc / total_auc
rf_weight = rf_auc / total_auc

# Combine the predictions using weighted averaging
ensemble_pred = (svm_pred * 0.4) + (lr_pred * 0.2) + (rf_pred * 0.4)

# Calculate the AUC score of the ensemble model
ensemble_auc = roc_auc_score(y_test, ensemble_pred)

print('Ensemble AUC Score:', ensemble_auc)

In [None]:
pip install trainval_classifier

In [None]:
import option

In [None]:
from importlib import import_module

In [None]:
import shutil

In [4]:
import logging

logging.basicConfig(level=logging.INFO, filename='/content/log.log', filemode='w')

logging.info('ghghghghghghddddddddghg')

In [7]:
import logging

def init_logger(log_file=None, log_file_level=logging.NOTSET):
    log_format = logging.Formatter("%(message)s")
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_format)
    logger.handlers = [console_handler]

    if log_file and log_file != '':
        if not os.path.isdir('/logs'):
            os.makedirs('/logs')
        log_file = os.path.join('/logs/', log_file)
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_file_level)
        file_handler.setFormatter(log_format)
        logger.addHandler(file_handler)
    return logger
import os
from datetime import datetime
LOGGER = init_logger(datetime.now().strftime('%d%b%Y_%H-%M-%S.log'))
LOGGER.info("[TRAINER] Start TRAIN process...")

[TRAINER] Start TRAIN process...
INFO:__main__:[TRAINER] Start TRAIN process...
