In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-win_amd64.whl (101.0 MB)
     ------------------------------------ 101.0/101.0 MB 445.7 kB/s eta 0:00:00
Collecting graphviz
  Using cached graphviz-0.20.1-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2 graphviz-0.20.1


In [3]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 1.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5


In [5]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
     -------------------------------------- 70.9/70.9 MB 806.9 kB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6
Note: you may need to restart the kernel to use updated packages.


In [1]:
import warnings
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('comb_extraSNS_REINTEL.csv')
df['post_message']=df['post_message'].fillna('none')

In [3]:
print(df.info())
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7609 entries, 0 to 7608
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  7609 non-null   object
 1   label         7609 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.0+ KB
None


Unnamed: 0,post_message,label
0,Ngày 5 / 5 / 2021 toà_án tỉnh Hoà_Bình sẽ xử s...,1
1,VÀI CHIÊU GÂY SỐT ĐẤT Thái Hạo Giá đất tăng độ...,1
2,BÀN VỀ VỤ TẤT THÀNH CANG BỊ KHAI_TRỪ KHỎI ĐẢNG...,1
3,THÊM 1 ĐẠI_ÚY CÔNG_AN TỐ_CÁO SAI_PHẠM CỦA LÃNH...,1
4,"Thư_giãn , CHUYỆN LẠ “ QUÁI_THAI ” Ở XỨ TA Nhi...",1


In [4]:
train, test = train_test_split(df, test_size=0.15, random_state=123)
print(train.info())
train.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6467 entries, 3176 to 3582
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  6467 non-null   object
 1   label         6467 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 151.6+ KB
None


Unnamed: 0,post_message,label
3176,Chị_gái chặn xe cứu_hoả vì làm đứt dây_điện Vụ...,0
2493,Share & tag bạn_bè nhé các em !TP. HCM chí...,0
706,ĐẢNG : THẤY NGƯỜI SANG BẮT QUÀNG LÀM HỌ Một_số...,1
6836,「SAO VIỆT ĐẦU_TIÊN QUYÊN_GÓP PHÒNG_DỊCH COVID-...,0
7286,Ý_tưởng hay quá Mỗi copy . chia_sẻ lan_toả và ...,0


In [5]:
print(test.info())
test.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1142 entries, 1129 to 790
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_message  1142 non-null   object
 1   label         1142 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 26.8+ KB
None


Unnamed: 0,post_message,label
1129,CƯỚI HỎI THỜI COVID - 19 Chiều 28 Tháng_Giêng ...,1
7071,< URL > Lê_Đức_Nhân . Giám_đốc Bệnh_viện Đà_Nẵ...,0
7431,Sốc với những loại “ bùa ngải ” có thật dấu_hi...,1
6254,Uỷ_ban Thường_vụ Quốc_hội đã ban_hành Nghị_quy...,0
2357,Toang rồi Nghệ_An ơi ...UBND xã Nghĩa Thuận . ...,1


In [6]:
print('train size: ', train.shape, '\ntest size: ', test.shape)

train size:  (6467, 2) 
test size:  (1142, 2)


In [7]:
def get_metrics(y_test, y_pred_proba):
  print('ACCURACY_SCORE: ', round(accuracy_score(y_test, y_pred_proba>=0.5), 4), '\n')
  print('ROC_AUC_SCORE: ', round(roc_auc_score(y_test, y_pred_proba), 4), '\n')
  print('F1_SCORE: ', round(f1_score(y_test, y_pred_proba>=0.5, average='macro'), 4), '\n')
  print('CONFUSION_MATRIX:\n', confusion_matrix(y_test, y_pred_proba>=0.5),'\n')

In [8]:
def get_text_data(x):
    return [record[0] for record in x]
transformer_text = FunctionTransformer(get_text_data)

In [9]:
pipeline = Pipeline([
    ('features', FeatureUnion([
            ('text_features', Pipeline([
                ('selector', transformer_text),
                ('tfidf', TfidfVectorizer(max_features=100000, ngram_range=(1,2))),
            ]))
    ])),
])

In [10]:
X_train = train.drop(['label'], axis=1).to_numpy()
X_test = test.drop(['label'], axis=1).to_numpy()

In [11]:
print(X_test.shape)

(1142, 1)


In [12]:
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
y_train = train['label'].values
y_test = test['label'].values

print(X_train.shape)
print(X_test.shape)

(6467, 100000)
(1142, 100000)


In [13]:
import joblib

# Lưu pipeline đã train
pipeline_path = "C:\\Users\\ADMIN\\BigData\\models\\pipeline.pkl"
joblib.dump(pipeline, pipeline_path)

['C:\\Users\\ADMIN\\BigData\\models\\pipeline.pkl']

In [23]:
print(X_test)

  (0, 1162)	0.06774986072154572
  (0, 1231)	0.19876154894599396
  (0, 10496)	0.1384424404311213
  (0, 10533)	0.25972665700666214
  (0, 13593)	0.06883950350616476
  (0, 13594)	0.06972478968035031
  (0, 14048)	0.0599801851621782
  (0, 14106)	0.1580551845913275
  (0, 14647)	0.0952831690430908
  (0, 14772)	0.13475231347433914
  (0, 18482)	0.11828747246748442
  (0, 21013)	0.08109760452020263
  (0, 21033)	0.10253594212677439
  (0, 29738)	0.15765092262139352
  (0, 29758)	0.23318092690542586
  (0, 30946)	0.1545919222606974
  (0, 34418)	0.13076278618329673
  (0, 34448)	0.20728119634998513
  (0, 43573)	0.0931618941898987
  (0, 43700)	0.22899740600401236
  (0, 53632)	0.1501115888380671
  (0, 53764)	0.203591069393203
  (0, 53806)	0.25071361565803957
  (0, 71520)	0.13197794722691433
  (0, 71535)	0.22899740600401236
  :	:
  (775, 40080)	0.2284922756613221
  (775, 40095)	0.1382489202668535
  (775, 41781)	0.10045005938740424
  (775, 42045)	0.254130845673887
  (775, 43299)	0.23841300078049418
  (775, 4

## SVM

In [14]:
modelSVM = SVC(kernel='linear', probability=True)
modelSVM.fit(X_train, y_train)

In [15]:
y_pred_proba = modelSVM.predict_proba(X_test) [:,1]

scores = get_metrics(y_test, y_pred_proba)

ACCURACY_SCORE:  0.8993 

ROC_AUC_SCORE:  0.9517 

F1_SCORE:  0.8774 

CONFUSION_MATRIX:
 [[755  52]
 [ 63 272]] 



In [16]:
print('accuracy_score', accuracy_score(y_test, y_pred_proba>=0.5))
print('f1_score', f1_score(y_test, y_pred_proba>=0.5, average='macro'))
print('roc_auc_score', roc_auc_score(y_test, y_pred_proba))


accuracy_score 0.8992994746059545
f1_score 0.8773619703513482
roc_auc_score 0.9517320460892563


In [17]:
## Save model
path = 'C:\\Users\\ADMIN\\BigData\\models\\'
model_file = f"modelSVM_to_treaming.hdf5"
joblib.dump(modelSVM, path + model_file)

['C:\\Users\\ADMIN\\BigData\\models\\modelSVM_to_treaming.hdf5']

## Train multi model together

In [16]:
list_model = [LogisticRegression(), DecisionTreeClassifier(),
              RandomForestClassifier(),
              LGBMClassifier(), XGBClassifier(),
              SVC(kernel='linear', probability=True)]

list_model_name, list_accuracy_score, list_f1_score, list_roc_auc = [], [], [], []

In [17]:
path = 'C:\\Users\\ADMIN\\BigData\\models\\'
for i, model in enumerate(list_model):
  print(f"{type(model).__name__}:")
  model.fit(X_train, y_train)
  y_pred_proba = model.predict_proba(X_test) [:,1]
  list_model_name.append(type(model).__name__);
  list_accuracy_score.append(accuracy_score(y_test, y_pred_proba>=0.5))
  list_f1_score.append(f1_score(y_test, y_pred_proba>=0.5, average='macro'))
  list_roc_auc.append(roc_auc_score(y_test, y_pred_proba))

  print('save model: ', list_model_name[i], '\n')
  model_file = f"{list_model_name[i]}.hdf5"
  if isinstance(model, CatBoostClassifier):
    # Lưu mô hình CatBoost bằng cách sử dụng phương thức save_model
    model.save_model(model_file)
  else:
    # Lưu các mô hình khác bằng cách sử dụng h5py
    joblib.dump(model, path + model_file)

LogisticRegression:
save model:  LogisticRegression 

DecisionTreeClassifier:
save model:  DecisionTreeClassifier 

RandomForestClassifier:
save model:  RandomForestClassifier 

LGBMClassifier:
save model:  LGBMClassifier 

XGBClassifier:
save model:  XGBClassifier 

SVC:
save model:  SVC 



In [18]:
table_cols = {'Model name': list_model_name,
              'Accuracy score': list_accuracy_score,
              'Macro-F1 score': list_f1_score,
              'ROC-AUC score': list_roc_auc}

table = pd.DataFrame(table_cols)
table = table.sort_values(by=['Accuracy score'], ascending=False).reset_index(drop=True)
table

Unnamed: 0,Model name,Accuracy score,Macro-F1 score,ROC-AUC score
0,SVC,0.917526,0.845416,0.927291
1,LGBMClassifier,0.89433,0.790726,0.909552
2,XGBClassifier,0.891753,0.780873,0.884008
3,RandomForestClassifier,0.884021,0.71444,0.902653
4,LogisticRegression,0.862113,0.644748,0.920509
5,DecisionTreeClassifier,0.853093,0.751617,0.758463
