In [1]:
import pandas as pd
import numpy as np
import csv
import json
from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# For chinese word segmentation
import jieba
import re

import torch
# ML - Logistic Regression
from sklearn.linear_model import LogisticRegression
# ML - Random Forest
from sklearn.ensemble import RandomForestClassifier
# add more ML
# LLMs - BERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

print(torch.cuda.is_available())
print(torch.version.cuda)
# 12.9.90 and below available
print(torch.__version__)


True
12.6
2.6.0+cu126


In [2]:
# Clean csv datasets, keep only text and labels
csv_file_list = [
    'weibo_long_text_posts_chinese.csv',
    'weibo_COVID_news_posts_chinese.csv',
    'weibo_comment_posts_chinese.csv'
]

def clean_csv(input_file):
    df = pd.read_csv(input_file)
    # rename the cleaned csv file by add '_cleaned'
    file_name = input_file.split('.')
    output_file = f'{".".join(file_name[:-1])}_cleaned.{file_name[-1]}'
    # keep only text and labels
    text_column = 'text'
    label_column = 'label'
    df_cleaned = df[[text_column, label_column]].copy()
    # simple data cleaning
    df_cleaned = df_cleaned.dropna(subset=[text_column])
    df_cleaned = df_cleaned[df_cleaned[text_column].str.strip() != '']
    df_cleaned = df_cleaned[df_cleaned[label_column].astype(int).isin([0, 1])]
    # write to new csv file
    df_cleaned.to_csv(output_file, index=False, encoding='utf-8')
    print(f'{input_file} cleaning completed')

# for loop to clean multiple datasets
for file in csv_file_list:
    clean_csv(file)

weibo_long_text_posts_chinese.csv cleaning completed
weibo_COVID_news_posts_chinese.csv cleaning completed
weibo_comment_posts_chinese.csv cleaning completed


In [8]:
csv_file = "weibo_COVID_news_posts_chinese_cleaned.csv"
# 1:real news 0:conspiracy theory
df = pd.read_csv(csv_file, nrows=10)
print(df)

                                                text  label
0  【#崔天凯称外媒抹黑中国援助物资是ABC思维#：Anything But China】新冠病...      1
1  【#香港失业率5.2%创十年新高#】香港特区政府统计处19日公布，2月至4月经季节性调整的失...      1
2  【泪目！#9分钟的中国抗疫图卷#，你看见自己了吗？】这张#中国抗疫图卷#，时长9分钟，它记录...      1
3  【#美国新冠肺炎超221万#：#美国日新增确诊超3万例#】据美国约翰斯·霍普金斯大学疫情实时...      1
4  【#钟南山称不从全球范围内控制好不可能战胜疫情#】3月12日，广东省人民政府新闻办公室举行新...      1
5  【正在直播：#杭州通报最新疫情防控工作#】根据杭州市新型冠状病毒肺炎疫情防控指挥部工作要求，...      1
6  【#日本全国紧急状态将延长#】据日本广播协会（NHK）电视台统计，截至4日10时30分（北京...      1
7  【千里为邻，战疫必胜！#湖北捐助黑龙江首批医用物资#启程赴绥芬河】15日11：06，湖北向黑...      1
8  【继续加油！#北京连续4天零新增#：#北京中风险地区15个#】7月9日0时至24时，北京无新...      1
9  【#习近平同美国总统特朗普通电话#】国家主席习近平27日应约同美国总统特朗普通电话。　　习近...      1


In [9]:
csv_file = "weibo_long_text_posts_chinese_cleaned.csv"
# 1:real posts 0:conspiracy theory
df = pd.read_csv(csv_file, nrows=10)
print(df)

                                                text  label
0  长时间大强度的运动，会导致身体机能失调，免疫功能下降，并且运动损伤风险增加。因此，特别忌讳平...      0
1  因现有研究显示ACE2是新型冠状病毒入侵人体的关键，网传服用ACEI（血管紧张素转化酶抑制剂...      0
2  在居家防疫期间，为确保运动安全有效，运动强度必须适宜。强度过低，没有锻炼效果，但是长时间大强...      0
3  有传闻称：「病患遗体解剖发现死者肺部出现大量痰栓，而痰栓是由呼吸机使用所产生，致人缺氧而死。...      0
4  近日，有人在朋友圈兜售某公司生产的新冠病毒抗体检测试剂盒，单价150元，并宣称可以家庭自行使...      0
5  因现有研究显示ACE2是新型冠状病毒入侵人体的关键，网传服用ACEI（血管紧张素转化酶抑制剂...      0
6  网传的新闻截图原文「CDCconfirmsfirstcoronaviruscaseof"un...      0
7  28日，科技部社会发展科技司司长吴远彬表示，目前研究结论显示，呼吸道飞沫和密切接触传播仍然是...      0
8  据香港文汇网报道，香港渔护署发现一名新冠肺炎确诊患者饲养的宠物狗对病毒测试呈弱阳性反应。香港...      1
9  近日，部分网民转发「乐陵十三人染sk5病毒，参与抢救的医生已被隔离」的信息，其实，该谣言早在...      0


In [6]:
csv_file = "weibo_comment_posts_chinese_cleaned.csv"
# 1:real posts 0:conspiracy theory
df = pd.read_csv(csv_file, nrows=10)
print(df)

                                                text  label
0  人间惨剧：今天下午约14点，宁波妇儿医院，一妇女携带一婴儿在住院楼跳楼，后抢救无效死亡。具体...      0
1  再去武大，已无牌坊！非要拆掉？@章立凡 @袁裕来律师 @老徐时评 @徐昕 @杨锦麟 @左小祖...      0
2  中国最美丽的乡村"江西婺源"一"教师打死学生" 昨晚，在被誉为中国最美丽的乡村江西省婺源县清...      0
3  忍者QS：江苏省东海县女镇党委书记徐艳，因不愿陪县委书记关永健上床，竟然被警察毒打致子宫破裂...      0
4  《北大猛男，持刀刺官！！！》“可歌可泣”的是王同学投案自首之后冷冷说了一句话是 “我并不后悔...      0
5  好心人帮忙转发下！　　　　　　　 　 　昨日福建省泉州市警察局抓到几个拐卖小孩犯罪团伙，现场...      0
6  【怎么鉴定地沟油】炒菜时放一颗剥皮的蒜头(蒜子)，蒜子对黄曲霉素最敏感。如果蒜子变红色就是地...      0
7  湖南省交通厅原副厅长李晓希在今年两会上说；目前，我国《刑法》对贪污受贿量刑太轻了。如果贪污受...      0
8                                           让历史照进现实！      1
9  转来的，有懂阿拉伯语的吗，给翻译翻译！——叙利亚标语：中国，你们的道德比你们的产品还垃圾 ！...      0


In [2]:
# process and word segmentation for chinese text
def process_chinese_text(text):
    # remove all non-chinese characters (\u4e00-\u9fa5) and non-english characters (a-zA-Z)
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', ' ', text)
    # split continuous chinese text into separate words there's no space between chinese words
    words = jieba.lcut(text)
    # remove words that have no real meaning
    stopwords = {'的', '地', '得', '了', '着', '呢', '吗', '吧', '呀', '啊', '把', '被', '对', '往', '从', '由', '为', '给'}
    words = [w for w in words if len(w) > 1 or (len(w) == 1 and w not in stopwords)]
    return " ".join(words)


In [3]:
# ML - Logistic Regression
def logistic_regression_score(input_file):
    df = pd.read_csv(input_file)
    df['process_text'] = df['text'].apply(process_chinese_text)
    X = df['process_text']
    y = df['label']

    # train set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # TfidfVectorizer for convert text to numerical features
    # Logistic regression cannot directly process text
    vectorizer = TfidfVectorizer()
    X_train_vectorizer = vectorizer.fit_transform(X_train)
    X_test_vectorizer = vectorizer.transform(X_test)

    # train logistic regression model
    logistic_regression = LogisticRegression()
    logistic_regression.fit(X_train_vectorizer, y_train)

    # f1 score and accuracy
    y_predict = logistic_regression.predict(X_test_vectorizer)
    f1 = f1_score(y_test, y_predict, average='weighted')
    accuracy = accuracy_score(y_test, y_predict)

    print(f'f1 score of Logistic Regression about {input_file}: {f1}')
    print(f'accuracy of Logistic Regression about {input_file}: {accuracy}')
    return f1, accuracy

In [4]:
# ML - Random Forest
def random_forest(input_file):
    df = pd.read_csv(input_file)
    df['process_text'] = df['text'].apply(process_chinese_text)
    X = df['process_text']
    y = df['label']

    # train set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # TfidfVectorizer for convert text to numerical features
    vectorizer = TfidfVectorizer()
    X_train_vect = vectorizer.fit_transform(X_train)
    X_test_vect = vectorizer.transform(X_test)

    # train random forest model
    ranfor = RandomForestClassifier(random_state=50)
    ranfor.fit(X_train_vect, y_train)

    # f1 score and accuracy
    y_predict = ranfor.predict(X_test_vect)
    f1 = f1_score(y_test, y_predict, average='weighted')
    accuracy = accuracy_score(y_test, y_predict)

    print(f'f1 score of Random Forest about {input_file}: {f1}')
    print(f'accuracy of Random Forest about {input_file}: {accuracy}')
    return f1, accuracy

In [5]:
# LLMs - BERT
# Some of the ideas come from https://zhuanlan.zhihu.com/p/700074905
def LLMs_BERT(input_file):
    df = pd.read_csv(input_file)
    X = df['text']
    y = df['label']

    # train set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # load BERT pre-trained word tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

    # convert text into an acceptable input for the model
    def tokenize(batch):
        return tokenizer(batch['text'], padding=True, truncation=True, max_length=256)

    # convert Pandas DataFrame to Dataset format
    train_df = pd.DataFrame({'text': X_train, 'labels': y_train})
    train_dataset = Dataset.from_pandas(train_df)
    test_df = pd.DataFrame({'text': X_test, 'labels': y_test})
    test_dataset = Dataset.from_pandas(test_df)

    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # load model
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
    model = model.to('cuda')

    # set train parameters
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        report_to='none',
        no_cuda=False,
        fp16=True,
        dataloader_pin_memory=True
    )

    # a functions for calculating model evaluation metrics
    def compute_metrics(eval_prediction):
        predictions, labels = eval_prediction
        predictions = np.argmax(predictions, axis=1)
        return {'f1' : f1_score(labels, predictions, average='weighted'), 'accuracy' : accuracy_score(labels, predictions)}

    # set train parameters
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_results = trainer.evaluate()
    
    f1 = eval_results['eval_f1']
    accuracy = eval_results['eval_accuracy']
    print(f'f1 score of BERT about {input_file}: {f1}')
    print(f'accuracy of BERT about {input_file}: {accuracy}')
    return f1, accuracy

In [6]:
# Use a loop to iterate over multiple data sets
datasets_list = [
    'weibo_COVID_news_posts_chinese_cleaned.csv',
    'weibo_long_text_posts_chinese_cleaned.csv',
    'weibo_comment_posts_chinese.csv'
]
# new list to store the scores output by the function
score_results = []

for dataset in datasets_list:
    logistic_regression_f1, logistic_regression_accuracy = logistic_regression_score(dataset)
    random_forest_f1, random_forest_accuracy = random_forest(dataset)
    BERT_f1, BERT_accuracy = LLMs_BERT(dataset)
    
    # add the results to list
    score_results.append({
        'dataset_name': dataset,
        'model': 'Logistic Regression',
        'f1 score': logistic_regression_f1,
        'accuracy': logistic_regression_accuracy
    })
    score_results.append({
        'dataset_name': dataset,
        'model': 'Random Forest',
        'f1 score': random_forest_f1,
        'accuracy': random_forest_accuracy
    })
    score_results.append({
        'dataset_name': dataset,
        'model': 'BERT-chinese',
        'f1 score': BERT_f1,
        'accuracy': BERT_accuracy
    })
    

# convert to DataFrame and print the table
score_results_df = pd.DataFrame(score_results)
print(score_results_df.to_string(index=False))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\root\AppData\Local\Temp\jieba.cache
Loading model cost 0.519 seconds.
Prefix dict has been built successfully.


f1 score of Logistic Regression about weibo_COVID_news_posts_chinese_cleaned.csv: 0.8710155034439545
accuracy of Logistic Regression about weibo_COVID_news_posts_chinese_cleaned.csv: 0.8931116389548693
f1 score of Random Forest about weibo_COVID_news_posts_chinese_cleaned.csv: 0.9174519438432565
accuracy of Random Forest about weibo_COVID_news_posts_chinese_cleaned.csv: 0.9263657957244655


Map:   0%|          | 0/1683 [00:00<?, ? examples/s]

Map:   0%|          | 0/421 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.0536


f1 score of BERT about weibo_COVID_news_posts_chinese_cleaned.csv: 0.9903860045495189
accuracy of BERT about weibo_COVID_news_posts_chinese_cleaned.csv: 0.9904988123515439
f1 score of Logistic Regression about weibo_long_text_posts_chinese_cleaned.csv: 0.8877627627627626
accuracy of Logistic Regression about weibo_long_text_posts_chinese_cleaned.csv: 0.8977777777777778
f1 score of Random Forest about weibo_long_text_posts_chinese_cleaned.csv: 0.9180076628352491
accuracy of Random Forest about weibo_long_text_posts_chinese_cleaned.csv: 0.9222222222222223


Map:   0%|          | 0/1797 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1198


f1 score of BERT about weibo_long_text_posts_chinese_cleaned.csv: 0.9778559517842902
accuracy of BERT about weibo_long_text_posts_chinese_cleaned.csv: 0.9777777777777777
                              dataset_name               model  f1 score  accuracy
weibo_COVID_news_posts_chinese_cleaned.csv Logistic Regression  0.871016  0.893112
weibo_COVID_news_posts_chinese_cleaned.csv       Random Forest  0.917452  0.926366
weibo_COVID_news_posts_chinese_cleaned.csv        BERT-chinese  0.990386  0.990499
 weibo_long_text_posts_chinese_cleaned.csv Logistic Regression  0.887763  0.897778
 weibo_long_text_posts_chinese_cleaned.csv       Random Forest  0.918008  0.922222
 weibo_long_text_posts_chinese_cleaned.csv        BERT-chinese  0.977856  0.977778
