In [239]:
import pandas as pd
fake_train = 'dataset/fake_train.csv'
fake_test = 'dataset/fake_test.csv'
fake_valid = 'dataset/fake_valid.csv'
train_df = pd.read_csv(fake_train)
test_df = pd.read_csv(fake_test)
valid_df = pd.read_csv(fake_valid)

print(train_df.info())
print(test_df.info())
print(valid_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10269 entries, 0 to 10268
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10269 non-null  object 
 1   label                 10269 non-null  int64  
 2   statement             10269 non-null  object 
 3   subject               10269 non-null  object 
 4   speaker               10269 non-null  object 
 5   job_title             7367 non-null   object 
 6   state_info            8058 non-null   object 
 7   party_affiliation     10269 non-null  object 
 8   barely_true_counts    10269 non-null  float64
 9   false_counts          10269 non-null  float64
 10  half_true_counts      10269 non-null  float64
 11  mostly_true_counts    10269 non-null  float64
 12  pants_on_fire_counts  10269 non-null  float64
 13  context               10169 non-null  object 
dtypes: float64(5), int64(1), object(8)
memory usage: 1.1+ MB
None
<class '

### =========================== 數據分析 ===========================

數據集龐大，採直接drop掉有空值的欄位

In [240]:
train_df = train_df.dropna()
test_df = test_df.dropna()
valid_df = valid_df.dropna()
print(train_df.info())
print(test_df.info())
print(valid_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6745 entries, 0 to 10268
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    6745 non-null   object 
 1   label                 6745 non-null   int64  
 2   statement             6745 non-null   object 
 3   subject               6745 non-null   object 
 4   speaker               6745 non-null   object 
 5   job_title             6745 non-null   object 
 6   state_info            6745 non-null   object 
 7   party_affiliation     6745 non-null   object 
 8   barely_true_counts    6745 non-null   float64
 9   false_counts          6745 non-null   float64
 10  half_true_counts      6745 non-null   float64
 11  mostly_true_counts    6745 non-null   float64
 12  pants_on_fire_counts  6745 non-null   float64
 13  context               6745 non-null   object 
dtypes: float64(5), int64(1), object(8)
memory usage: 790.4+ KB
None
<class 'pand

In [241]:
# 了解label 有幾項
train_df.label.unique().tolist()
train_df.speaker.unique().tolist()

['dwayne-bohac',
 'scott-surovell',
 'barack-obama',
 'robin-vos',
 'duey-stroebel',
 'robert-menendez',
 'bernie-s',
 'mitt-romney',
 'george-will',
 'gwen-moore',
 'jack-lew',
 'dennis-richardson',
 'hillary-clinton',
 'planned-parenthood-action-fund',
 'nancy-pelosi',
 'ted-nugent',
 'pamela-geller',
 'peter-kinder',
 'nicholas-kettle',
 'shelley-moore-capito',
 'rick-scott',
 'tom-cotton',
 'ted-cruz',
 'lee-leffingwell',
 'kelly-ayotte',
 'marco-rubio',
 'jerry-patterson',
 'john-boehner',
 'rick-perry',
 'ken-cuccinelli',
 'andrew-cuomo',
 'sid-miller',
 'jim-barksdale',
 'david-raynor',
 'donald-trump',
 'john-mccain',
 'rudy-giuliani',
 'john-barrasso',
 'john-depetro',
 'garnet-coleman',
 'terry-mcauliffe',
 'alfredo-gutierrez',
 'elena-kagan',
 'maurice-ferre',
 'jeanne-shaheen',
 'susan-happ',
 'bill-white',
 'bob-goodlatte',
 'cory-booker',
 'joe-biden',
 'jim-skaggs',
 'robert-sarvis',
 'will-weatherford',
 'mark-shields',
 'alison-lundergan-grimes',
 'mitch-mcconnell',
 '

将文本中提到的政党名称转换成数值标识符
模型兼容性：大多数机器学习模型无法直接处理文本数据。将文本转换成数值（如通过将政党名称映射到特定的ID）可以让模型更容易地处理这些信息。

数据简化：在您的数据集中，可能有多个变量或列与政党相关。通过将政党名称映射到唯一的ID上，可以简化这些信息，使得每个政党都有一个唯一、一致的标识符，减少数据的复杂性。

数据分析：通过将文本分类转换为数值ID，您可以更容易地分析数据集中政党分布的模式。例如，您可以快速计算每个政党在数据集中出现的频率，或者探索政党与其他变量之间的关系。

减少数据噪声：在原始数据中，相同政党的名称可能因为大小写、缩写或拼写错误而有所不同。将政党名称标准化并映射到一个唯一ID可以减少这种类型的噪声，提高数据质量。

特征工程：这种转换是一种特征工程的形式，它可以帮助提高模型的性能。通过识别数据中最频繁的政党并为它们分配ID，您可以创建一个有意义的特征，该特征可能对于预测任务（如分类新闻文章为真实或假新闻）来说是有用的。

泛化能力：为不频繁出现或未知的政党分配一个通用ID（如方法中使用的len(set(frequent_parties.values()))）可以提高模型对新或罕见政党的泛化能力。

In [242]:
#################
######Party######
#################

# 首先获取前5个最频繁出现的政党及其计数
frequent_parties_series = train_df['party_affiliation'].str.lower().value_counts()[:5]

# 转换为字典，此时键是政党名称，值是出现的频次
frequent_parties_dict = frequent_parties_series.to_dict()
print(frequent_parties_dict)
# 如果您想将政党名称映射到它们的排名（而不是频次）
frequent_parties = {party: rank for rank, (party, freq) in enumerate(frequent_parties_dict.items(), start=0)}
print(frequent_parties)
def get_party_id(party):
  if isinstance(party, str):
    matched = [pt for pt in frequent_parties if pt in party.lower() ]
    if len(matched)>0:
      return frequent_parties[matched[0]]
    else:
      return len(set(frequent_parties.values())) 
  else:
    return len(set(frequent_parties.values())) 
  

train_df['party_affiliation'] = train_df['party_affiliation'].apply(get_party_id)
valid_df['party_affiliation'] = valid_df['party_affiliation'].apply(get_party_id)
test_df['party_affiliation'] = test_df['party_affiliation'].apply(get_party_id)

print(len(set(frequent_parties.values())))


{'republican': 3405, 'democrat': 2621, 'none': 367, 'independent': 126, 'newsmaker': 36}
{'republican': 0, 'democrat': 1, 'none': 2, 'independent': 3, 'newsmaker': 4}
5


In [243]:
train_df

Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,0,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,1,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,1,70.0,71.0,160.0,163.0,9.0,Denver
5,12465.json,3,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,0,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
7,153.json,1,"""I'm the only person on this stage who has wor...",ethics,barack-obama,President,Illinois,1,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10256,13344.json,5,"Recently though, the media has reported on tho...",elections,john-rafferty,State Senator,Pennsylvania,0,0.0,0.0,0.0,0.0,1.0,a debate.
10257,13239.json,4,Stopped by Smiley Cookie to pick up some great...,food,donald-trump,President-Elect,New York,0,63.0,114.0,51.0,37.0,61.0,a Facebook post.
10259,11018.json,4,The Supreme Courts views are radically out of ...,"gays-and-lesbians,polls,supreme-court",ted-cruz,Senator,Texas,0,36.0,33.0,15.0,19.0,8.0,an interview on NPR
10260,2930.json,1,"When it comes to the state deficit, Wisconsin ...",state-budget,alberta-darling,"State Senator, 8th District",Wisconsin,0,1.0,1.0,2.0,1.0,1.0,a television interview


In [244]:
#################
#######Job#######
#################

frequent_job_series = train_df['job_title'].str.lower().value_counts()[:15]
# frequent_job_series
# 转换为字典，此时键是政党名称，值是出现的频次
frequent_job_dict = frequent_job_series.to_dict()
frequent_job = {party: rank for rank, (party, freq) in enumerate(frequent_job_dict.items(), start=0)}
print(frequent_job)

def get_job_id(job):
    if isinstance(job, str):
        matched = [jt for jt in frequent_job if jt in job.lower()]
        if len(matched) > 0:
            return frequent_job[matched[0]]
        else:
            return len(set(frequent_job.values()))
    else:
        return len(set(frequent_job.values()))

# 确保使用正确的函数名
train_df['job_title'] = train_df['job_title'].apply(get_job_id)
valid_df['job_title'] = valid_df['job_title'].apply(get_job_id)
test_df['job_title'] = test_df['job_title'].apply(get_job_id)

# 打印出job_id列的唯一值来验证结果
# print(train_df['job_id'].unique())
train_df

{'u.s. senator': 0, 'president': 1, 'governor': 2, 'president-elect': 3, 'u.s. representative': 4, 'presidential candidate': 5, 'state senator': 6, 'state representative': 7, 'former governor': 8, 'milwaukee county executive': 9, 'senator': 10, 'u.s. house of representatives': 11, 'attorney': 12, 'congressman': 13, 'governor of new jersey': 14}


Unnamed: 0,id,label,statement,subject,speaker,job_title,state_info,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,7,Texas,0,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,15,Virginia,1,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,1,Illinois,1,70.0,71.0,160.0,163.0,9.0,Denver
5,12465.json,3,The Chicago Bears have had more starting quart...,education,robin-vos,15,Wisconsin,0,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
7,153.json,1,"""I'm the only person on this stage who has wor...",ethics,barack-obama,1,Illinois,1,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10256,13344.json,5,"Recently though, the media has reported on tho...",elections,john-rafferty,6,Pennsylvania,0,0.0,0.0,0.0,0.0,1.0,a debate.
10257,13239.json,4,Stopped by Smiley Cookie to pick up some great...,food,donald-trump,1,New York,0,63.0,114.0,51.0,37.0,61.0,a Facebook post.
10259,11018.json,4,The Supreme Courts views are radically out of ...,"gays-and-lesbians,polls,supreme-court",ted-cruz,10,Texas,0,36.0,33.0,15.0,19.0,8.0,an interview on NPR
10260,2930.json,1,"When it comes to the state deficit, Wisconsin ...",state-budget,alberta-darling,6,Wisconsin,0,1.0,1.0,2.0,1.0,1.0,a television interview


In [245]:
#################
#####Context#####
#################

frequent_context = {'news release' : 0, 'interview' : 1, 'press release' : 2, 
                   'speech' : 3, 'tv' : 4, 'tweet' : 5, 'campaign' : 6, 
                   'television' : 4, 'debate' : 7, 'news conference' : 8, 
                   'facebook' : 5, 'press conference' : 8, 'radio' : 9, 
                   'e-mail' : 10, 'email' : 10, 'mail' : 10, 'social media' : 5,
                   'twitter' : 5, 'blog':11, 'article':11,'comment':12, 'web':11}

print(frequent_context)


def get_venue_id(venue):
  if isinstance(venue, str):
    matched = [ven for ven in frequent_context if ven in venue.lower() ]
    if len(matched)>0:
      return frequent_context[matched[0]]
    else:
      return len(set(frequent_context.values())) 
  else:
    return len(set(frequent_context.values()))
  

train_df['context'] = train_df['context'].apply(get_venue_id)
valid_df['context'] = valid_df['context'].apply(get_venue_id)
test_df['context'] = test_df['context'].apply(get_venue_id)

print(len(set(frequent_context.values())))

# train_df

{'news release': 0, 'interview': 1, 'press release': 2, 'speech': 3, 'tv': 4, 'tweet': 5, 'campaign': 6, 'television': 4, 'debate': 7, 'news conference': 8, 'facebook': 5, 'press conference': 8, 'radio': 9, 'e-mail': 10, 'email': 10, 'mail': 10, 'social media': 5, 'twitter': 5, 'blog': 11, 'article': 11, 'comment': 12, 'web': 11}
13


In [246]:
# 移除部分欄位
train_df = train_df.drop(columns=['id', 'speaker', 'state_info', 'subject'])
test_df = test_df.drop(columns=['id', 'speaker', 'state_info', 'subject'])
valid_df = valid_df.drop(columns=['id', 'speaker', 'state_info', 'subject'])


In [247]:
train_df

Unnamed: 0,label,statement,job_title,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,0,Says the Annies List political group supports ...,7,0,0.0,1.0,0.0,0.0,0.0,10
1,1,When did the decline of coal start? It started...,15,1,0.0,0.0,1.0,1.0,0.0,3
2,2,"Hillary Clinton agrees with John McCain ""by vo...",1,1,70.0,71.0,160.0,163.0,9.0,13
5,3,The Chicago Bears have had more starting quart...,15,0,0.0,3.0,2.0,5.0,1.0,13
7,1,"""I'm the only person on this stage who has wor...",1,1,70.0,71.0,160.0,163.0,9.0,7
...,...,...,...,...,...,...,...,...,...,...
10256,5,"Recently though, the media has reported on tho...",6,0,0.0,0.0,0.0,0.0,1.0,7
10257,4,Stopped by Smiley Cookie to pick up some great...,1,0,63.0,114.0,51.0,37.0,61.0,5
10259,4,The Supreme Courts views are radically out of ...,10,0,36.0,33.0,15.0,19.0,8.0,1
10260,1,"When it comes to the state deficit, Wisconsin ...",6,0,1.0,1.0,2.0,1.0,1.0,1


### 文本清理

In [248]:
import re

# 定义清理文本的函数
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # 移除标点符号
    text = re.sub(r'\d+', '', text)  # 移除数字
    text = text.lower()  # 转换为小写
    return text

# 假设df是你的DataFrame，并且它有一个名为'statement'的列包含文本数据
train_df['statement'] = train_df['statement'].apply(clean_text)
test_df['statement'] = test_df['statement'].apply(clean_text)
valid_df['statement'] = valid_df['statement'].apply(clean_text)

train_df.to_csv('dataset/after_train.csv', index=False)
test_df.to_csv('dataset/after_test.csv', index=False)
valid_df.to_csv('dataset/after_valid.csv', index=False)

### 文本向量化 TF_IDF
使用fit_transform方法来拟合向量化器，并基于训练数据构建TF-IDF模型。对于测试集和验证集，应使用transform方法来转换文本数据。
將訓練集數據轉換成TF-IDF向量，得到了一個「稀疏矩陣」，涉及大量数据时节省内存的有效方式，因为它只存储非零元素的信息。
已经有了向量化后的数据，可以开始使用这些数据来训练机器学习模型了。


#### 解讀稀疏矩陣
6745：表示有6745個文本数据点（如新闻文章、评论等）被转换成了向量。
10371：表示在所有文本數據中，向量化器识别出了10371个唯一词项构成的词汇表。
128201：在所有转换后的向量中，共有128201个非零TF-IDF值，意味着这些词在对应的文档中至少出现过一次，并且具有一定的重要性。

In [249]:
'''
from sklearn.feature_extraction.text import TfidfVectorizer

# 实例化TF-IDF向量化器
tfidf_vectorizer = TfidfVectorizer()

# 使用訓練數據擬合TF-IDF向量化器，並轉換為訓練數據
# 假设已经創建了一个名為'cleaned_text'的列，通過合併'context'和'statement'得到
train_df['clean_statement'] = train_df['statement'] + " " + train_df['context']
tfidf_train_vectors = tfidf_vectorizer.fit_transform(train_df['clean_statement'])

test_df['clean_statement'] = test_df['statement'] + " " + test_df['context']
tfidf_test_vectors = tfidf_vectorizer.transform(test_df['clean_statement'])

valid_df['clean_statement'] = valid_df['statement'] + " " + valid_df['context']
tfidf_valid_vectors = tfidf_vectorizer.transform(valid_df['clean_statement'])

# drop掉context和statement
train_df = train_df.drop(columns=['context', 'statement'])
test_df = test_df.drop(columns=['context', 'statement'])
valid_df = valid_df.drop(columns=['context', 'statement'])

print(tfidf_train_vectors)
train_df
'''

'\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\n# 实例化TF-IDF向量化器\ntfidf_vectorizer = TfidfVectorizer()\n\n# 使用訓練數據擬合TF-IDF向量化器，並轉換為訓練數據\n# 假设已经創建了一个名為\'cleaned_text\'的列，通過合併\'context\'和\'statement\'得到\ntrain_df[\'clean_statement\'] = train_df[\'statement\'] + " " + train_df[\'context\']\ntfidf_train_vectors = tfidf_vectorizer.fit_transform(train_df[\'clean_statement\'])\n\ntest_df[\'clean_statement\'] = test_df[\'statement\'] + " " + test_df[\'context\']\ntfidf_test_vectors = tfidf_vectorizer.transform(test_df[\'clean_statement\'])\n\nvalid_df[\'clean_statement\'] = valid_df[\'statement\'] + " " + valid_df[\'context\']\ntfidf_valid_vectors = tfidf_vectorizer.transform(valid_df[\'clean_statement\'])\n\n# drop掉context和statement\ntrain_df = train_df.drop(columns=[\'context\', \'statement\'])\ntest_df = test_df.drop(columns=[\'context\', \'statement\'])\nvalid_df = valid_df.drop(columns=[\'context\', \'statement\'])\n\nprint(tfidf_train_vectors)\ntrain_df\n'

### BERT

#### BERT的文本處理

In [229]:
'''
import tensorflow as tf
from transformers import BertTokenizer
import pandas as pd

# 加载预训练的Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 假设train_df, test_df, valid_df已根据您的指示进行了加载和预处理
# 例如，这里我们假设已经完成了清理文本的步骤

# 函数：将文本转换为BERT模型的输入格式
def encode_sentences(tokenizer, sentences, max_length):
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sentence,                      # 输入文本
                            add_special_tokens=True,       # 添加 '[CLS]' 和 '[SEP]'
                            max_length=max_length,         # 填充 & 截断长度
                            padding='max_length',          # 填充至最大长度
                            truncation=True,               # 显式激活截断
                            return_attention_mask=True,    # 构造注意力掩码
                            return_tensors='tf',           # 返回tensorflow张量
                      )
        
        # 添加编码后的句子
        input_ids.append(encoded_dict['input_ids'])
        # 和对应的注意力掩码（区分padding与非padding）
        attention_masks.append(encoded_dict['attention_mask'])

    # 转换为tensorflow格式
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)
    
    return input_ids, attention_masks


# 编码数据集
max_length = 128  # 可以根据需要调整
train_inputs, train_masks = encode_sentences(tokenizer, train_df['statement'].tolist(), max_length)
valid_inputs, valid_masks = encode_sentences(tokenizer, valid_df['statement'].tolist(), max_length)
test_inputs, test_masks = encode_sentences(tokenizer, test_df['statement'].tolist(), max_length)

# 将标签转换为tensorflow格式
train_labels = tf.convert_to_tensor(train_df['label'].tolist())
valid_labels = tf.convert_to_tensor(valid_df['label'].tolist())
test_labels = tf.convert_to_tensor(test_df['label'].tolist())

# 接下来，您可以根据这些准备好的输入和标签来定义和训练您的TensorFlow模型。
'''

In [238]:
'''
from transformers import TFBertForSequenceClassification
import tensorflow as tf

def train_and_evaluate_bert(train_inputs, train_masks, train_labels, valid_inputs, valid_masks, valid_labels, test_inputs, test_masks, test_labels, num_labels=6, learning_rate=5e-5, epochs=3, batch_size=32):
    # 加载预训练的BERT模型，适配于序列分类任务
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    
    # 对于M1/M2 Mac用户，使用legacy版本的Adam优化器以获得更好的性能
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    
    # 将数据包装进tf.data.Dataset中以便训练
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': train_inputs, 'attention_mask': train_masks},
        train_labels
    )).shuffle(100).batch(batch_size)

    valid_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': valid_inputs, 'attention_mask': valid_masks},
        valid_labels
    )).batch(batch_size)

    # 训练模型
    history = model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=epochs
    )

    # 评估模型
    test_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': test_inputs, 'attention_mask': test_masks},
        test_labels
    )).batch(batch_size)

    eval_result = model.evaluate(test_dataset)
    print(f"Test loss: {eval_result[0]}, Test accuracy: {eval_result[1]}")
    
    return model, history

model, history = train_and_evaluate_bert(train_inputs, train_masks, train_labels, valid_inputs, valid_masks, valid_labels, test_inputs, test_masks, test_labels)
'''

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
  1/211 [..............................] - ETA: 1:12:16 - loss: 1.8978 - accuracy: 0.1250

KeyboardInterrupt: 

### 顯示關鍵字圖片

In [178]:
'''
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 繪製文字雲圖像
def wordFigure(text):
    text = " ".join([sentence for sentence in text])
    wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(text)
    plt.figure(figsize=(15, 9))  # 設定圖片大小為15x9英寸
    plt.imshow(wordcloud, interpolation='bilinear')  # 顯示文字雲，使用雙線性插值法讓顯示更平滑
    plt.axis('off')  # 不顯示軸標籤
    plt.show()  # 顯示圖像
    

print(wordFigure(train_df['statement']))
print(wordFigure(train_df['statement'][train_df['label']==5]))
print(wordFigure(train_df['statement'][train_df['label']==3]))
'''

'\nfrom wordcloud import WordCloud\nimport matplotlib.pyplot as plt\n\n# 繪製文字雲圖像\ndef wordFigure(text):\n    text = " ".join([sentence for sentence in text])\n    wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(text)\n    plt.figure(figsize=(15, 9))  # 設定圖片大小為15x9英寸\n    plt.imshow(wordcloud, interpolation=\'bilinear\')  # 顯示文字雲，使用雙線性插值法讓顯示更平滑\n    plt.axis(\'off\')  # 不顯示軸標籤\n    plt.show()  # 顯示圖像\n    \n\nprint(wordFigure(train_df[\'statement\']))\nprint(wordFigure(train_df[\'statement\'][train_df[\'label\']==5]))\nprint(wordFigure(train_df[\'statement\'][train_df[\'label\']==3]))\n'

Training