In [11]:
import json
import os

import pandas as pd

# Gossipcop数据集提取与处理

目标：提取Gossipcop数据集中的文章和标签，用于Grover的判别模型微调训练或直接检测

提取内容：所有人类书写的真实合法文章，以及机器生成的虚假文章，其余暂时均不采用。

In [12]:
def json2df(data_filename: str) -> pd.DataFrame:
    """Load a JSON file whose top level is an object and return it as a DataFrame.

    Each top-level key becomes a row label (``orient='index'``) and the keys of
    the nested objects become the columns.

    Args:
        data_filename: Path of the JSON file to read.

    Returns:
        A DataFrame with one row per top-level key of the JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (which would garble non-ASCII text).
    with open(data_filename, 'r', encoding='utf-8') as file:
        data_json = json.load(file)

    # Convert the parsed mapping into a DataFrame, one row per top-level key.
    data_df = pd.DataFrame.from_dict(data_json, orient='index')
    return data_df
    


In [13]:
# Load every Gossipcop JSON file into its own DataFrame.
data_filenames = [
    'gossipcop_v3-1_style_based_fake.json',
    'gossipcop_v3-2_content_based_fake.json',
    'gossipcop_v3-3_integration_based_fake_tn200.json',
    'gossipcop_v3-4_story_based_fake.json',
    'gossipcop_v3-5_style_based_legitimate.json',
    'gossipcop_v3-7_integration_based_legitimate_tn300.json',
]

# gossip_list[i] holds the frame read from data_filenames[i]; the JSON keys
# used as row labels are discarded in favour of a plain 0..n-1 index.
gossip_list = [
    json2df(json_name).reset_index(drop=True)
    for json_name in data_filenames
]

In [14]:
# Print a summary of every loaded frame and save a CSV copy of it under ./csv/.
# Create the output directory up front so to_csv does not fail when it is missing.
os.makedirs('./csv', exist_ok=True)

for json_name, gossip_df in zip(data_filenames, gossip_list):
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    gossip_df.info()
    # Swap only the file extension; str.replace('json', 'csv') would also
    # rewrite a "json" substring appearing anywhere else in the name.
    csv_name = os.path.splitext(json_name)[0] + '.csv'
    gossip_df.to_csv(f'./csv/{csv_name}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15729 entries, 0 to 15728
Data columns (total 7 columns):
origin_id          15729 non-null object
origin_label       15729 non-null object
origin_text        15729 non-null object
generated_text     15729 non-null object
generated_tone     15729 non-null object
generated_label    15729 non-null object
has_top_img        15729 non-null int64
dtypes: int64(1), object(6)
memory usage: 860.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11941 entries, 0 to 11940
Data columns (total 5 columns):
origin_id              11941 non-null object
origin_label           11941 non-null object
origin_text            11941 non-null object
generated_text_glm4    11941 non-null object
has_top_img            11941 non-null int64
dtypes: int64(1), object(4)
memory usage: 466.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2697 entries, 0 to 2696
Data columns (total 8 columns):
doc_1_id          2697 non-null object
doc_1_label    

## 处理v3-1 style based fake

In [49]:
# Inspect the schema and the distinct label values of the style-based set.
style_based_df = gossip_list[0]
print(style_based_df.columns)
for label_column in ('origin_label', 'generated_label'):
    print(style_based_df[label_column].unique())

Index(['origin_id', 'origin_label', 'origin_text', 'generated_text',
       'generated_tone', 'generated_label', 'has_top_img'],
      dtype='object')
['fake' 'legitimate']
['fake' 'real']


In [81]:
# Build one frame with an `article` column fed by both origin_text and generated_text.
# Human side: original articles whose label is 'legitimate', relabelled 'human'.
# .assign() returns a fresh frame, so no column is written into a filtered
# slice (the pattern that can raise pandas' SettingWithCopyWarning).
origin_df = (
    gossip_list[0][['origin_text', 'origin_label']]
    .rename(columns={'origin_text': 'article', 'origin_label': 'label'})
    .loc[lambda df: df['label'] == 'legitimate']
    .assign(label='human')
)

# Machine side: generated articles whose label is 'fake', relabelled 'machine'.
generated_df = (
    gossip_list[0][['generated_text', 'generated_label']]
    .rename(columns={'generated_text': 'article', 'generated_label': 'label'})
    .loc[lambda df: df['label'] == 'fake']
    .assign(label='machine')
)

# Stack the two frames and renumber the rows from 0.
combined_df = pd.concat([origin_df, generated_df], ignore_index=True)
print(combined_df['label'].unique())
combined_df['split'] = 'test'
combined_df.head()

['human']


Unnamed: 0,article,label,split
0,"Born and raised in the Philippines, Lea Salong...",human,test
1,"We love all the Disney Princesses, but we're a...",human,test
2,Source: Angela George/CC BY-SA 3.0\n\nOne of m...,human,test
3,The boyfriend of Los Angeles police officer-tu...,human,test
4,James Woods has received a letter that he can ...,human,test


In [82]:
# Serialise the combined frame as JSON Lines text (one record per line).
json_lines = combined_df.to_json(orient='records', lines=True)

# Write the JSON Lines text to disk.
with open('output_all_test_1_human.jsonl', 'w') as out_file:
    out_file.write(json_lines)

## 处理v3-2 content based fake
由原始文本改变内容生成假新闻

In [46]:
# Inspect the schema and the distinct origin labels of the content-based set.
content_based_df = gossip_list[1]
print(content_based_df.columns)
print(content_based_df['origin_label'].unique())

Index(['origin_id', 'origin_label', 'origin_text', 'generated_text_glm4',
       'has_top_img'],
      dtype='object')
['legitimate']


In [58]:
# Build one frame with an `article` column fed by both origin_text and generated_text_glm4.
# Human side: every original article is kept (no label filter) and labelled 'human'.
origin_df_1 = (
    gossip_list[1][['origin_text', 'origin_label']]
    .rename(columns={'origin_text': 'article', 'origin_label': 'label'})
    .assign(label='human')
)

# Machine side: every generated article is kept and labelled 'machine'.
generated_df_1 = (
    gossip_list[1][['generated_text_glm4']]
    .rename(columns={'generated_text_glm4': 'article'})
    .assign(label='machine')
)

# Stack the two frames and renumber the rows from 0.
combined_df_1 = pd.concat([origin_df_1, generated_df_1], ignore_index=True)

print(combined_df_1['label'].unique())
combined_df_1['split'] = 'test'
combined_df_1.head()

['human' 'machine']


Unnamed: 0,article,label,split
0,"Born and raised in the Philippines, Lea Salong...",human,test
1,"We love all the Disney Princesses, but we're a...",human,test
2,Source: Angela George/CC BY-SA 3.0\n\nOne of m...,human,test
3,The boyfriend of Los Angeles police officer-tu...,human,test
4,James Woods has received a letter that he can ...,human,test


In [59]:
# Serialise the combined frame as JSON Lines text (one record per line).
json_lines_1 = combined_df_1.to_json(orient='records', lines=True)

# Write the JSON Lines text to disk.
with open('output_all_test_2.jsonl', 'w') as out_file:
    out_file.write(json_lines_1)

## 处理v3-3 integration based fake

In [60]:
# Inspect the schema and the distinct labels of both source documents
# in the integration-based set.
integration_based_df = gossip_list[2]
print(integration_based_df.columns)
for label_column in ('doc_1_label', 'doc_2_label'):
    print(integration_based_df[label_column].unique())

Index(['doc_1_id', 'doc_1_label', 'doc_1_text', 'doc_2_id', 'doc_2_label',
       'doc_2_text', 'generated_text', 'has_top_img'],
      dtype='object')
['legitimate']
['fake']


In [61]:
# Build one frame with an `article` column fed by both doc_1_text and generated_text.
# Human side: doc_1 articles whose label is 'legitimate', relabelled 'human'.
# .assign() returns a fresh frame, so no column is written into a filtered
# slice (the pattern that can raise pandas' SettingWithCopyWarning).
origin_df_2 = (
    gossip_list[2][['doc_1_text', 'doc_1_label']]
    .rename(columns={'doc_1_text': 'article', 'doc_1_label': 'label'})
    .loc[lambda df: df['label'] == 'legitimate']
    .assign(label='human')
)

# Machine side: every generated article is kept and labelled 'machine'.
generated_df_2 = (
    gossip_list[2][['generated_text']]
    .rename(columns={'generated_text': 'article'})
    .assign(label='machine')
)

# Stack the two frames and renumber the rows from 0.
combined_df_2 = pd.concat([origin_df_2, generated_df_2], ignore_index=True)

print(combined_df_2['label'].unique())
combined_df_2['split'] = 'test'
combined_df_2.head()

['human' 'machine']


Unnamed: 0,article,label,split
0,Do you feel it in your fingers? Do you feel it...,human,test
1,The dispute between Kesha Rose Sebert and her ...,human,test
2,Getty 'They feel like their family is complet...,human,test
3,What is Reza Farahan's Net Worth and Salary? ...,human,test
4,"Update (September 21, 2:50 P.M.): A source clo...",human,test


In [62]:
# Serialise the combined frame as JSON Lines text (one record per line).
json_lines_2 = combined_df_2.to_json(orient='records', lines=True)

# Write the JSON Lines text to disk.
with open('output_all_test_3.jsonl', 'w') as out_file:
    out_file.write(json_lines_2)

## 处理v3-4 story based fake

In [63]:
# Inspect the schema and the distinct origin labels of the story-based set.
story_based_df = gossip_list[3]
print(story_based_df.columns)
print(story_based_df['origin_label'].unique())

Index(['origin_id', 'origin_label', 'origin_text', 'origin_title',
       'generated_text', 'has_top_img'],
      dtype='object')
['fake' 'real']


In [64]:
# Build one frame with an `article` column fed by both origin_text and generated_text.
# Human side: original articles whose label is 'real' (this file uses 'real'
# rather than 'legitimate'), relabelled 'human'.
# .assign() returns a fresh frame, so no column is written into a filtered
# slice (the pattern that can raise pandas' SettingWithCopyWarning).
origin_df_3 = (
    gossip_list[3][['origin_text', 'origin_label']]
    .rename(columns={'origin_text': 'article', 'origin_label': 'label'})
    .loc[lambda df: df['label'] == 'real']
    .assign(label='human')
)

# Machine side: every generated article is kept and labelled 'machine'.
generated_df_3 = (
    gossip_list[3][['generated_text']]
    .rename(columns={'generated_text': 'article'})
    .assign(label='machine')
)

# Stack the two frames and renumber the rows from 0.
combined_df_3 = pd.concat([origin_df_3, generated_df_3], ignore_index=True)

print(combined_df_3['label'].unique())
combined_df_3['split'] = 'test'
combined_df_3.head()

['human' 'machine']


Unnamed: 0,article,label,split
0,"Born and raised in the Philippines, Lea Salong...",human,test
1,"We love all the Disney Princesses, but we're a...",human,test
2,Source: Angela George/CC BY-SA 3.0\n\nOne of m...,human,test
3,The boyfriend of Los Angeles police officer-tu...,human,test
4,James Woods has received a letter that he can ...,human,test


In [65]:
# Serialise the combined frame as JSON Lines text (one record per line).
json_lines_3 = combined_df_3.to_json(orient='records', lines=True)

# Write the JSON Lines text to disk.
with open('output_all_test_4.jsonl', 'w') as out_file:
    out_file.write(json_lines_3)

## 合并与去重

In [73]:
# Stack the four per-dataset frames into one and renumber the rows from 0.
per_dataset_frames = [combined_df, combined_df_1, combined_df_2, combined_df_3]
combined_df_all = pd.concat(per_dataset_frames, ignore_index=True)
print(len(combined_df_all))

60224


In [71]:
# Drop rows that repeat an earlier (article, label) pair, keeping the first occurrence.
dedup_columns = ['article', 'label']
norepeat_combined_df_all = combined_df_all.drop_duplicates(
    subset=dedup_columns,
    keep='first',
)
print(len(norepeat_combined_df_all))

47413


In [76]:
# Save the de-duplicated frame as CSV (the row index is written as the first column).
norepeat_combined_df_all.to_csv('gossip_all_test_combined_norepeat.csv')

# Serialise the same frame as JSON Lines text (one record per line).
json_lines_all = norepeat_combined_df_all.to_json(orient='records', lines=True)

# Write the JSON Lines text to disk.
with open('gossip_all_test_combined_norepeat.jsonl', 'w') as out_file:
    out_file.write(json_lines_all)