In [6]:
import json
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.stats import chi2
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.stats import chi2
from sklearn.feature_selection import chi2

# Load the two JSON files (fake and legitimate articles).
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII text
# on systems whose default is not UTF-8.
with open('gossipcop_v3-1_style_based_fake.json', encoding='utf-8') as f:
    fake_data = json.load(f)
with open('gossipcop_v3-5_style_based_legitimate.json', encoding='utf-8') as f:
    real_data = json.load(f)

# Text preprocessing
def preprocess_text(text):
    """Clean a raw text string and return its lemmatized tokens.

    The steps run in this order:
      1. delete every run of characters that starts with ``http`` and
         extends to the next whitespace (hyperlinks);
      2. delete all ASCII punctuation (``string.punctuation``);
      3. delete every non-ASCII character -- this removes emojis, but also
         any other non-ASCII text such as accented letters or curly quotes;
      4. tokenize with ``word_tokenize``;
      5. lemmatize each token with ``WordNetLemmatizer``.

    Args:
        text: the raw string to clean.

    Returns:
        A list of lemmatized token strings.
    """
    without_links = re.sub(r'http\S+', '', text)
    punctuation_remover = str.maketrans('', '', string.punctuation)
    without_punctuation = without_links.translate(punctuation_remover)
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', without_punctuation)
    raw_tokens = word_tokenize(ascii_only)
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, raw_tokens))


# Feature selection
def chi2_feature_selection(X, y):
    """Score every feature column of ``X`` against ``y`` with a chi-square test.

    ``X`` is read column-wise via ``zip(*X)``, so it must be a sequence of
    equal-length rows (one row per sample, one column per feature). For each
    column a 2x2 contingency table is built by ``frequency_table`` and tested
    for independence with ``scipy.stats.chi2_contingency`` (called with its
    default settings).

    Args:
        X: sequence of rows of numeric feature values.
        y: sequence of labels, one per row of ``X``.

    Returns:
        A list with one ``(chi2_score, p_value)`` tuple of floats per feature
        column. Columns whose table has an all-zero row or column (a constant
        feature, or labels from a single class) carry no information and are
        reported as ``(0.0, 1.0)``.
    """
    # Imported locally under its own name: the module-level name ``chi2`` ends
    # up bound to sklearn.feature_selection.chi2, which expects (X, y) and
    # cannot be called on a contingency table.
    from scipy.stats import chi2_contingency

    chi2_scores = []
    for feature in zip(*X):
        table = frequency_table(feature, y)
        row_totals = [sum(row) for row in table]
        column_totals = [sum(column) for column in zip(*table)]
        if 0 in row_totals or 0 in column_totals:
            # chi2_contingency raises ValueError when an expected frequency
            # is zero, which is exactly this degenerate case.
            chi2_scores.append((0.0, 1.0))
            continue
        chi2_score, p_value = chi2_contingency(table)[:2]
        chi2_scores.append((float(chi2_score), float(p_value)))
    return chi2_scores

def frequency_table(feature, y):
    """Build the 2x2 label-by-presence contingency table for one feature.

    Args:
        feature: numeric values of a single feature, one per sample.
        y: labels, one per sample; each is compared to the string
            ``'legitimate'``.

    Returns:
        A nested list ``table`` where ``table[r][c]`` counts the samples with
        ``r == int(label == 'legitimate')`` and ``c == 1`` if the feature
        value is greater than zero, else ``0``.
    """
    contingency_table = [[0, 0], [0, 0]]
    for i, label in enumerate(y):
        # Binarize to presence/absence: raw counts above 1 would otherwise
        # index past the two columns, and fractional weights would truncate
        # to 0.
        present = int(feature[i] > 0)
        contingency_table[int(label == 'legitimate')][present] += 1
    return contingency_table

def frequency_feature_selection(X, top_k=None):
    """Rank features by how often they occur across all samples.

    Args:
        X: iterable of samples; each sample is an iterable of hashable
            features (e.g. a list of tokens) and is fed to ``Counter.update``.
        top_k: if given, return only the ``top_k`` most frequent features.
            The default ``None`` returns every feature, as before.

    Returns:
        A list of ``(feature, count)`` tuples sorted from most to least
        frequent.
    """
    feature_counts = Counter()
    for feature in X:
        feature_counts.update(feature)
    return feature_counts.most_common(top_k)

# Tokenize every record in place, overwriting the raw text with its token list
for record in fake_data.values():
    record['generated_text'] = preprocess_text(record['generated_text'])
for record in real_data.values():
    record['generated_text_t015'] = preprocess_text(record['generated_text_t015'])

# Assemble features (X) and labels (y): fake_data records first, then real_data
X = [record['generated_text'] for record in fake_data.values()]
X += [record['generated_text_t015'] for record in real_data.values()]
y = [record['generated_label'] for record in fake_data.values()]
y += [record['generated_label'] for record in real_data.values()]

In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Re-join each token list into one space-separated string for the vectorizer
X_flattened = list(map(" ".join, X))

# Bag-of-words counts (vocabulary capped at 25000 terms), converted to a dense array
vectorizer = CountVectorizer(max_features=25000)
X_vectorized = vectorizer.fit_transform(X_flattened).toarray()

# Encode the labels in y as integers: 'fake' -> 0, anything else -> 1
y_transformed = [int(label != 'fake') for label in y]

# Turn the labels into a single-column numpy array so that it can be stacked
# horizontally next to X_vectorized
y_transformed_array = np.asarray(y_transformed).reshape(-1, 1)

# Append the label column as the last column of the feature matrix
X_vectorized_modified = np.concatenate((X_vectorized, y_transformed_array), axis=1)




In [8]:
# Write the vectorized dataset (X_vectorized_modified) out to a txt file
import numpy as np
from sklearn.utils import shuffle

# Randomly shuffle the rows of X_vectorized_modified (fixed seed), then write
# the matrix to disk as integers.
# NOTE: the shuffled result is bound back to the same name, so re-running this
# cell shuffles the already-shuffled rows again.
X_vectorized_modified = shuffle(X_vectorized_modified, random_state=42)
np.savetxt(fname="gossipcop_style_based_dataset.txt", X=X_vectorized_modified, fmt='%d')