In [2]:
!pip install nltk



In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

In [4]:
# Download necessary NLTK resources
nltk_resources = ['stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet']
for resource in nltk_resources:
    nltk.download(resource, quiet=True)

In [5]:
# Define paths
base_data_path = '/workspaces/fake_news_analysis/data/'
raw_data_path = f'{base_data_path}raw/'
preprocessed_data_path = f'{base_data_path}preprocessed/'

In [6]:
# Read news data
def load_news_data(type, category):
    path = f'{raw_data_path}{type}_{category}_news.csv'
    return pd.read_csv(path)

news_datasets = {f"{type}_{category}": load_news_data(type, category)
                 for type in ['politifact', 'gossipcop']
                 for category in ['real', 'fake']}

In [7]:
# Print dataset shapes
for name, dataset in news_datasets.items():
    print(f"{name} rows: {dataset.shape[0]},", f"nulls: {dataset['text'].isna().sum()}")

politifact_real rows: 624, nulls: 182
politifact_fake rows: 432, nulls: 89
gossipcop_real rows: 16817, nulls: 3274
gossipcop_fake rows: 5323, nulls: 1070


In [8]:
def preprocess_text(text):

    # Tokenize the text into words
    tokens = word_tokenize(text)

    # Lowercase
    tokens = [word.lower() for word in tokens]

    # Remove stopwords
    stop_words_dic = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words_dic]

    # Part-of-Speech tagging
    tagged_tokens = pos_tag(tokens)

    # Normalize case (except for proper nouns)
    tokens = [word for  word, tag in tagged_tokens if tag != 'NNP' and tag != 'NNPS']

    # Remove punctuation and numbers, keeping only alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]

    # Lemmatize tokens using the appropriate WordNet tag
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]

    return tokens

def preprocess_data(news_datasets, label_mappings):
    processed_datasets = {}
    for key, df in news_datasets.items():
        df_clean = df[['id', 'text']].dropna()  # Ensure text column is not empty
        df_clean['processed_text'] = df_clean['text'].apply(preprocess_text)
        
        # Apply all label mappings
        for mapping in label_mappings:
            for label_key, label_value in mapping.items():
                if label_key in key:
                    df_clean[label_value['label']] = label_value['value']
        
        processed_datasets[key] = df_clean
    
    return processed_datasets

In [9]:
label_mappings = [
    {'politifact': {'label': 'topic', 'value': 'politics'}, 'gossipcop': {'label': 'topic', 'value': 'entertainment'}},
    {'real': {'label': 'type', 'value': 'real'}, 'fake': {'label': 'type', 'value': 'fake'}}
]

# Applying preprocessing and labeling
processed_datasets = preprocess_data(news_datasets, label_mappings)

In [10]:
# Concatenate real and fake news data
fake = pd.concat([processed_datasets['politifact_fake'], processed_datasets['gossipcop_fake']])
real = pd.concat([processed_datasets['politifact_real'], processed_datasets['gossipcop_real']])

In [11]:
# Preview data
print(fake.head())
print(real.head())

                id                                               text  \
1  politifact15156  The West Texas Federal Appeals Court, operatin...   
3  politifact14355  The former Paralympic athlete reportedly tried...   
4  politifact15371  President Trump and his administration just vo...   
5  politifact14404  aderito Report\n\nReport Change comment\n\nRem...   
6  politifact13919  About Trendolizer™\n\nTrendolizer™ (patent pen...   

                                      processed_text     topic  type  
1  [west, texas, federal, appeal, court, operatin...  politics  fake  
3  [former, paralympic, athlete, reportedly, trie...  politics  fake  
4  [president, trump, administration, voted, unit...  politics  fake  
5  [aderito, report, report, change, comment, rem...  politics  fake  
6  [patent, pending, automatically, scan, interne...  politics  fake  
                id                                               text  \
0  politifact14984  SMALL BUSINESS ECONOMIC TRENDS\n\nSmall Bu

In [14]:
# Save preprocessed data
fake.to_csv(f'{preprocessed_data_path}fake_preprocessed.csv')
real.to_csv(f'{preprocessed_data_path}real_preprocessed.csv')