In [1]:
import re

def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation (but keep spaces)
    text = re.sub(r'\s+', ' ', text).strip()    # Normalize multiple spaces to one
    return text



In [2]:
import pandas as pd

real = pd.read_csv("../data/raw/True.csv")
fake = pd.read_csv("../data/raw/Fake.csv")

real['label'] = "REAL"
fake["label"] = "FAKE"

# print(real.isnull().sum())
# print(fake.isnull().sum())

In [3]:
cols = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

train = pd.read_csv("../data/raw/train.tsv",sep="\t",header=None,names=cols)
test = pd.read_csv("../data/raw/test.tsv",sep="\t",header=None,names=cols)
valid = pd.read_csv("../data/raw/valid.tsv",sep="\t",header=None,names=cols)

liar = pd.concat([train,test,valid]).reset_index(drop=True)

fake_labels = ['pants-fire', 'false', 'barely-true']
real_labels = ['half-true', 'mostly-true', 'true']

liar['label'] = liar['label'].apply(lambda x: 'FAKE' if x in fake_labels else 'REAL')

liar['text'] = liar['statement'].apply(clean_text)
liar = liar[['text','label']]


# Save
liar.to_csv("../data/processed/liar_clean.csv", index=False)

In [4]:
print(liar.isnull().sum())

text     0
label    0
dtype: int64


In [5]:
#welFake
welfake = pd.read_csv("../data/raw/WELFake_Dataset.csv")

# print(welfake.columns.tolist())

# welfake = welfake.rename(columns={'text_column_name':'text','label_column_name':'label'})
welfake['label'] = welfake['label'].map({1:'REAL',0:'FAKE'})

welfake = welfake.sample(50000,random_state=42)

welfake['text'] = welfake['text'].apply(clean_text)
welfake = welfake[['text','label']]

welfake.to_csv("../data/processed/welfake_clean.csv.gz")

In [7]:
welfake.to_csv("../data/processed/welfake_clean.csv")


In [6]:
print(welfake.isnull().sum())

text     0
label    0
dtype: int64


In [12]:
fever = pd.read_json("../data/raw/FEVER.jsonl",lines=True)

fever = fever[fever['label'].isin (['SUPPORTS',"REFUTES"])]

fever['label'] = fever['label'].map({
    'SUPPORTS':"REAL",
    'REFUTES':"FAKE"
})

fever = fever[['claim','label']]
fever = fever.rename(columns={'claim':'text'})
fever = fever.dropna(subset=['text','label'])

print(fever.shape)

(109810, 2)


In [13]:
fever.head()

Unnamed: 0,text,label
0,Nikolaj Coster-Waldau worked with the Fox Broa...,REAL
1,Roman Atwood is a content creator.,REAL
2,"History of art includes architecture, dance, s...",REAL
3,Adrienne Bailon is an accountant.,FAKE
5,Homeland is an American television spy thrille...,REAL


In [None]:
df = pd.concat([real,fake,liar,welfake]).reset_index(drop=True)
df['text'] = (df['title']+" "+df["text"]).apply(clean_text)
df = df[["text","label"]]

df.to_csv("../data/processed/merged_clean_dataset.csv")

In [11]:
df.to_csv("../data/processed/merged_clean_dataset.csv.gz")

In [None]:
df.head()

Unnamed: 0,text,label
0,as us budget fight looms republicans flip thei...,REAL
1,us military to accept transgender recruits on ...,REAL
2,senior us republican senator let mr mueller do...,REAL
3,fbi russia probe helped by australian diplomat...,REAL
4,trump wants postal service to charge much more...,REAL


In [None]:
df.info()
df['label'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107689 entries, 0 to 107688
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    107689 non-null  object
 1   label   107689 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


label
REAL    54220
FAKE    53469
Name: count, dtype: int64

In [37]:
merged = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [38]:
merged.info()
merged['label'].value_counts()
# merged.dropna()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107689 entries, 0 to 107688
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    107689 non-null  object
 1   label   107689 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


label
REAL    54220
FAKE    53469
Name: count, dtype: int64

In [39]:
import numpy as np

merged['text'].replace("nan", np.nan, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['text'].replace("nan", np.nan, inplace=True)


In [40]:
merged.dropna(subset=['text'],inplace=True)

In [41]:
merged.head()
print(merged.info())


<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 107688
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   label   44898 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB
None


In [42]:
merged['label'].value_counts()


label
FAKE    23481
REAL    21417
Name: count, dtype: int64

In [43]:
merged.to_csv("../data/processed/merged_clean_dataset.csv")
merged.to_csv("../data/processed/merged_clean_dataset.csv.gz")
