In [1]:
import re

def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation (but keep spaces)
    text = re.sub(r'\s+', ' ', text).strip()    # Normalize multiple spaces to one
    return text



In [2]:
import pandas as pd

real = pd.read_csv("../data/raw/True.csv")
fake = pd.read_csv("../data/raw/Fake.csv")

real['label'] = "REAL"
fake["label"] = "FAKE"

# print(real.isnull().sum())
# print(fake.isnull().sum())

In [3]:
cols = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

train = pd.read_csv("../data/raw/train.tsv",sep="\t",header=None,names=cols)
test = pd.read_csv("../data/raw/test.tsv",sep="\t",header=None,names=cols)
valid = pd.read_csv("../data/raw/valid.tsv",sep="\t",header=None,names=cols)

liar = pd.concat([train,test,valid]).reset_index(drop=True)

fake_labels = ['pants-fire', 'false', 'barely-true']
real_labels = ['half-true', 'mostly-true', 'true']

liar['label'] = liar['label'].apply(lambda x: 'FAKE' if x in fake_labels else 'REAL')

liar['text'] = liar['statement'].apply(clean_text)
liar = liar[['text','label']]


# Save
liar.to_csv("../data/processed/liar_clean.csv", index=False)

In [4]:
print(liar.isnull().sum())

text     0
label    0
dtype: int64


In [5]:
#welFake
welfake = pd.read_csv("../data/raw/WELFake_Dataset.csv")

# print(welfake.columns.tolist())

# welfake = welfake.rename(columns={'text_column_name':'text','label_column_name':'label'})
welfake['label'] = welfake['label'].map({1:'REAL',0:'FAKE'})

welfake = welfake.sample(50000,random_state=42)

welfake['text'] = welfake['text'].apply(clean_text)
welfake = welfake[['text','label']]

welfake.to_csv("../data/processed/welfake_clean.csv.gz")

In [7]:
welfake.to_csv("../data/processed/welfake_clean.csv")


In [6]:
print(welfake.isnull().sum())

text     0
label    0
dtype: int64


In [12]:
fever = pd.read_json("../data/raw/FEVER.jsonl",lines=True)

fever = fever[fever['label'].isin (['SUPPORTS',"REFUTES"])]

fever['label'] = fever['label'].map({
    'SUPPORTS':"REAL",
    'REFUTES':"FAKE"
})

fever = fever[['claim','label']]
fever = fever.rename(columns={'claim':'text'})
fever = fever.dropna(subset=['text','label'])

print(fever.shape)

(109810, 2)


In [13]:
fever.head()

Unnamed: 0,text,label
0,Nikolaj Coster-Waldau worked with the Fox Broa...,REAL
1,Roman Atwood is a content creator.,REAL
2,"History of art includes architecture, dance, s...",REAL
3,Adrienne Bailon is an accountant.,FAKE
5,Homeland is an American television spy thrille...,REAL


In [20]:
indian = pd.read_csv('../data/raw/indian.csv')

indian = indian[['text','label']]
indian = indian.dropna(subset=['label','text'])


In [21]:
indian.head()

Unnamed: 0,text,label
0,Payal has accused filmmaker Anurag Kashyap of ...,REAL
1,A four-minute-long video of a woman criticisin...,FAKE
2,"Republic Poll, a fake Twitter account imitatin...",FAKE
3,"Delhi teen finds place on UN green list, turns...",REAL
4,Delhi: A high-level meeting underway at reside...,REAL


In [22]:
indian.isnull().sum()

text     0
label    0
dtype: int64

In [75]:
df = pd.concat([real,fake,liar,welfake,fever,indian]).reset_index(drop=True)
if 'title' in df.columns:
    df['text'] = (df['title'].fillna('')+" "+df['text'].fillna(''))
else:
    df['text'] = df['text'].fillna('')

df['text'] = (df['title']+" "+df["text"]).apply(clean_text)
df = df[["text","label"]]

df.to_csv("../data/processed/merged_clean_dataset.csv")

In [76]:
df['label'].value_counts()

label
REAL    136105
FAKE     85115
Name: count, dtype: int64

In [65]:
df.to_csv("../data/processed/merged_clean_dataset.csv.gz")

In [77]:
df.sample(random_state=42,frac=0.3)

Unnamed: 0,text,label
17336,spain state prosecutor asks for custody for ca...,REAL
19254,new zealands ruling nationals win most votes n...,REAL
179582,,FAKE
30372,nra thugs threaten lawmakers with bullets for ...,FAKE
3899,trumps pick for army secretary drops out offic...,REAL
...,...,...
188548,,FAKE
102147,,FAKE
87741,,REAL
28801,bristol palin cheers for christian students fo...,FAKE


In [78]:
df.info()
df['label'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221220 entries, 0 to 221219
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    221220 non-null  object
 1   label   221220 non-null  object
dtypes: object(2)
memory usage: 3.4+ MB


label
REAL    136105
FAKE     85115
Name: count, dtype: int64

In [96]:
df['label'].value_counts()

label
REAL    136105
FAKE     85115
Name: count, dtype: int64

In [97]:
merged = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [98]:
merged.info()
merged['label'].value_counts()
# merged.dropna()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221220 entries, 0 to 221219
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    221220 non-null  object
 1   label   221220 non-null  object
dtypes: object(2)
memory usage: 3.4+ MB


label
REAL    136105
FAKE     85115
Name: count, dtype: int64

In [99]:
merged.isnull().sum()

text     0
label    0
dtype: int64

In [100]:
merged['label'].value_counts()

label
REAL    136105
FAKE     85115
Name: count, dtype: int64

In [101]:
merged.dropna(subset=['text'],inplace=True)

In [102]:
print(merged.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221220 entries, 0 to 221219
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    221220 non-null  object
 1   label   221220 non-null  object
dtypes: object(2)
memory usage: 3.4+ MB
None


In [103]:
merged['label'].value_counts()


label
REAL    136105
FAKE     85115
Name: count, dtype: int64

In [104]:
merged.to_csv("../data/processed/merged_clean_dataset.csv")
merged.to_csv("../data/processed/merged_clean_dataset.csv.gz")
