In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw claims; the first CSV column is used as the DataFrame index.
# (Original note: no pre-processing for test claims.)
claims = pd.read_csv(
    "../../../data/raw_claims.csv",
    sep=",",
    index_col=0,
)

In [3]:
claims.head()

Unnamed: 0,ID,claim,date,truth_rating
0,http://data.gesis.org/claimskg/creative_work/0...,Actual video of Iraqi soldier saying goodbye t...,2021-07-07,FALSE
1,http://data.gesis.org/claimskg/creative_work/0...,Bus launched in August 2020 in Pakistan falls ...,2020-08-25,FALSE
2,http://data.gesis.org/claimskg/creative_work/0...,Another man was responsible for the assault th...,2018-09-28,OTHER
3,http://data.gesis.org/claimskg/creative_work/0...,U.S. President Joe Biden visited the Tree of L...,2021-09-03,FALSE
4,http://data.gesis.org/claimskg/creative_work/0...,Turkey legs sold in Disney's theme are actuall...,2017-03-13,FALSE


### Uniqueness of IDs

In [4]:
claims.ID.describe()

count                                                 20557
unique                                                20537
top       http://data.gesis.org/claimskg/creative_work/3...
freq                                                      2
Name: ID, dtype: object

In [5]:
# Keep only the first row for every ID that occurs more than once.
claims = claims.drop_duplicates(subset=["ID"], keep="first")

In [6]:
claims.ID.describe()

count                                                 20537
unique                                                20537
top       http://data.gesis.org/claimskg/creative_work/0...
freq                                                      1
Name: ID, dtype: object

### Author is not the fact checking site

Since we do not plan to use the author as a feature, this removal step is skipped for now (the cells below are commented out) and may be dropped entirely in the future.

In [23]:
#(claims.claim_author == claims.review_instance).value_counts()

False    19564
True      1374
dtype: int64

In [24]:
#claims[claims.claim_author == claims.review_instance].review_instance.value_counts()

fatabyyano    1374
Name: review_instance, dtype: int64

In [25]:
#claims = claims[claims.review_instance != "fatabyyano"]

In [26]:
#(claims.claim_author == claims.review_instance).value_counts()

False    19564
dtype: int64

### Claim text is present 

In [7]:
claims.shape

(20537, 4)

In [8]:
# Collect the index labels of the rows without claim text, then drop them.
missing_claim_idx = claims.index[claims["claim"].isna().to_numpy()]
claims = claims.drop(index=missing_claim_idx)

In [9]:
claims.shape

(20497, 4)

### Claim author is present (ignored in final version)

In [28]:
#claims[claims.claim_author.isnull()]

Unnamed: 0,ID,claim_author,review_instance,claim,date,truth_rating,review_url


In [29]:
#claims[claims.claim_author == " "]

Unnamed: 0,ID,claim_author,review_instance,claim,date,truth_rating,review_url


### Claim duplicates

In [10]:
# Number of occurrences of each distinct claim text, as a plain array.
c = claims["claim"].value_counts().to_numpy()

In [11]:
# Claim texts that occur more than once.
# The counts and the mask are derived from the same Series inside this cell,
# so the result does not depend on a scratch variable defined in another cell
# and the cell can be re-run safely at any time.
claim_counts = claims["claim"].value_counts()
doubles = claim_counts.index[claim_counts.to_numpy() > 1]

In [12]:
doubles

Index(['Cette vidéo montre Bernard-Henri Lévy parmi des 'jihadistes' au Mali',
       'Refugees or illegal immigrants living in Britain get a total yearly benefit of £29,900.',
       'Un visuel démontrant le faible impact de l'épidémie de Covid-19 en France',
       'Ce sont des décisions prises par Donald Trump',
       'Shamima Begum is back in the UK.',
       'Woman takes revenge on her cheating husband by advertising his infidelities on a billboard.',
       'false', 'Covid-19 has a survival rate of 99.8%.'],
      dtype='object')

In [13]:
# For every duplicated claim text, record whether its rows disagree on the
# truth rating (True -> inconsistent ratings were found).
# The loop variable has its own name so that it does not overwrite the
# module-level array `c` holding the value counts.
inconsistant_rating = []
for claim_text in doubles:
    ratings = claims.loc[claims["claim"] == claim_text, "truth_rating"]
    inconsistant_rating.append(ratings.unique().shape != (1,))
    

In [15]:
inconsistant_rating

[False, False, False, False, True, False, False, True]

In [16]:
# Show the duplicated claims whose ratings were flagged as inconsistent.
doubles[np.asarray(inconsistant_rating, dtype=bool)]

Index(['Shamima Begum is back in the UK.', 'Covid-19 has a survival rate of 99.8%.'], dtype='object')

This could be solved more elegantly in the future.

In [17]:
claims[claims.claim == 'Shamima Begum is back in the UK.']

Unnamed: 0,ID,claim,date,truth_rating
7620,http://data.gesis.org/claimskg/creative_work/5...,Shamima Begum is back in the UK.,2020-05-19,OTHER
9107,http://data.gesis.org/claimskg/creative_work/7...,Shamima Begum is back in the UK.,2020-10-08,FALSE


In [18]:
claims[claims.claim == 'Covid-19 has a survival rate of 99.8%.']

Unnamed: 0,ID,claim,date,truth_rating
2121,http://data.gesis.org/claimskg/creative_work/1...,Covid-19 has a survival rate of 99.8%.,2021-02-19,OTHER
17464,http://data.gesis.org/claimskg/creative_work/d...,Covid-19 has a survival rate of 99.8%.,2021-06-18,FALSE


#### Remove ambiguous claims and reduce duplicate claims to a single row

In [20]:
claims.shape

(20497, 4)

In [21]:
# Drop every claim text whose duplicates carry conflicting truth ratings.
# The texts come from the consistency check (`doubles` together with
# `inconsistant_rating`) instead of being hard-coded, so this cell keeps
# working when the raw data changes.
ambiguous_claims = doubles[np.asarray(inconsistant_rating, dtype=bool)]
claims = claims[~claims["claim"].isin(ambiguous_claims)]


In [22]:
# Reduce each remaining duplicated claim text to its first row.
claims = claims.drop_duplicates(subset=["claim"], keep="first")

In [23]:
claims.shape

(20487, 4)

#### Claims with invalid date

In [24]:
claims.date.isnull().value_counts()

False    20487
Name: date, dtype: int64

### Remove claims that are too short

In [25]:
def count_words(claim):
    """Return the number of whitespace-separated tokens in `claim`.

    Splitting on arbitrary whitespace (instead of a single space character)
    means that repeated, leading or trailing spaces do not produce empty
    tokens, and an empty string counts as 0 tokens rather than 1.
    """
    return len(claim.split())

In [26]:
# Attach the token count of every claim (as computed by count_words).
claims = claims.assign(n_token=claims["claim"].apply(count_words))

In [27]:
# Claims with fewer than 3 tokens are not meaningful; keep only the rest.
claims = claims.loc[claims["n_token"] >= 3]

In [28]:
claims.shape

(20483, 5)

### Remove all claims that are not English (hopefully)

In [29]:
from langdetect import DetectorFactory, detect

In [30]:
def detect_language(claim):
    """Return the language code detected for `claim`, or "none" on failure.

    Any error raised by `detect` is mapped to "none" so that a single
    undetectable claim does not abort the whole run.
    """
    try:
        lang = detect(claim)
    except Exception:
        # `Exception` instead of a bare `except` so that KeyboardInterrupt and
        # SystemExit still stop this long-running step when requested.
        lang = "none"
    return lang

In [31]:
# NOTE: takes roughly 10 minutes to run!
# Fix the langdetect seed for reproducibility.
DetectorFactory.seed = 0

# First detect the language of every claim.
claims = claims.assign(claim_language=claims["claim"].apply(detect_language))

In [40]:
# Rating distribution of the claims that were not detected as English.
claims.loc[claims["claim_language"] != "en", "truth_rating"].value_counts()

FALSE    1586
OTHER    1310
TRUE       62
Name: truth_rating, dtype: int64

In [36]:
# Rating distribution of the claims that were detected as English.
claims.loc[claims["claim_language"] == "en", "truth_rating"].value_counts()

FALSE    11143
OTHER     3369
TRUE      3013
Name: truth_rating, dtype: int64

Since the dataset is very unbalanced, we try to keep the non-English claims that are labeled "TRUE" or "OTHER" by translating them into English. The non-English claims labeled "FALSE" are removed, since translation is expensive.

In [45]:
from deep_translator import GoogleTranslator

In [61]:
def fast_translate(claim):
    """Translate `claim` into English with GoogleTranslator.

    Returns the translated text, or an empty string when the translator
    returns None for the given claim.
    """
    translated = GoogleTranslator(target='en').translate(claim)
    return "" if translated is None else translated

In [62]:
# All claims whose detected language is not English.
non_english_mask = claims["claim_language"].ne("en")
to_translate = claims[non_english_mask]

In [63]:
# Keep only the non-"FALSE" claims for translation.
# The explicit copy makes `to_translate` an independent frame, so the column
# assignments in the following cells do not write to a slice of `claims`
# (avoids pandas' SettingWithCopyWarning).
to_translate = to_translate[to_translate["truth_rating"] != "FALSE"].copy()

In [64]:
# Translate the selected claims into English (slow: about 20 minutes).
to_translate["claim"] = to_translate["claim"].apply(fast_translate)

In [77]:
# Recount n_token on the translated text.
to_translate["n_token"] = to_translate["claim"].apply(count_words)

# Re-apply the minimum-length rule used for the original claims: translations
# that came back empty or with fewer than 3 tokens must not reach the final
# dataset.
to_translate = to_translate[to_translate["n_token"] > 2]

In [78]:
# for convenience
#to_translate.reset_index(drop=True).to_csv("translated_claims_backup.csv")

### Merge the dataframes

In [79]:
# Keep only the claims that were detected as English.
claims = claims.loc[claims["claim_language"].eq("en")]

In [80]:
claims.shape

(17525, 6)

In [83]:
# Stack the English claims and the translated claims row-wise.
claims = pd.concat([claims, to_translate])

In [85]:
#double check
claims.ID.describe()

count                                                 18897
unique                                                18897
top       http://data.gesis.org/claimskg/creative_work/0...
freq                                                      1
Name: ID, dtype: object

In [86]:
# The language column was only needed for filtering; remove it again.
claims = claims.drop(columns=["claim_language"])

In [87]:
# final shape
claims.shape

(18897, 5)

### Save the pre-processed claims

In [89]:
# Renumber the rows from 0 and write the pre-processed claims to disk.
preprocessed = claims.reset_index(drop=True)
preprocessed.to_csv("../../../data/preprocessed_claims.csv")