In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [2]:
# no pre-processing for test claims
# Load the raw claims from CSV; the first CSV column becomes the DataFrame index.
claims = pd.read_csv("../../../data/raw_claims_new.csv", sep=",", index_col=0)

In [3]:
claims.head()

Unnamed: 0,ID,claim,date,truth_rating
0,http://data.gesis.org/claimskg/creative_work/0...,"The EPA 'wants to hire 230,000 new government ...",2011-10-21,FALSE
1,http://data.gesis.org/claimskg/creative_work/0...,Actual video of Iraqi soldier saying goodbye t...,2021-06-09,FALSE
2,http://data.gesis.org/claimskg/creative_work/0...,Bus launched in August 2020 in Pakistan falls ...,2020-08-17,FALSE
3,http://data.gesis.org/claimskg/creative_work/0...,Another man was responsible for the assault th...,2018-09-28,OTHER
4,http://data.gesis.org/claimskg/creative_work/0...,U.S. President Joe Biden visited the Tree of L...,2021-09-03,FALSE


### Uniqueness of IDs

In [4]:
claims.ID.describe()

count                                                 47017
unique                                                44463
top       http://data.gesis.org/claimskg/creative_work/7...
freq                                                    976
Name: ID, dtype: object

In [5]:
# drop duplicates
claims = claims.drop_duplicates(subset="ID")

In [6]:
claims.ID.describe()

count                                                 44463
unique                                                44463
top       http://data.gesis.org/claimskg/creative_work/0...
freq                                                      1
Name: ID, dtype: object

### Author is not the fact checking site

Since we do not plan to use the author as a feature, this removal step may be skipped in the future.

In [23]:
#(claims.claim_author == claims.review_instance).value_counts()

False    19564
True      1374
dtype: int64

In [24]:
#claims[claims.claim_author == claims.review_instance].review_instance.value_counts()

fatabyyano    1374
Name: review_instance, dtype: int64

In [25]:
#claims = claims[claims.review_instance != "fatabyyano"]

In [26]:
#(claims.claim_author == claims.review_instance).value_counts()

False    19564
dtype: int64

### Claim text is present 

In [7]:
claims.shape

(44463, 4)

In [8]:
# Drop every row whose claim text is missing. dropna filters row by row, so it
# stays correct even if the index labels are not unique (dropping by label
# would also remove other rows that share a label with a null row).
claims = claims.dropna(subset=["claim"])

In [9]:
claims.shape

(44387, 4)

### Claim author is present (ignored in final version)

In [28]:
#claims[claims.claim_author.isnull()]

Unnamed: 0,ID,claim_author,review_instance,claim,date,truth_rating,review_url


In [29]:
#claims[claims.claim_author == " "]

Unnamed: 0,ID,claim_author,review_instance,claim,date,truth_rating,review_url


### Claim duplicates

In [10]:
c = claims.claim.value_counts().values

In [11]:
# Claim texts that occur more than once in the frame.
claim_counts = claims.claim.value_counts()
doubles = claim_counts.index[claim_counts.to_numpy() > 1]

In [12]:
doubles

Index([''Nude Image'',
       ''New species of coral…it was found near the Galápagos Islands…..WOW.'',
       'On the Trans-Pacific Partnership.',
       ''Two handfuls of cashews equivalent to a Prozac dose'',
       ''Nrc started in assam.They have begun evicting people from their homes,the media doesn't show it ,they are already bought and gagged ,so it is our responsibility now to share this video.'',
       '“U.S. military at the White House arresting Congress.”',
       'Refugees or illegal immigrants living in Britain get a total yearly benefit of £29,900.',
       'On a cap-and-trade plan.',
       ''റോക്കറ്റ് വിക്ഷേപിച്ച് ഏതാനും നിമിഷങ്ങൾക്കുള്ളിൽ വായുവിൽ പൊട്ടിത്തെറിക്കുന്ന ഒരു വീഡിയോ വൈറൽ ആകുന്നു. ഈ വീഡിയോ ഒരു ഇന്ത്യൻ മിസൈലിന്റെ പരീക്ഷണം പരാജയപ്പെട്ടതാണെന്ന അവകാശവാദത്തോടെ ആണ് പ്രചരിക്കുന്നത്.'',
       ''What #BoycottTanishq is on Twitter and what it actually is. From one lakh twelve thousand to 200 rupees.'',
       ''पन्द्रह जून के बाद फिर से हो सकता है सम्पूर्ण लॉकडॉउन गृ

In [13]:
# For each duplicated claim, check whether all of its rows carry the same rating
# -> True marks a claim whose duplicates have conflicting ratings.
# The comprehension variable is local, so the count array `c` defined in an
# earlier cell is not overwritten and that cell stays re-runnable.
inconsistant_rating = [
    len(claims.loc[claims.claim == claim_text, "truth_rating"].unique()) != 1
    for claim_text in doubles
]
    

In [14]:
inconsistant_rating

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [15]:
# Keep only the duplicated claims flagged as having conflicting ratings.
doubles[np.asarray(inconsistant_rating, dtype=bool)]

Index(['Shamima Begum is back in the UK.', 'Covid-19 has a survival rate of 99.8%.'], dtype='object')

This could be solved more elegantly in the future.

In [16]:
claims[claims.claim == 'Shamima Begum is back in the UK.']

Unnamed: 0,ID,claim,date,truth_rating
17113,http://data.gesis.org/claimskg/creative_work/5...,Shamima Begum is back in the UK.,2020-05-19,OTHER
21325,http://data.gesis.org/claimskg/creative_work/7...,Shamima Begum is back in the UK.,2020-10-08,FALSE


In [17]:
claims[claims.claim == 'Covid-19 has a survival rate of 99.8%.']

Unnamed: 0,ID,claim,date,truth_rating
4884,http://data.gesis.org/claimskg/creative_work/1...,Covid-19 has a survival rate of 99.8%.,2021-02-19,OTHER
39832,http://data.gesis.org/claimskg/creative_work/d...,Covid-19 has a survival rate of 99.8%.,2021-06-18,FALSE


#### Remove ambiguous claims and reduce duplicate claims to a single row

In [18]:
claims.shape

(44387, 4)

In [19]:
# Remove every claim text that appears with more than one distinct truth
# rating. The ambiguous claims are derived from the data rather than
# hard-coded, so this step stays correct when the raw claims file changes.
ratings_per_claim = claims.groupby("claim")["truth_rating"].nunique(dropna=False)
ambiguous_claims = ratings_per_claim.index[ratings_per_claim > 1]
claims = claims[~claims.claim.isin(ambiguous_claims)]


In [20]:
claims = claims.drop_duplicates(subset="claim")

In [21]:
claims.shape

(44343, 4)

#### Claims with invalid date

In [22]:
claims.date.isnull().value_counts()

False    44343
Name: date, dtype: int64

### Remove claims that are too short

In [23]:
def count_words(claim):
    """Return the number of whitespace-separated tokens in ``claim``.

    Splitting on arbitrary whitespace (rather than on a single space) means
    that repeated spaces, tabs and newlines do not produce empty tokens, and
    an empty or whitespace-only claim counts as zero tokens.

    Parameters
    ----------
    claim : str
        The claim text.

    Returns
    -------
    int
        Number of tokens in the claim.
    """
    return len(claim.split())

In [30]:
claims["n_token"] = claims["claim"].progress_apply(count_words)

100%|██████████| 44300/44300 [00:00<00:00, 82652.27it/s] 


In [31]:
# Claims with fewer than 3 tokens are considered unreasonable and are dropped
claims = claims[claims.n_token > 2]

In [32]:
claims.shape

(44300, 5)

### Remove all claims that are not English (hopefully)

In [27]:
from langdetect import DetectorFactory, detect

In [28]:
def detect_language(claim):
    """Detect the language of ``claim``.

    Parameters
    ----------
    claim : str
        The claim text passed to ``detect``.

    Returns
    -------
    str
        The value returned by ``detect``, or ``"none"`` if detection raised
        an error.
    """
    try:
        lang = detect(claim)
    except Exception:
        # Best effort: any detection failure is mapped to "none". Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so the long-running apply can still be interrupted.
        lang = "none"
    return lang

In [33]:
# NOTE: Runs approximately 10 mins!
# Fix the detector seed so that language detection is reproducible.
DetectorFactory.seed = 0

# First, detect the language of each claim.
claims["claim_language"] = claims["claim"].progress_apply(detect_language)

100%|██████████| 44300/44300 [09:47<00:00, 75.45it/s] 


In [34]:
claims[claims.claim_language != "en"]["truth_rating"].value_counts()

FALSE    3692
OTHER    1394
TRUE      118
Name: truth_rating, dtype: int64

In [35]:
claims[claims.claim_language == "en"]["truth_rating"].value_counts()

FALSE    23684
OTHER     8893
TRUE      6519
Name: truth_rating, dtype: int64

Since the dataset is very unbalanced, we try to keep the non-English claims that are labeled "TRUE" or "OTHER" by translating them into English. The non-English claims labeled "FALSE" are removed, since translation is expensive.

In [36]:
from deep_translator import GoogleTranslator

In [37]:
def fast_translate(claim):
    """Translate ``claim`` into English using ``GoogleTranslator``.

    Returns the translated text, or an empty string when the translator
    returns ``None`` (some sentences cannot be translated).
    """
    translated = GoogleTranslator(target='en').translate(claim)
    # Normalise the "no translation" case to an empty string.
    return "" if translated is None else translated

In [38]:
to_translate = claims[claims["claim_language"] != "en"]

In [39]:
# Keep only the non-"FALSE" claims for translation. Take an explicit copy:
# later cells assign columns into this frame, and writing into a slice of
# `claims` raises SettingWithCopyWarning and may not update the intended data.
to_translate = to_translate[to_translate["truth_rating"] != "FALSE"].copy()

In [40]:
# Translate the selected claims into English (slow: roughly 20 minutes).
to_translate["claim"] = to_translate["claim"].progress_apply(fast_translate)

100%|██████████| 1512/1512 [16:03<00:00,  1.57it/s]


In [41]:
# recount n_token
to_translate["n_token"] = to_translate["claim"].apply(count_words)

In [78]:
# for convinience
#to_translate.reset_index(drop=True).to_csv("translated_claims_backup.csv")

### Merge the dataframes

In [42]:
claims = claims[claims["claim_language"] == "en"]

In [43]:
claims.shape

(39096, 6)

In [44]:
claims = pd.concat([claims, to_translate], axis=0)

In [45]:
#double check
claims.ID.describe()

count                                                 40608
unique                                                40608
top       http://data.gesis.org/claimskg/creative_work/0...
freq                                                      1
Name: ID, dtype: object

In [46]:
claims = claims.drop("claim_language", axis=1)

In [47]:
# final shape
claims.shape

(40608, 5)

### Save the pre-processed claims

In [48]:
claims.reset_index(drop=True).to_csv("../../../data/preprocessed_claims_new.csv")