In [115]:
from gensim import corpora, models, similarities
import pandas as pd
import jieba
from collections import Counter

In [146]:
# read in data

pubhealth = pd.read_csv("PUBHEALTH/train.tsv", sep='\t')
dev = pd.read_csv("PUBHEALTH/dev.tsv", sep='\t')
test = pd.read_csv("PUBHEALTH/test.tsv", sep='\t')

print(len(pubhealth))

# drop columns with no claims
pubhealth = pubhealth.dropna(subset='claim')
print(len(pubhealth))


dev = dev.dropna(subset='claim')
test = test.dropna(subset='claim')

9832
9824


In [147]:
# format tags columns in df

def format_tags(df):
    tags = []
    tag_lists = []

    for subjects in df.subjects:
        if type(subjects) is str:
            s = subjects.split(",")
        else:
            if type(subjects) is list:
                s = subjects
            else:
                s = []
        s = [t.lstrip().rstrip() for t in s]
        tag_lists.append(s)
        for tag in s:
            tags.append(tag)
    df['tags'] = tag_lists
    return df, tags

In [150]:
# format

pubhealth, pubhealth_tags = format_tags(pubhealth)
dev, dev_tags = format_tags(dev)
test, test_tags = format_tags(test)

## Use Gensim model with whole training set

In [151]:
# format pubhealth dataset for gensim

claims = list(pubhealth["claim"])
texts = [jieba.lcut(text) for text in claims]
dictionary = corpora.Dictionary(texts)
feature_cnt = len(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)     

In [152]:
# claim = 'Dryer sheets are one of the very worst things from a chemical allergy standpoint.'
claim = "Chlorine in water causes gray hair."

In [153]:
# claim

kw_vector = dictionary.doc2bow(jieba.lcut(claim))

# similarity

index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = feature_cnt)
sim = index[tfidf[kw_vector]]

In [154]:
sim_df = pd.DataFrame({"claim": claims, "similarity": sim})
sim = sim_df.sort_values("similarity", ascending=False).iloc[0]['similarity']
top_claim = sim_df.sort_values("similarity", ascending=False).iloc[0]['claim']

print("INITIAL CLAIM: ", claim)
print("")
print(sim)
print("")
print("SIMILAR CLAIM: ", top_claim)

INITIAL CLAIM:  Chlorine in water causes gray hair.

0.31056017

SIMILAR CLAIM:  People are putting hair removal creams in conditioner, causing hair loss.


In [155]:
pubhealth[pubhealth['claim'] == top_claim]

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects,tags
1572,36272,People are putting hair removal creams in cond...,"August 1, 2019",Is Someone Putting Nair in Hair Conditioner Bo...,Kim LaCapria,A pair of posts shared to Facebook on July 28 ...,https://www.truthorfiction.com/officemax-bulle...,unproven,"Fact Checks, Viral Content","[Fact Checks, Viral Content]"


# Gensim model with only health tagged claims

In [156]:
health_tags = ['Health', 'Health News', "Health Care", 'Medical', 'Public Health']

In [158]:
def health(x):
    for t in health_tags:
        if t in x:
            return True
    return False

mask = pubhealth.tags.apply(lambda x: health(x))
health_df = pubhealth[mask]

In [159]:
# format pubhealth dataset for gensim

claims = list(health_df["claim"])
texts = [jieba.lcut(text) for text in claims]
dictionary = corpora.Dictionary(texts)
feature_cnt = len(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus) 

In [160]:
#claim = 'Dryer sheets are one of the very worst things from a chemical allergy standpoint.'
#claim = "Chlorine in water causes gray hair."
# claim = 'Cranberry juice is good for UTI.'
claim = 'My hypothesis is that you should train your body how to properly and quickly flush itself of lactic acid without supplements.'

In [161]:
# claim

kw_vector = dictionary.doc2bow(jieba.lcut(claim))

# similarity

index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = feature_cnt)
sim = index[tfidf[kw_vector]]

In [162]:
sim_df = pd.DataFrame({"claim": claims, "similarity": sim})
sim = sim_df.sort_values("similarity", ascending=False).iloc[0]['similarity']
top_claim = sim_df.sort_values("similarity", ascending=False).iloc[0]['claim']

print("INITIAL CLAIM: ", claim)
print("")
print(sim)
print("")
print("SIMILAR CLAIM: ", top_claim)

INITIAL CLAIM:  My hypothesis is that you should train your body how to properly and quickly flush itself of lactic acid without supplements.

0.2626656

SIMILAR CLAIM:  Mixing cream of tartar with orange juice will flush nicotine from your body and help you quit smoking faster. 


In [163]:
pubhealth[pubhealth['claim'] == top_claim]

Unnamed: 0,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects,tags
8485,34070,Mixing cream of tartar with orange juice will ...,"October 24, 2019",Because no published support exists for the cl...,Alex Kasprak,A viral story on Shareably first published in ...,,False,Medical,[Medical]


## Gensim model with only health tagged claims tested on dev set

In [165]:
dev_mask = dev.tags.apply(lambda x: health(x))
dev_health_df = dev[mask]

  dev_health_df = dev[mask]
