In [1]:
import warnings
# Suppress every warning for the rest of the session. This also hides
# warnings raised by later cells, so remove it temporarily when debugging.
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
# Fetch the NLTK resources used by this notebook.
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Information-content table derived from the Genesis corpus.
# NOTE(review): `genesis_ic` is not referenced by any cell visible here - confirm it is still needed.
genesis_ic = wn.ic(genesis, False, 0.0)
import copy
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# NOTE(review): the three stemmers and roc_auc_score below do not appear to be
# used in the cells visible here - confirm before removing.
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Format tags columns in df

def format_tags(df):
    """Add a ``tags`` column holding the cleaned subject tags of every row.

    Each value of ``df.subjects`` is turned into a list of tags:
    a string is split on commas, a list is used as-is, and any other
    value (for example a missing entry) yields an empty list. Leading and
    trailing whitespace is stripped from every tag.

    Args:
        df: DataFrame with a ``subjects`` column. It is modified in place.

    Returns:
        A tuple ``(df, tags)`` where ``df`` is the same frame with the new
        ``tags`` column and ``tags`` is the flat list of every tag, in row order.
    """
    per_row_tags = []

    for raw in df.subjects:
        if type(raw) is str:
            parts = raw.split(",")
        elif type(raw) is list:
            parts = raw
        else:
            parts = []
        per_row_tags.append([part.strip() for part in parts])

    df['tags'] = per_row_tags
    all_tags = [tag for row in per_row_tags for tag in row]
    return df, all_tags

In [3]:
# Subject tags that mark a claim as belonging to a health-related category
health_tags = [
    'Health',
    'Health News',
    "Health Care",
    'Medical',
    'Public Health',
]

In [4]:
# Helper function for masking dataframe with relevant tags
def health(x):
    """Return True when at least one entry of ``health_tags`` is contained in ``x``.

    ``x`` is tested with the ``in`` operator, so it is expected to be a
    container of tags (the loading cell passes each row's ``tags`` list).
    """
    return any(tag in x for tag in health_tags)

In [5]:
# Importing the dataset
data_file_path = "/Users/gwenythportillowightman/OneDrive - Johns Hopkins/fall-2022/interpretable_ml_design/PUBHEALTH/train.tsv"

df = pd.read_csv(data_file_path, sep='\t')
df, df_tags = format_tags(df)

# Keep only the rows carrying at least one health tag. The explicit .copy()
# makes the filtered frame independent of the frame it was sliced from, so the
# column assignments below never write into a slice of another DataFrame.
mask = df['tags'].apply(health)
df = df[mask].copy()

# text_col contains the column name of where claims are found
# answer_col contains the column name of where post labels (true, false, etc.) are found
text_col = "text"
answer_col = "label"

# Rename the claim column to "text" and label column to "label_categorical"
df = df.rename(columns={"claim": "text", "label": "label_categorical"})
# Encode the categorical labels as integer codes, numbered in order of first
# appearance (pd.factorize assigns -1 to missing labels).
df["label"] = pd.factorize(df["label_categorical"])[0]
# Drop rows without a claim and renumber the index 0..n-1
df = df.dropna(subset=[text_col]).reset_index(drop=True)

# Keep the untouched claim text; the 'text' column is overwritten by preprocessing later
df['text_original'] = df['text']

print(f"Shape of df {df.shape}")

Shape of df (3596, 12)


In [6]:
def preprocess_text(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, words contained in the
    module-level stop-word collection ``s`` are dropped, and each remaining
    word is passed through ``ps.lemmatize``.

    Relies on the module-level names ``re``, ``s`` and ``ps``, which are
    created in a later cell; that cell must have run before this is called.

    Args:
        text: the string to clean.

    Returns:
        The cleaned words joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    kept_words = []
    for word in letters_only.lower().split():
        if word in s:
            continue
        kept_words.append(ps.lemmatize(word))
    return ' '.join(kept_words)

In [7]:
import re
nltk.download('stopwords')
# Stop words removed by preprocess_text (read there through the global `s`)
s = stopwords.words('english')

# Lemmatizer used by preprocess_text (read there through the global `ps`)
ps = nltk.wordnet.WordNetLemmatizer()

# Clean every claim in a single pass instead of writing the frame cell by cell.
df['text'] = df['text'].apply(preprocess_text)

# Assigned once, after preprocessing, so both names exist even for an empty frame.
X_train = df['text']
y_train = df['label']

print("preprocessed claims")
print(df['text'][:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


preprocessed claims
0    britain reveal trial criterion coronavirus ant...
1    u say result encouraging healthcare delivery r...
2    latest trial j j talc litigation get way calif...
3       democrat hoping flip house trash talking trump
4        sex tech woman led startup pop ce gadget show
5                             waxed apple cause cancer
6    rhode island become second state mandate vacci...
7    brazil city lurch lockdown amid virus crisis r...
8    slovakia new government sharply ramp coronavir...
9                       coronavirus simply common cold
Name: text, dtype: object


In [8]:
class KNN_NLC_Classifer():
    """Nearest-neighbour text matcher built on WordNet synset similarity.

    The instance stores a collection of training documents and, for a new
    piece of text, reports which training documents are most similar to it.
    Similarity between two documents is the symmetric average of
    ``similarity_score`` computed in both directions.

    Attributes:
        k: number of distinct top scores returned by ``predict``.
        distance_type: ``'path'`` for ``Synset.path_similarity``; any other
            value selects ``Synset.wup_similarity``.
    """

    def __init__(self, k=3, distance_type = 'path'):
        self.k = k
        self.distance_type = distance_type

    def fit(self, x_train, y_train):
        """Store the training documents and their labels.

        Args:
            x_train: ordered collection of already preprocessed documents
                (for example a pandas Series or a list of strings).
            y_train: labels matching ``x_train``. They are stored but not
                used by ``predict``, which only returns indices and scores.
        """
        self.x_train = x_train
        self.y_train = y_train

    def predict(self, x_test):
        """Find the training documents most similar to one input text.

        The input is split into sentences; every sentence and the full input
        are preprocessed with ``preprocess_text`` and compared against every
        training document. Takes only one input string at a time.

        Args:
            x_test: raw (unprocessed) input string.

        Returns:
            A list of at most ``k`` tuples ``(pairs, score)`` ordered by
            decreasing score. ``pairs`` is a list of
            ``(query_index, train_index)`` tuples that reached that score,
            where ``query_index`` points into ``self.x_test`` (the sentences
            followed by the full input) and ``train_index`` is the position
            of the document in ``x_train``. Pairs are ordered by query index,
            and for equal query index by decreasing train index, so the first
            pair is the one a last-tie-wins ranking would report.
            Comparisons whose similarity is undefined (NaN) are left out.
        """
        # Query corpus: each sentence of the input, then the input as a whole
        test_corpus = [preprocess_text(sentence) for sentence in sent_tokenize(x_test)]
        test_corpus.append(preprocess_text(x_test))
        self.x_test = test_corpus

        # Materialise the training documents so they are addressed by position,
        # whatever container (Series, array, list) was passed to fit().
        train_docs = list(self.x_train)
        # Tokenising and tagging a training document does not depend on the
        # query, so do it once instead of once per query sentence.
        train_synsets = [self.doc_to_synsets(doc) for doc in train_docs]

        # {score: [(index in test_corpus, index in training data), ...]}
        pairs_by_score = {}

        for query_index, query in enumerate(test_corpus):
            print(f"Getting similar sentences for {query}")
            query_synsets = self.doc_to_synsets(query)

            for train_index, candidate_synsets in enumerate(train_synsets):
                score = self._synsets_similarity(query_synsets, candidate_synsets)
                # NaN means no comparable synsets were found; it cannot be ranked
                if np.isnan(score):
                    continue
                pairs_by_score.setdefault(score, []).append((query_index, train_index))

        # Slicing never raises when fewer than k distinct scores are available
        top_scores = sorted(pairs_by_score, reverse=True)[:self.k]

        return [
            (sorted(pairs_by_score[score], key=lambda pair: (pair[0], -pair[1])), score)
            for score in top_scores
        ]

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets.

        Returns None when the first character of ``tag`` has no mapping.
        """
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        return tag_dict.get(tag[:1])

    def doc_to_synsets(self, doc):
        """
        Returns a list of synsets in document.

        Tokenizes and tags the words in the document doc, then takes the
        first synset returned for each word/tag combination. Words for which
        no synset is found are skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets
        """
        tokens = word_tokenize(doc + ' ')

        # NOTE(review): a single-token document is tagged with a trailing
        # space appended; kept unchanged because its purpose is not clear.
        if len(tokens) == 1:
            tagged = nltk.pos_tag([tokens[0] + ' '])
        else:
            tagged = nltk.pos_tag(tokens)

        synsets = []
        for token, (_, tag) in zip(tokens, tagged):
            candidates = wn.synsets(token, self.convert_tag(tag))
            if len(candidates) > 0:
                synsets.append(candidates[0])
        return synsets

    def similarity_score(self, s1, s2, distance_type = 'path'):
        """
        Calculate the normalized similarity score of s1 onto s2.

        For each synset in s1, finds the largest similarity to any synset in
        s2. Synsets whose largest similarity is zero or undefined are left
        out; the remaining values are averaged.

        Args:
          s1, s2: list of synsets from doc_to_synsets
          distance_type: 'path' uses path_similarity, anything else uses
              wup_similarity

        Returns:
          normalized similarity score of s1 onto s2, or NaN when no synset of
          s1 has a positive similarity to a synset of s2
        """
        best_scores = []

        for source in s1:
            best = 0
            for target in s2:
                if distance_type == 'path':
                    score = source.path_similarity(target, simulate_root=False)
                else:
                    score = source.wup_similarity(target)
                # The similarity call may return None; ignore those results
                if score is not None and score > best:
                    best = score

            if best != 0:
                best_scores.append(best)

        if not best_scores:
            # Same value np.mean would give for an empty list, minus the warning
            return float('nan')

        return np.mean(best_scores)

    def document_similarity(self, doc1, doc2):
        """Finds the similarity between doc1 and doc2 using ``self.distance_type``."""
        return self._synsets_similarity(self.doc_to_synsets(doc1), self.doc_to_synsets(doc2))

    def _synsets_similarity(self, synsets1, synsets2):
        """Average of the two directed similarity scores between two synset lists."""
        forward = self.similarity_score(synsets1, synsets2, self.distance_type)
        backward = self.similarity_score(synsets2, synsets1, self.distance_type)
        return (forward + backward) / 2


In [10]:
# 4. Train the Classifier
classifier = KNN_NLC_Classifer(k=3, distance_type='path')
classifier.fit(X_train, y_train)

# Named input_text so the builtin input() is not shadowed
input_text = 'They are coated in toxic chemicals. Dryer sheets are one of the very worst things from a chemical allergy standpoint.'

y_pred = classifier.predict(input_text)
print(y_pred)
print()
print(f"Top {classifier.k} similar examples:")

# Each result is ([(index of sentence in input, index of similar row in df), ...], score)
for matches, _score in y_pred:
    _, similar_sentence_index = matches[0]
    # Look columns up by name so the output does not depend on column order
    similar_row = df.iloc[similar_sentence_index]
    print(f'Original sentence: {similar_row["text_original"]}')
    print(f'Label: {similar_row["label_categorical"]}')
    print()

Getting similar sentences for coated toxic chemical
Getting similar sentences for dryer sheet one worst thing chemical allergy standpoint
Getting similar sentences for coated toxic chemical dryer sheet one worst thing chemical allergy standpoint
[([(0, 2041)], 0.7743055555555556), ([(0, 3378)], 0.7026515151515151), ([(0, 680)], 0.6563956876456877)]

Top 3 similar examples:
Original sentence: English Channel dolphins carry ‘toxic cocktail’ of chemicals.
Label: true

Original sentence: Born Basic Anti-Bac hand sanitizer was recalled in the U.S. after being found to contain methanol, a poisonous chemical.
Label: true

Original sentence: States, military clash on cleanup of toxic chemicals.
Label: true

