# Reddit HealthAdviceChecKer Bot Model

## Imports

In [7]:
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)
import copy
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score
from collections import Counter

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Filtering the dataset to health claims only

In [8]:
# Format tags columns in df

def format_tags(df):
    tags = []
    tag_lists = []

    for subjects in df.subjects:
        if type(subjects) is str:
            s = subjects.split(",")
        else:
            if type(subjects) is list:
                s = subjects
            else:
                s = []
        s = [t.lstrip().rstrip() for t in s]
        tag_lists.append(s)
        for tag in s:
            tags.append(tag)
    df['tags'] = tag_lists
    return df, tags

In [9]:
# Select claims from relevant categories
health_tags = ['Health', 'Health News', "Health Care", 'Medical', 'Public Health', 'ADHD', 'Health / Medical', 'Medical Myths', 'diet']


In [10]:
# Helper function for masking dataframe with relevant tags
def health(x):
    for t in health_tags:
        if t in x:
            return True
    return False

## Read in data

In [11]:
# Importing the dataset
data_file_path = "/Users/gwenythportillowightman/OneDrive - Johns Hopkins/fall-2022/interpretable_ml_design/PUBHEALTH/train.tsv"          
df = pd.read_csv(data_file_path, sep='\t')
print(df.shape)

df, df_tags = format_tags(df)

mask = df['tags'].apply(lambda x: health(x))
df = df[mask]

# text_col contains the column name of where claims are found
# answer_col contains the column name of where post labels (true, false, etc.) are found
text_col = "text"
answer_col = "label"

# Rename the claim column to "text" and label column to "label_categorical"
df.rename(columns = {"claim": "text", "label": "label_categorical"}, inplace = True)
# Make the categorical labels into numbers (0, 1, 2, 3)
df["label"] = pd.factorize(df["label_categorical"])[0]
df = df.dropna(subset=[text_col])
df.reset_index(drop=True, inplace=True)

# Make a copy of the 'text' column
df['text_original'] = df['text']

print(f"Shape of df {df.shape}")

(9832, 9)
Shape of df (3638, 12)


In [12]:
already_appended = False

In [13]:
# Append the extra training sentences
if not already_appended:

    extra_sentences_file_path = '/Users/gwenythportillowightman/OneDrive - Johns Hopkins/fall-2022/interpretable_ml_design/extra_sentences.csv'
    extra_sentences_df = pd.read_csv(extra_sentences_file_path)

    df = pd.concat([df, extra_sentences_df])
    print(df.shape)
    
    already_appended = True

(3660, 12)


### Prepare to preprocess text claims

In [14]:
import re
nltk.download('stopwords')
s = stopwords.words('english')

def preprocess_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [ps.lemmatize(word) for word in text if not word in s]
    text = ' '.join(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Preprocess data

In [17]:
ps = nltk.wordnet.WordNetLemmatizer()
for index, row in df.iterrows():
    text = row['text']
    text = preprocess_text(text)
    df.loc[index, 'text'] = text
    X_train = df['text']
y_train = df['label']

print("Preprocessed claims")
print(df['text'][:10])

Preprocessed claims
0    case imported fruit vegetable contaminated hiv...
1    year old montana died lead poisoning eating ca...
2     sure wash fruit vegetable country report hiv aid
3    lead candle help reach daily nutritional healt...
4    apple cider vinegar mixed water speed metaboli...
5    apple cider vinegar great way lose weight drin...
6    dryer sheet full toxic chemical keep away chil...
7               using dryer sheet laundry give allergy
8    spoonful safflower oil day keep pound away saf...
9           trying lose weight used safflower oil cook
Name: text, dtype: object


## KNN Model

Returns the top k most similar sentences from training data

In [41]:
class KNN_Model():
    def __init__(self, k=3, distance_type = 'path', preprocess=True):
        self.k = k
        self.distance_type = distance_type
        self.preprocess = preprocess

    # This function is used for training
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train
        
    def split_input(self, input_sentence):
        test_corpus = []
        
        # Preprocess the full x_test input
        input_sentence_copy = copy.deepcopy(input_sentence)
        if self.preprocess:
            input_sentence_copy = preprocess_text(input_sentence_copy)
        
        # Preprocess sentences of the input
        sentences = sent_tokenize(input_sentence)
        for sentence in sentences:
            if self.preprocess:
                sentence = preprocess_text(sentence)
            test_corpus.append(sentence)
            
        if len(test_corpus) > 1:
            test_corpus.append(input_sentence_copy)
        
        print(test_corpus)
        
        return test_corpus

    # Returns the k most similar sentences for the input sentence
    # Predict returns the n similar sentences as a list of tuples [(sentence, score), (sentence, score), ...]
    # Takes in only one input at a time
    def predict(self, x_test):
        test_corpus = self.split_input(x_test)
            
        self.x_test = test_corpus
    
        # {score: [(index of sentence in `test_corpus`, similar sentence index in `dataset`)], ...}
        all_top_scores_dict = {}

        # Iterate over sentences of the input
        for i in range(len(self.x_test)):
            print(f"------- Getting similar sentences for \"{self.x_test[i]}\" ({i+1}/{len(self.x_test)}) ------")
            
            # {score: similar_sentence_index_in_`dataset`, ...}
            score_to_index_dict = {}
            
            # Iterate over training examples and find sentence similarity scores
            for j in range(self.x_train.shape[0]): 
                print(f'WHYBHIOJKHIJOJBKHBIJOJKBH BKJI {self.x_train[j]}')
                new_sentence = self.x_train[j].tolist()[0]
                print(f'i,j {self.x_test[i]} ,,, {new_sentence}')
                score = self.document_similarity(self.x_test[i], new_sentence)
                score_to_index_dict[score] = j

            sorted_scores = list(score_to_index_dict.keys())
            sorted_scores.sort(reverse=True)

            # Get the top k similar sentences for the current sentence (x_test[i])
            for k in range(self.k):
                score = sorted_scores[k]
                
                if score in all_top_scores_dict:
                    all_top_scores_dict[score].append( (i, score_to_index_dict[score]) )
                else:
                    all_top_scores_dict[score] = [ (i, score_to_index_dict[score]) ]
                    
        # Get the top k scoring sentences and similar sentences from all_top_scores_dict
        sorted_scores = list(all_top_scores_dict.keys())
        sorted_scores.sort(reverse=True)
        
        # [ ((index_of_sentence_in_input, index_of_similar_sentence_in_`dataset`), score), ...]
        similar_texts_list = []
        
        for k in range(self.k):
            score = sorted_scores[k]
            new_tuple = (all_top_scores_dict[score], score)
            similar_texts_list.append(new_tuple)

        return similar_texts_list
    
    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None

    def doc_to_synsets(self, doc):
        """
            Returns a list of synsets in document.
            Tokenizes and tags the words in the document doc.
            Then finds the first synset for each word/tag combination.
        If a synset is not found for that combination it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets
        """
        print(f"DOC DOC DOC {doc}")
        tokens = word_tokenize(doc+' ')
        
        l = []
        tags = nltk.pos_tag([tokens[0] + ' ']) if len(tokens) == 1 else nltk.pos_tag(tokens)
        
        for token, tag in zip(tokens, tags):
            syntag = self.convert_tag(tag[1])
            syns = wn.synsets(token, syntag)
            if (len(syns) > 0):
                l.append(syns[0])
        return l  
    
    def similarity_score(self, s1, s2, distance_type = 'path'):
        """
        Calculate the normalized similarity score of s1 onto s2
        For each synset in s1, finds the synset in s2 with the largest similarity value.
        Sum of all of the largest similarity values and normalize this value by dividing it by the
        number of largest similarity values found.

        Args:
          s1, s2: list of synsets from doc_to_synsets

        Returns:
          normalized similarity score of s1 onto s2
        """
        s1_largest_scores = []

        for i, s1_synset in enumerate(s1, 0):
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                    score = s1_synset.path_similarity(s2_synset, simulate_root = False)
                else:
                    score = s1_synset.wup_similarity(s2_synset)                  
                if score != None:
                    if score > max_score:
                        max_score = score

            if max_score != 0:
                s1_largest_scores.append(max_score)

        mean_score = np.mean(s1_largest_scores)

        return mean_score 
    
    def document_similarity(self, doc1, doc2):
        """Finds the similarity between doc1 and doc2"""

        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)
          
        return (self.similarity_score(synsets1, synsets2) + self.similarity_score(synsets2, synsets1)) / 2


SyntaxError: EOL while scanning string literal (<ipython-input-41-b5e9c3e92920>, line 54)

## Single example (dryer sheet)

In [None]:
k_value = 3
preprocess = False

classifier = KNN_Model(preprocess=preprocess, k=k_value, distance_type='path')
classifier.fit(X_train, y_train)

In [None]:
input_text = 'They are coated in toxic chemicals. Dryer sheets are one of the very worst things from a chemical allergy standpoint.'

input_sentences = classifier.split_input(input_text)

y_pred = classifier.predict(input_text)

In [None]:
print()
# print(f"Top {k_value} similar examples:")

unique_similar_sentences = []
all_similar_sentences = []
similar_sentence_to_original_sentence_dict = {}  # Value is a tuple like (original_sentence, score)

# Print out the k most similar sentences (across all sentences in input)
for i, result in enumerate (y_pred):
    original_sentence_index = result[0][0][0]
    
    if original_sentence_index == len(input_sentences):
        original_sentence = input_text
    else:
        original_sentence = input_sentences[original_sentence_index]
    
    similar_sentence_index = result[0][0][1]
    similar_sentence_data = df.iloc[[similar_sentence_index]].values.tolist()[0]
    
    text_original_column_index = 11
    label_categorical_column_index = 7
    
    score = result[1]
    
    similar_sentence = similar_sentence_data[text_original_column_index]
    all_similar_sentences.append(similar_sentence)
    
    original_sentence_score_tuple = (original_sentence, score)
    if similar_sentence in similar_sentence_to_original_sentence_dict:
        similar_sentence_to_original_sentence_dict[similar_sentence].append(original_sentence_score_tuple)
    else:
        similar_sentence_to_original_sentence_dict[similar_sentence] = [original_sentence_score_tuple]
        
    if similar_sentence in unique_similar_sentences:
        continue
    else:
        unique_similar_sentences.append(similar_sentence)
    
#     print(f'Original sentence: {original_sentence}')
#     print(f'Similar sentence: {similar_sentence}')
#     print(f'Label: {similar_sentence_data[label_categorical_column_index]}')
#     print(f'Score: {score}')
#     print()
    
similar_sentence_counter = Counter(all_similar_sentences)

In [None]:
most_common = similar_sentence_counter.most_common()

print("Most common similar sentences for input text")
print()

for (similar_sentence, count) in most_common:
    print(f'SIMILAR SENTENCE: \'{similar_sentence}\'; COUNT: {count}')
    original_sentence_score_tuple_list = similar_sentence_to_original_sentence_dict[similar_sentence]
    
    # Sort the tuples by the length of the first object in the tuple so that if the full input_text is 
    #    one of the similar sentences, it will be printed first
    original_sentence_score_tuple_list.sort(key=lambda x: len(x[0]), reverse=True)
    
    print('ORIGINAL SENTENCES:')
    for original_sentence_score_tuple in original_sentence_score_tuple_list:
        original_sentence = original_sentence_score_tuple[0]
        score = original_sentence_score_tuple[1]
        if original_sentence == input_text:
            print(f'   - [***FULL INPUT TEXT***] {original_sentence}  ({score})')
        else:
            print(f'   - {original_sentence}  ({score})')
        
    print()
    

## Use examples from PUBHEALTH

In [39]:
def prepare_df(data_file_path):
    if data_file_path.endswith('tsv'):
        df = pd.read_csv(data_file_path, sep='\t')
    else:  # assume csv
        df = pd.read_csv(data_file_path)

    df, df_tags = format_tags(df)

    mask = df['tags'].apply(lambda x: health(x))
    df = df[mask]

    # text_col contains the column name of where claims are found
    # answer_col contains the column name of where post labels (true, false, etc.) are found
    text_col = "text"
    answer_col = "label"

    # Rename the claim column to "text" and label column to "label_categorical"
    df.rename(columns = {"claim": "text", "label": "label_categorical"}, inplace = True)
    # Make the categorical labels into numbers (0, 1, 2, 3)
    df["label"] = pd.factorize(df["label_categorical"])[0]
    df = df.dropna(subset=[text_col])
    df.reset_index(drop=True, inplace=True)

    # Make a copy of the 'text' column
    df['text_original'] = df['text']
    
    return df

### Functions to process an input

In [19]:
def process_input_text(input_text, classifier):
    input_sentences = classifier.split_input(input_text)

    y_pred = classifier.predict(input_text)
    
    unique_similar_sentences = []
    all_similar_sentences = []
    similar_sentence_to_original_sentence_dict = {}  # Value is a tuple like (original_sentence, score)

    # Print out the k most similar sentences (across all sentences in input)
    for i, result in enumerate (y_pred):
        original_sentence_index = result[0][0][0]

        if original_sentence_index == len(input_sentences):
            original_sentence = input_text
        else:
            original_sentence = input_sentences[original_sentence_index]

        similar_sentence_index = result[0][0][1]
        similar_sentence_data = df.iloc[[similar_sentence_index]].values.tolist()[0]

        text_original_column_index = 11
        label_categorical_column_index = 7
#         label_column_index = 

        score = result[1]

        similar_sentence = similar_sentence_data[text_original_column_index]
        all_similar_sentences.append(similar_sentence)

        original_sentence_score_tuple = (original_sentence, score)
        if similar_sentence in similar_sentence_to_original_sentence_dict:
            similar_sentence_to_original_sentence_dict[similar_sentence].append(original_sentence_score_tuple)
        else:
            similar_sentence_to_original_sentence_dict[similar_sentence] = [original_sentence_score_tuple]

        if similar_sentence in unique_similar_sentences:
            continue
        else:
            unique_similar_sentences.append(similar_sentence)

    similar_sentence_counter = Counter(all_similar_sentences)
    
    most_common = similar_sentence_counter.most_common()

    print(" ~~~ Most common similar sentences for input text ~~~")
    print()

    for (similar_sentence, count) in most_common:
        print(f'SIMILAR SENTENCE: \'{similar_sentence}\'; COUNT: {count}')
        original_sentence_score_tuple_list = similar_sentence_to_original_sentence_dict[similar_sentence]

        # Sort the tuples by the length of the first object in the tuple so that if the full input_text is 
        #    one of the similar sentences, it will be printed first
        original_sentence_score_tuple_list.sort(key=lambda x: len(x[0]), reverse=True)

        print()

### 10 examples of test data

In [None]:
test_data_file_path = "/Users/gwenythportillowightman/OneDrive - Johns Hopkins/fall-2022/interpretable_ml_design/PUBHEALTH/test.tsv"          

test_df = prepare_df(test_data_file_path)

test_data_subset = test_df[:10]

for index, row in test_df_subset.iterrows():
    input_text = row['text']
    process_input_text(input_text, classifier)


### Examples from [User Study Examples](https://docs.google.com/spreadsheets/d/1BF-PR27TVwq9P6pcZQ9eHCOuQVP9nT989nmta5CbcGM/edit#gid=63935314)

In [39]:
# Classifier for user study examples
k_value = 5
preprocess = False

classifier = KNN_Model(preprocess=preprocess, k=k_value, distance_type='path')
classifier.fit(X_train, y_train)

In [40]:
user_study_examples_file_path = '/Users/gwenythportillowightman/OneDrive - Johns Hopkins/fall-2022/interpretable_ml_design/user_study_examples.csv'

USE_df = pd.read_csv(user_study_examples_file_path)

for index, row in USE_df.iterrows():
    input_text = row['Claims']
    process_input_text(input_text, classifier)

['You can get HIV from fruits or vegetables from other countries.']
['You can get HIV from fruits or vegetables from other countries.']
------- Getting similar sentences for "You can get HIV from fruits or vegetables from other countries." (1/1) ------
i,j You can get HIV from fruits or vegetables from other countries. ,,, case imported fruit vegetable contaminated hiv positive blood
DOC DOC DOC You can get HIV from fruits or vegetables from other countries.
DOC DOC DOC case imported fruit vegetable contaminated hiv positive blood
i,j You can get HIV from fruits or vegetables from other countries. ,,, year old montana died lead poisoning eating candy cane shaped candle
DOC DOC DOC You can get HIV from fruits or vegetables from other countries.
DOC DOC DOC year old montana died lead poisoning eating candy cane shaped candle
i,j You can get HIV from fruits or vegetables from other countries. ,,, sure wash fruit vegetable country report hiv aid
DOC DOC DOC You can get HIV from fruits or v

AttributeError: 'str' object has no attribute 'tolist'