In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
# Fetch the NLTK resources this notebook relies on. The calls are safe to repeat:
# the captured output below shows NLTK reporting already-present packages as up to date.
nltk.download('genesis')
nltk.download('wordnet')
# presumably needed by word_tokenize (punkt) and nltk.pos_tag (perceptron tagger) — TODO confirm
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Information-content object built from the Genesis corpus.
# NOTE(review): genesis_ic is not referenced again in the visible cells; the two
# positional arguments look like weight_senses_equally and smoothing — confirm
# against the NLTK docs before relying on it.
genesis_ic = wn.ic(genesis, False, 0.0)

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [81]:
class KNN_NLC_Classifer():
    """Nearest-neighbour text classifier driven by WordNet synset similarity.

    Each document is mapped to a list of WordNet synsets; the similarity of two
    documents is the symmetric average of their directed best-match scores.
    ``predict`` returns, for every test document, the label of the single most
    similar training document.

    NOTE: ``k`` is stored for API compatibility but is not used — only the
    1-nearest-neighbour case is implemented. The class name keeps its original
    spelling so existing callers continue to work.
    """

    def __init__(self, k=1, distance_type = 'path'):
        """
        Args:
            k: number of neighbours (stored only; prediction always uses 1).
            distance_type: 'path' selects ``Synset.path_similarity``; any other
                value selects ``Synset.wup_similarity``.
        """
        self.k = k
        self.distance_type = distance_type

    # This function is used for training
    def fit(self, x_train, y_train):
        """Store the training documents and their labels (no model is built).

        Args:
            x_train: sequence of document strings (list, array or Series).
            y_train: sequence of labels aligned position-by-position with x_train.
        """
        self.x_train = x_train
        self.y_train = y_train

    @staticmethod
    def _item_at(seq, position):
        """Return the element at integer ``position``.

        Uses ``.iloc`` when available so that a pandas Series whose index is not
        0..n-1 is still read positionally; falls back to plain indexing for
        lists and arrays.
        """
        indexer = getattr(seq, 'iloc', None)
        return seq[position] if indexer is None else indexer[position]

    # This function runs the K(1) nearest neighbour algorithm and
    # returns the label with closest match.
    def predict(self, x_test):
        """Predict a label for every document in ``x_test``.

        Args:
            x_test: sequence of document strings.

        Returns:
            list with one label per test document. When no training document
            has a similarity above 0, the label of the first training document
            is returned (same as the original behaviour).
        """
        self.x_test = x_test
        y_predict = []
        if len(x_test) == 0:
            return y_predict

        # Tokenising/tagging is the expensive part: do it once per training
        # document instead of once per (test, train) pair.
        n_train = len(self.x_train)
        train_synsets = [
            self.doc_to_synsets(self._item_at(self.x_train, j))
            for j in range(n_train)
        ]

        for i in range(len(x_test)):
            test_synsets = self.doc_to_synsets(self._item_at(x_test, i))
            max_sim = 0
            max_index = 0
            for j, candidate_synsets in enumerate(train_synsets):
                temp = self._synsets_similarity(test_synsets, candidate_synsets)
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
                    # Trace every new best match so the decision can be inspected.
                    print(f"Similar sentence: {self._item_at(self.x_train, j)}")
            y_predict.append(self._item_at(self.y_train, max_index))
        return y_predict

    def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        # Unknown leading letters map to None (no part-of-speech restriction).
        return tag_dict.get(tag[0])

    def doc_to_synsets(self, doc):
        """
        Returns a list of synsets in document.
        Tokenizes and tags the words in the document doc.
        Then finds the first synset for each word/tag combination.
        If a synset is not found for that combination it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets
        """
        tokens = word_tokenize(doc+' ')

        # Single-token documents are tagged with a trailing space appended to
        # the token, exactly as the original code did.
        if len(tokens) == 1:
            tags = nltk.pos_tag([tokens[0] + ' '])
        else:
            tags = nltk.pos_tag(tokens)

        synsets = []
        # Each entry of `tags` is a (word, pos) pair; only the pos part is used.
        for token, tag in zip(tokens, tags):
            syntag = self.convert_tag(tag[1])
            syns = wn.synsets(token, syntag)
            if len(syns) > 0:
                synsets.append(syns[0])
        return synsets

    def similarity_score(self, s1, s2, distance_type = 'path'):
        """
        Calculate the normalized similarity score of s1 onto s2
        For each synset in s1, finds the synset in s2 with the largest similarity value.
        Sum of all of the largest similarity values and normalize this value by dividing it by the
        number of largest similarity values found.

        Args:
          s1, s2: list of synsets from doc_to_synsets
          distance_type: 'path' for path_similarity, anything else for wup_similarity

        Returns:
          normalized similarity score of s1 onto s2; 0.0 when no synset of s1
          has a non-zero similarity with any synset of s2 (previously this
          produced nan and a numpy RuntimeWarning).
        """
        s1_largest_scores = []

        for s1_synset in s1:
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                    score = s1_synset.path_similarity(s2_synset, simulate_root = False)
                else:
                    score = s1_synset.wup_similarity(s2_synset)
                # The similarity calls may return None when no path exists.
                if score is not None and score > max_score:
                    max_score = score

            if max_score != 0:
                s1_largest_scores.append(max_score)

        if not s1_largest_scores:
            return 0.0
        return np.mean(s1_largest_scores)

    def _synsets_similarity(self, synsets1, synsets2):
        """Symmetric similarity of two synset lists using the configured distance_type."""
        forward = self.similarity_score(synsets1, synsets2, self.distance_type)
        backward = self.similarity_score(synsets2, synsets1, self.distance_type)
        return (forward + backward) / 2

    def document_similarity(self,doc1, doc2):
        """Finds the symmetrical similarity between doc1 and doc2"""

        synsets1 = self.doc_to_synsets(doc1)
        synsets2 = self.doc_to_synsets(doc2)

        return self._synsets_similarity(synsets1, synsets2)


In [23]:
# Quick sanity check: two near-synonymous sentences should score well above zero.
doc1, doc2 = 'I like rains', 'I like showers'
x = KNN_NLC_Classifer(k=1)
similarity = x.document_similarity(doc1, doc2)
print("Test Similarity Score: ", similarity)

Test Similarity Score:  0.6946386946386947


In [61]:
# format tags columns in df

def format_tags(df):
    """Add a 'tags' column holding each row's subjects as a cleaned list.

    A string in ``df.subjects`` is split on commas, a list is used as-is, and
    any other value (e.g. a missing entry) becomes an empty list. Surrounding
    whitespace is stripped from every tag.

    Args:
        df: DataFrame with a ``subjects`` column. It is modified in place.

    Returns:
        (df, tags): the same DataFrame with the new 'tags' column, and a flat
        list of every tag in row order (duplicates kept).
    """
    per_row = []
    flattened = []

    for raw in df.subjects:
        kind = type(raw)
        if kind is str:
            parts = raw.split(",")
        elif kind is list:
            parts = raw
        else:
            parts = []
        cleaned = [part.strip() for part in parts]
        per_row.append(cleaned)
        flattened.extend(cleaned)

    df['tags'] = per_row
    return df, flattened

In [65]:
# Subject tags that mark a row as health-related; read by health() when filtering rows.
health_tags = ['Health', 'Health News', "Health Care", 'Medical', 'Public Health']


In [66]:
def health(x):
    """Return True when at least one entry of the global ``health_tags`` is in ``x``.

    ``x`` is tested with the ``in`` operator, so a list is matched by exact
    element and a string by substring.
    """
    return any(tag in x for tag in health_tags)

In [80]:
# 1. Importing the dataset
# Load the training split (train.tsv under the PUBHEALTH directory) and keep
# only the rows whose subject tags mark them as health-related.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
FILENAME = "/Users/gwenythportillowightman/OneDrive - Johns Hopkins/fall-2022/interpretable_ml_design/PUBHEALTH/train.tsv"          

dataset = pd.read_csv(FILENAME, sep='\t')
dataset, dataset_tags = format_tags(dataset)

# Explicit copy: the filtered frame gets new columns below, and writing into a
# slice of the original is what triggers pandas' SettingWithCopy warning.
mask = dataset['tags'].apply(health)
dataset = dataset[mask].copy()

text_col = "text"
answer_col = "label"

# Keep the original string label as 'label_categorical' and expose an integer
# code under 'label'; the claim text becomes the 'text' column.
dataset = dataset.rename(columns={"claim": "text", "label": "label_categorical"})
dataset["label"] = pd.factorize(dataset["label_categorical"])[0]

# Rows without text cannot be compared; the index is reset so positions run 0..n-1.
dataset = dataset.dropna(subset=[text_col]).reset_index(drop=True)

# Despite its name this is the number of rows (claims), not a word count.
Num_Words = dataset.shape[0]

print(dataset.head(15))
print("\nSize of input file is ", dataset.shape)

   claim_id                                               text  \
0      8713  Britain to reveal trial criteria for coronavir...   
1      2768  U.S. says results encouraging for healthcare d...   
2      2717  Latest trial in J&J talc litigations gets unde...   
3      5793  Democrats hoping to flip House not just trash-...   
4      2981  Sex tech from women-led startups pops up at CE...   
5     29528                         Waxed apples cause cancer.   
6     15232  Rhode Island will become just the second state...   
7      7453  Brazil cities lurch to lockdowns amid virus cr...   
8      8069  Slovakia's new government to sharply ramp up c...   
9     26723       The coronavirus is “simply the common cold.”   
10     5243  Massachusetts to help test addiction treatment...   
11     4639  ALS patient behind ice bucket challenge: I wil...   
12     3801  State Senate leader outlines agenda as lawmake...   
13    14336  "Rick Scott's Starbucks heckler Says Rick Scot...   
14     873

In [70]:
# Inspect the column names after the rename/factorize step.
dataset.columns

Index(['claim_id', 'text', 'date_published', 'explanation', 'fact_checkers',
       'main_text', 'sources', 'label_categorical', 'subjects', 'tags',
       'label'],
      dtype='object')

In [71]:
# Preview the claim text column before preprocessing.
print(dataset['text'])

0       Britain to reveal trial criteria for coronavir...
1       U.S. says results encouraging for healthcare d...
2       Latest trial in J&J talc litigations gets unde...
3       Democrats hoping to flip House not just trash-...
4       Sex tech from women-led startups pops up at CE...
                              ...                        
3591    Miami-Dade is "the first community in the worl...
3592    Venezuela expands quarantine as number of coro...
3593    AstraZeneca's infant respiratory drug prioriti...
3594    Testicular cancer deaths double with after 40 ...
3595     The FDA published “conclusive proof” that the...
Name: text, Length: 3596, dtype: object


In [72]:
# `re` is not used in this cell, but the prediction cell further down relies on
# it being imported here.
import re
nltk.download('stopwords')
s = stopwords.words('english')
#add additional stop words
s.extend(['today', 'tomorrow', 'outside', 'out', 'there'])
# NOTE: despite the name `ps`, this is a WordNet lemmatizer, not a stemmer.
ps = nltk.wordnet.WordNetLemmatizer()

# Set copy of the stop words for constant-time membership tests; `s` itself
# stays a list because later cells read it.
stop_word_set = set(s)


def clean_text(text):
    """Lower-case, split on whitespace, drop stop words, lemmatize, and re-join.

    Punctuation is left in place here (the prediction cell, by contrast,
    replaces non-letters with spaces before the same steps).
    """
    words = text.lower().split()
    return ' '.join(ps.lemmatize(word) for word in words if word not in stop_word_set)


# Column-wise apply avoids the row-by-row .loc loop, which only worked when the
# index was exactly 0..n-1.
dataset['text'] = dataset['text'].apply(clean_text)

# Assigned once, after preprocessing (previously re-assigned on every loop
# iteration and left undefined for an empty frame).
X_train = dataset['text']
y_train = dataset['label']
print("Below is the sample of training text after removing the stop words")
print(dataset['text'][:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gwenythportillowightman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Below is the sample of training text after removing the stop words
0    britain reveal trial criterion coronavirus ant...
1    u.s. say result encouraging healthcare deliver...
2    latest trial j&j talc litigation get way calif...
3      democrat hoping flip house trash-talking trump.
4       sex tech women-led startup pop ce gadget show.
5                            waxed apple cause cancer.
6    rhode island become second state mandate vacci...
7    brazil city lurch lockdown amid virus crisis r...
8    slovakia's new government sharply ramp coronav...
9                    coronavirus “simply common cold.”
Name: text, dtype: object


In [83]:
# 4. Train the Classifier
classifier = KNN_NLC_Classifer(k=1, distance_type='path')
classifier.fit(X_train, y_train)

final_test_list = ['Cranberries help UTIs.']

# Test-time preprocessing: non-letters become spaces, then lower-case,
# stop-word removal and lemmatization.
test_corpus = []
for sentence in final_test_list:
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
    kept_words = [ps.lemmatize(word) for word in letters_only.lower().split() if word not in s]
    test_corpus.append(' '.join(kept_words))

print("predicting")
y_pred_final = classifier.predict(test_corpus)

# One row per input sentence: the raw text next to its predicted integer label.
output_df = pd.DataFrame({'text': final_test_list, 'numerical_pred_label': y_pred_final})
print(output_df)

predicting
Similar sentence: britain reveal trial criterion coronavirus antibody tests.
Similar sentence: latest trial j&j talc litigation get way california.
Similar sentence: massachusetts help test addiction treatment rating system.
Similar sentence: new california law help animals, fire victims, immigrants.
Similar sentence: unilever say new milkshake help control appetite.
Similar sentence: online tool help beachgoers avoid dirty waters.


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Similar sentence: pre-ski preparation help hitting slopes.
Similar sentence: starving help live longer? maybe not.
                     text  numerical_pred_label
0  Cranberries help UTIs.                     0
