# Medical Misinformation Classification

In [None]:
# ! pip install nltk

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [3]:
#import train csv using pandas
df = pd.read_csv("/content/drive/MyDrive/cse445_medical_misinfo/NLP.csv")

In [4]:
#(samples, features)
df.shape

(1495, 2)

In [5]:
df.dtypes

Translation    object
Target          int64
dtype: object

In [6]:
df.head()

Unnamed: 0,Translation,Target
0,"Eat the leaves of diabetes tree / gynura tree,...",1
1,"No morning or afternoon insulin or tablets, th...",1
2,Removed by Qur'anic practices and Unani and Ho...,1
3,"Masturbation leads to premature ejaculation, w...",1
4,Masturbation causes regular headaches.,1


In [7]:
#Use this to check dataset ratio
#"target" is the class diagram

print((df.Target == 1).sum()) #Disaster
print((df.Target == 0).sum()) #No Disaster

699
796


In [8]:
#Preprocessing the dataset

import re
import string

#might be unnecessary for 445
def removeURL(text):
  url = re.compile(r"https?://\S+|www\.\S+")
  return url.sub(r"", text)

#remove punctuations if any
def removePunct(text):
  translator = str.maketrans("","", string.punctuation)
  return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
#checking URL cleaner
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in df.Translation:
  matches = pattern.findall(t)
  for match in matches:
    print(t)
    print(match)
    print(pattern.sub(r"", t))
  if len(matches) > 0:
    break

According to information available on the World Health Organization website, there is currently no specific treatment for monkeypox. However, smallpox vaccination monkeypox con... https://bangla.aajtak.in/lifestyle/photo/monkeypox-india-monkeypox-infection-symptoms-causes-treatment-who-sud-424932-2022-07-15-10
https://bangla.aajtak.in/lifestyle/photo/monkeypox-india-monkeypox-infection-symptoms-causes-treatment-who-sud-424932-2022-07-15-10
According to information available on the World Health Organization website, there is currently no specific treatment for monkeypox. However, smallpox vaccination monkeypox con... 


In [10]:
#df[text] means it is being assigned to the text column
#df.text.map means that the text column is being used
df['Translation'] = df.Translation.map(removeURL)
df['Translation'] = df.Translation.map(removePunct)

In [11]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
  filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
  return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# print stop words
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [13]:
df["Translation"] = df.Translation.map(remove_stopwords)
df.Translation #print text column

0       eat leaves diabetes tree gynura tree keep diab...
1       morning afternoon insulin tablets time diabete...
2       removed quranic practices unani homeo treatmen...
3       masturbation leads premature ejaculation hinde...
4                   masturbation causes regular headaches
                              ...                        
1490    psoriasis iron deficiency anemia diabetes hear...
1491    cut nails shower always keep nails clean dry c...
1492    going salon nail care make sure tools products...
1493    corners sides nail cannot cut cutting nails ti...
1494    shoes cause pain pressure nails worn bite nail...
Name: Translation, Length: 1495, dtype: object

We cannot put strings into an RNN directly

Therefore we need to tranform strings to a represntation

In [14]:
#Count all the unique words in the text column
from collections import Counter

def counterWord(inputText):
  count = Counter()
  for text in inputText.values:
    for word in text.split():
      count[word] += 1
  return count

counter = counterWord(df.Translation)


counter

Counter({'eat': 123,
         'leaves': 121,
         'diabetes': 131,
         'tree': 13,
         'gynura': 1,
         'keep': 86,
         'control': 53,
         'morning': 45,
         'afternoon': 2,
         'insulin': 37,
         'tablets': 2,
         'time': 78,
         'completely': 16,
         'controlled': 3,
         'plant': 9,
         'rich': 85,
         'foreign': 1,
         'medicinal': 6,
         'properties': 71,
         'removed': 7,
         'quranic': 1,
         'practices': 1,
         'unani': 4,
         'homeo': 1,
         'treatment': 50,
         'inshaallah': 3,
         'physical': 21,
         'problems': 130,
         '1': 64,
         'sexual': 35,
         'impotence': 3,
         '2': 47,
         'weakness': 17,
         '3': 32,
         'meh': 1,
         'promeh': 1,
         'disease': 130,
         '4': 15,
         '5': 19,
         'jaundice': 3,
         'masturbation': 15,
         'leads': 2,
         'premature': 4,
         '

In [15]:
#Count number of unique words, needed for Tokenizing data
uniqueWords = len(counter)
print("Number of unique words:",uniqueWords)

Number of unique words: 4501


In [16]:
#printing out counter words
counter.most_common(5)

[('body', 389), ('blood', 289), ('also', 263), ('helps', 207), ('water', 195)]

#Data set spliting time

In [17]:
#spliting dataset into training and validation sets by ratio
trainSize = int(df.shape[0] * 0.8)

train_df = df[:trainSize]
val_df = df[trainSize:]

#split text and labels into numpy arrays
train_text = train_df.Translation.to_numpy() # text to numpy
train_label = train_df.Target.to_numpy() # label to numpy
val_text = val_df.Translation.to_numpy()
val_label = val_df.Target.to_numpy()

#check if proper shapes
train_text.shape, val_text.shape

((1196,), (299,))

# Tokenizing Text Data

In [18]:
# Tokenize the data
# change the texts into a sequence of integers
from keras.preprocessing.text import Tokenizer

# vectorize a text corpus here
tokenizer = Tokenizer( num_words = uniqueWords )
tokenizer.fit_on_texts(train_text) #fit only on training

# store the word index from Tokenizer
# Tokenzier indexes all the words by default

wordIndex = tokenizer.word_index

#print wordIndex
wordIndex


{'body': 1,
 'blood': 2,
 'also': 3,
 'helps': 4,
 'water': 5,
 'juice': 6,
 'skin': 7,
 'honey': 8,
 'leaves': 9,
 'turmeric': 10,
 'beneficial': 11,
 'problems': 12,
 'eating': 13,
 'people': 14,
 'increases': 15,
 'vitamin': 16,
 'various': 17,
 'raw': 18,
 'reduce': 19,
 'eat': 20,
 'heart': 21,
 'many': 22,
 'help': 23,
 'contains': 24,
 'disease': 25,
 'fever': 26,
 'diabetes': 27,
 'prevent': 28,
 'stomach': 29,
 'result': 30,
 'food': 31,
 'rich': 32,
 'levels': 33,
 'increase': 34,
 'get': 35,
 'pressure': 36,
 'day': 37,
 'sugar': 38,
 'cholesterol': 39,
 'risk': 40,
 'benefits': 41,
 'properties': 42,
 'diseases': 43,
 'like': 44,
 'high': 45,
 'daily': 46,
 'antioxidants': 47,
 'hair': 48,
 'iron': 49,
 'due': 50,
 'drink': 51,
 'role': 52,
 'reduces': 53,
 'black': 54,
 'pain': 55,
 'one': 56,
 'good': 57,
 'brain': 58,
 'healthy': 59,
 'health': 60,
 'oil': 61,
 'every': 62,
 'dengue': 63,
 'cancer': 64,
 '1': 65,
 'contain': 66,
 'hot': 67,
 'lot': 68,
 'system': 69,
 'c

In [19]:
#create sequences with unique index from the sentences
train_sequence = tokenizer.texts_to_sequences(train_text)
val_sequence = tokenizer.texts_to_sequences(val_text)

#compare
print(train_text[10:15])
print(train_sequence[10:15])

['messageoil use increases penile strength eliminates penile laxity due masturbation penile tenderness strengthens blood vessels increases penile erection slowness making penile longer stronger'
 'mucus cant stopped taking medicine stop mucus herbal naturo stop mucus ibs better'
 'take homeopathy pills gastric ulcer healed hardness chest pain chest pain day'
 'ibs disease per medical science maintain food code enough keep save ibs thanks 01712591008'
 'sexual power decreases take psychiatric medicine']
[[2054, 135, 15, 719, 513, 952, 719, 2055, 50, 329, 719, 2056, 292, 2, 331, 15, 719, 2057, 2058, 830, 719, 628, 831], [629, 953, 1134, 350, 141, 630, 629, 631, 2059, 630, 629, 950, 245], [128, 2060, 1486, 514, 1135, 2061, 2062, 461, 55, 461, 55, 37], [950, 25, 385, 717, 1136, 253, 31, 2063, 262, 79, 1137, 950, 1487, 2064], [133, 207, 246, 128, 2065, 141]]


In [20]:
#We want to have the same length for every sequece so we add padding
#pad the sequence to have the same length

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a squence
# NEEDS TO BE ADJUSTED MANUALLY
max_length =  20

train_padded = pad_sequences(train_sequence, maxlen = max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequence, maxlen = max_length, padding="post", truncating="post")

#check output
train_padded.shape, val_padded.shape

((1196, 20), (299, 20))

In [21]:
#compare text, sequence, padded sequence
print(train_text[10])
print(train_sequence[10])
print(train_padded[10])

messageoil use increases penile strength eliminates penile laxity due masturbation penile tenderness strengthens blood vessels increases penile erection slowness making penile longer stronger
[2054, 135, 15, 719, 513, 952, 719, 2055, 50, 329, 719, 2056, 292, 2, 331, 15, 719, 2057, 2058, 830, 719, 628, 831]
[2054  135   15  719  513  952  719 2055   50  329  719 2056  292    2
  331   15  719 2057 2058  830]


In [22]:
# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in wordIndex.items()])
reverse_word_index

{1: 'body',
 2: 'blood',
 3: 'also',
 4: 'helps',
 5: 'water',
 6: 'juice',
 7: 'skin',
 8: 'honey',
 9: 'leaves',
 10: 'turmeric',
 11: 'beneficial',
 12: 'problems',
 13: 'eating',
 14: 'people',
 15: 'increases',
 16: 'vitamin',
 17: 'various',
 18: 'raw',
 19: 'reduce',
 20: 'eat',
 21: 'heart',
 22: 'many',
 23: 'help',
 24: 'contains',
 25: 'disease',
 26: 'fever',
 27: 'diabetes',
 28: 'prevent',
 29: 'stomach',
 30: 'result',
 31: 'food',
 32: 'rich',
 33: 'levels',
 34: 'increase',
 35: 'get',
 36: 'pressure',
 37: 'day',
 38: 'sugar',
 39: 'cholesterol',
 40: 'risk',
 41: 'benefits',
 42: 'properties',
 43: 'diseases',
 44: 'like',
 45: 'high',
 46: 'daily',
 47: 'antioxidants',
 48: 'hair',
 49: 'iron',
 50: 'due',
 51: 'drink',
 52: 'role',
 53: 'reduces',
 54: 'black',
 55: 'pain',
 56: 'one',
 57: 'good',
 58: 'brain',
 59: 'healthy',
 60: 'health',
 61: 'oil',
 62: 'every',
 63: 'dengue',
 64: 'cancer',
 65: '1',
 66: 'contain',
 67: 'hot',
 68: 'lot',
 69: 'system',
 70

In [23]:
def decode(sequence):
  return " ".join([reverse_word_index.get(idx,"?") for idx in sequence])

In [24]:
decoded_text = decode(train_sequence[10])

print(train_sequence[10])
print(decoded_text)

[2054, 135, 15, 719, 513, 952, 719, 2055, 50, 329, 719, 2056, 292, 2, 331, 15, 719, 2057, 2058, 830, 719, 628, 831]
messageoil use increases penile strength eliminates penile laxity due masturbation penile tenderness strengthens blood vessels increases penile erection slowness making penile longer stronger


# ITS MODEL TIME

In [27]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [28]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [31]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
vocab_size = 20000  # Only consider the top 20k words
maxlen = 20  # Only consider the first 200 words of each movie review
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [35]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    train_padded, train_label, batch_size=32, epochs=5, validation_data=(val_padded, val_label)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
#LSTM
from tensorflow.keras import layers

# we use word embedding for better performance
# embedding is a dense vector of floating point values

model = keras.models.Sequential()
model.add(layers.Embedding(uniqueWords, 32, input_length=max_length))

#layer will take as input an integer matrix of size (batch, input_length),
# and the largest integer of wordIndex in the input should be no larger than the num_words (vocab size)
# model.output_shape is (None, input_length, 32), where 'None' is the batch dimension.

model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            144032    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 168,929
Trainable params: 168,929
Non-trainable params: 0
_________________________________________________________________


In [None]:
# as we are doing binary classificiation
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

  super(Adam, self).__init__(name, **kwargs)


Training with LSTM

In [None]:
model.fit(train_padded, train_label, epochs=20, validation_data=(val_padded, val_label), verbose =2)

Epoch 1/20
38/38 - 4s - loss: 0.6870 - accuracy: 0.5493 - val_loss: 0.7590 - val_accuracy: 0.2475 - 4s/epoch - 97ms/step
Epoch 2/20
38/38 - 1s - loss: 0.5122 - accuracy: 0.7901 - val_loss: 0.7254 - val_accuracy: 0.5819 - 569ms/epoch - 15ms/step
Epoch 3/20
38/38 - 1s - loss: 0.2489 - accuracy: 0.9147 - val_loss: 0.8647 - val_accuracy: 0.5953 - 581ms/epoch - 15ms/step
Epoch 4/20
38/38 - 1s - loss: 0.1703 - accuracy: 0.9415 - val_loss: 0.7990 - val_accuracy: 0.6087 - 585ms/epoch - 15ms/step
Epoch 5/20
38/38 - 1s - loss: 0.1374 - accuracy: 0.9557 - val_loss: 0.9399 - val_accuracy: 0.6087 - 578ms/epoch - 15ms/step
Epoch 6/20
38/38 - 1s - loss: 0.1165 - accuracy: 0.9590 - val_loss: 1.3218 - val_accuracy: 0.5585 - 579ms/epoch - 15ms/step
Epoch 7/20
38/38 - 1s - loss: 0.1176 - accuracy: 0.9557 - val_loss: 1.1034 - val_accuracy: 0.6154 - 557ms/epoch - 15ms/step
Epoch 8/20
38/38 - 1s - loss: 0.1005 - accuracy: 0.9657 - val_loss: 1.0157 - val_accuracy: 0.6054 - 581ms/epoch - 15ms/step
Epoch 9/20


<keras.callbacks.History at 0x7f9373655730>

In [None]:
# predictions
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

