# Medical Misinformation Classification

In [1]:
# ! pip install nltk

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [2]:
# Load the training CSV into a pandas DataFrame
train_csv_path = "/content/drive/MyDrive/cse445_medical_misinfo/train.csv"
df = pd.read_csv(train_csv_path)

In [3]:
#(samples, features)
df.shape

(7613, 5)

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Check the class balance of the dataset.
# "target" is the class label column: 1 = disaster, 0 = no disaster.

# Print the number of rows per label, disaster first and no-disaster second.
for label in (1, 0):
  print((df.target == label).sum())

3271
4342


In [6]:
#Preprocessing the dataset

import re
import string

#might be unnecessary for 445
def removeURL(text):
  """Return ``text`` with every ``http(s)://...`` or ``www. ...`` link removed."""
  return re.sub(r"https?://\S+|www\.\S+", "", text)

#remove punctuations if any
def removePunct(text):
  """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
  # Mapping a character to None in a translation table deletes it.
  drop_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(drop_table)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# Sanity-check the URL cleaner on the first tweet that contains a link
pattern = re.compile(r"https?://\S+|www\.\S+")
for tweet in df.text:
  found = pattern.findall(tweet)
  if not found:
    continue
  # For each link found: show the raw tweet, the link, and the cleaned tweet
  for url in found:
    print(tweet)
    print(url)
    print(pattern.sub(r"", tweet))
  break

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
http://t.co/lHYXEOHY6C
@bbcmtd Wholesale Markets ablaze 


In [8]:
# df['text'] = ... writes the result back into the text column
# df.text.map(f) applies f to every entry of the text column
# Strip URLs first, then punctuation.
df['text'] = df.text.map(removeURL).map(removePunct)

In [9]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
  """Lower-case the words of ``text`` and drop those found in the ``stop`` set."""
  kept = []
  for word in text.split():
    lowered = word.lower()
    if lowered not in stop:
      kept.append(lowered)
  return " ".join(kept)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# print stop words
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [11]:
df["text"] = df["text"].map(remove_stopwords)
df["text"]  # display the cleaned text column

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

We cannot put strings into an RNN directly

Therefore, we need to transform the strings into a numeric representation

In [12]:
#Count all the unique words in the text column
from collections import Counter

def counterWord(inputText):
  """Count how often each whitespace-separated word occurs across ``inputText.values``."""
  return Counter(word for text in inputText.values for word in text.split())

counter = counterWord(df.text)


counter

Counter({'deeds': 2,
         'reason': 20,
         'earthquake': 50,
         'may': 88,
         'allah': 9,
         'forgive': 2,
         'us': 164,
         'forest': 65,
         'fire': 250,
         'near': 54,
         'la': 25,
         'ronge': 1,
         'sask': 1,
         'canada': 11,
         'residents': 8,
         'asked': 9,
         'shelter': 6,
         'place': 26,
         'notified': 1,
         'officers': 8,
         'evacuation': 50,
         'orders': 11,
         'expected': 15,
         '13000': 4,
         'people': 196,
         'receive': 2,
         'wildfires': 11,
         'california': 117,
         'got': 112,
         'sent': 13,
         'photo': 41,
         'ruby': 1,
         'alaska': 6,
         'smoke': 48,
         'pours': 1,
         'school': 66,
         'rockyfire': 4,
         'update': 37,
         'hwy': 9,
         '20': 26,
         'closed': 20,
         'directions': 1,
         'due': 31,
         'lake': 14,
         'co

In [13]:
#Count number of unique words, needed for Tokenizing data
uniqueWords = len(counter)
print("Number of unique words:",uniqueWords)

Number of unique words: 17971


In [14]:
#printing out counter words
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

# Dataset splitting time

In [15]:
# Split the dataset into training and validation sets (first 80% / remaining 20% of rows)
# NOTE(review): rows are split in file order without shuffling — confirm this is intended
trainSize = int(len(df) * 0.8)

train_df, val_df = df[:trainSize], df[trainSize:]

# Pull the text and label columns out as numpy arrays
train_text, train_label = train_df["text"].to_numpy(), train_df["target"].to_numpy()
val_text, val_label = val_df["text"].to_numpy(), val_df["target"].to_numpy()

# check that the shapes are as expected
train_text.shape, val_text.shape

((6090,), (1523,))

# Tokenizing Text Data

In [16]:
# Tokenize the data:
# turn each text into a sequence of integers (one integer per word).
# Import from tensorflow.keras rather than the standalone keras package so the
# Tokenizer comes from the same Keras namespace as the other Keras imports here.
from tensorflow.keras.preprocessing.text import Tokenizer

# Vectorize the text corpus; num_words caps the vocabulary used when encoding
tokenizer = Tokenizer(num_words=uniqueWords)
tokenizer.fit_on_texts(train_text)  # fit only on the training split

# Store the word -> index mapping built by the Tokenizer
# (the Tokenizer indexes all the words by default)
wordIndex = tokenizer.word_index

# print wordIndex
wordIndex


{'like': 1,
 'amp': 2,
 'fire': 3,
 'im': 4,
 'get': 5,
 'via': 6,
 'new': 7,
 'people': 8,
 'news': 9,
 'dont': 10,
 'emergency': 11,
 'one': 12,
 '2': 13,
 'us': 14,
 'video': 15,
 'disaster': 16,
 'burning': 17,
 'body': 18,
 'would': 19,
 'buildings': 20,
 'police': 21,
 'crash': 22,
 'first': 23,
 'california': 24,
 'still': 25,
 'man': 26,
 'got': 27,
 'know': 28,
 'day': 29,
 'back': 30,
 'going': 31,
 'two': 32,
 'time': 33,
 'full': 34,
 'accident': 35,
 'see': 36,
 'world': 37,
 'attack': 38,
 'nuclear': 39,
 'youtube': 40,
 'may': 41,
 'love': 42,
 'go': 43,
 'rt': 44,
 'many': 45,
 'cant': 46,
 '3': 47,
 'watch': 48,
 'collapse': 49,
 'dead': 50,
 'today': 51,
 'car': 52,
 'mass': 53,
 'want': 54,
 'years': 55,
 'work': 56,
 'train': 57,
 'last': 58,
 'good': 59,
 'think': 60,
 'families': 61,
 'hiroshima': 62,
 'life': 63,
 'fires': 64,
 'best': 65,
 'could': 66,
 'say': 67,
 'u': 68,
 'death': 69,
 'hot': 70,
 'forest': 71,
 'way': 72,
 'killed': 73,
 'need': 74,
 'legion

In [19]:
# Encode every sentence as a sequence of word indices using the fitted tokenizer
train_sequence, val_sequence = (
  tokenizer.texts_to_sequences(texts) for texts in (train_text, val_text)
)

# Compare a few raw sentences with their encoded sequences
for view in (train_text, train_sequence):
  print(view[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking']
[[520, 8, 395, 156, 297, 411], [749, 470, 2248, 138, 2249, 2813, 521, 611, 188, 470, 2248, 189, 189, 5679, 117], [2814, 117, 1884, 5680, 2248, 1285, 1450, 522, 256, 644, 2815], [99, 3742, 612, 1451, 3742], [111, 91, 336, 3743, 3744, 52, 22, 312]]


In [20]:
# Every sequence must have the same length, so the sequences are padded
# (and truncated) to a common length.

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
# NEEDS TO BE ADJUSTED MANUALLY
max_length = 20

# Both padding and truncation are applied at the end ("post") of a sequence
pad_options = dict(maxlen=max_length, padding="post", truncating="post")
train_padded = pad_sequences(train_sequence, **pad_options)
val_padded = pad_sequences(val_sequence, **pad_options)

# check output
train_padded.shape, val_padded.shape

((6090, 20), (1523, 20))

In [21]:
# Compare the raw text, its integer sequence and the padded sequence for one sample
for stage in (train_text, train_sequence, train_padded):
  print(stage[10])

three people died heat wave far
[520, 8, 395, 156, 297, 411]
[520   8 395 156 297 411   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
