# Imports

In [1]:
import numpy as np
!pip install nltk
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package bcp47 to /root/nltk_data...
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

# Loading data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import zipfile

DIR = r'/content/drive/Shareddrives/Materijali 2023 2024/6. semestar/Mašinsko učenje/Vežbe/Domaci 1/ml_d1_x_y_z.zip'

with zipfile.ZipFile(DIR, 'r') as zip_ref:
    zip_ref.extractall()

In [4]:
DATA_DIR = 'ml_d1_x_y_z/data/disaster-tweets.csv'

In [5]:
import csv

X, y = [], []
with open(DATA_DIR, 'r') as f:
  reader = csv.reader(f, delimiter=',', quotechar='"')
  next(reader, None)
  for row in reader:
    y.append(int(row[4]))
    X.append([row[1], row[2], row[3]])

X = np.array(X, dtype=str)
y = np.array(y, dtype=np.int32)
X[:5]

array([['', '',
        'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
       ['', '', 'Forest fire near La Ronge Sask. Canada'],
       ['', '',
        "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
       ['', '',
        '13,000 people receive #wildfires evacuation orders in California '],
       ['', '',
        'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ']],
      dtype='<U157')

# Preparing data

## Imputing missing values

In [6]:
print(f"Text NaN: {np.count_nonzero(X[:, 2]=='')} / {len(X[:,2])}")
print(f"Keyword NaN: {np.count_nonzero(X[:, 0]=='')} / {len(X[:,0])}")
print(f"Location NaN: {np.count_nonzero(X[:, 1]=='')} / { len(X[:,1])}")
print(f'Target NaN: {np.isnan(y).sum() } / {len(y)}')
print('\n')

countries_not_empty = [i for i in X[:,1] if i]
u, indices = np.unique(countries_not_empty, return_inverse=True)
most_freq_cntry = u[np.argmax(np.bincount(indices))]
print(f'Most common location: {most_freq_cntry}')

Text NaN: 0 / 7613
Keyword NaN: 61 / 7613
Location NaN: 2533 / 7613
Target NaN: 0 / 7613


Most common location: USA


In [7]:
def imputing(X):
  '''
  Imputes missing values for keyword and location columns
  '''
  X[1][X[1] == ''] = 'USA'

  def fill_nan(row):
    '''
    Imputes the missing keyword with the first hashtag if it
    exists, otherwise with 'no_keyword'
    '''
    if row[0]=='':
      if '#' in row[2] and not row[2].endswith('#'):
        return row[2].split('#')[-1].split()[0]
      return 'no_keyword'
    else:
      return row[0]

  X[0] = np.apply_along_axis(fill_nan, axis=0, arr=X)
  return X

## Tokenization

In [8]:
X = X.T # transpose X

In [9]:
from nltk.stem import PorterStemmer
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import html
import re


def remove_urls(text):
  '''
  Removes URLs, HTML escape characters and special characters
  '''
  text = html.unescape(text)
  re_url = r'https?://\S+|www\.\S+'
  re_special = r'[^a-zA-Z0-9\s]'
  text = re.sub(re_url, '', text)
  text = re.sub(re_special, ' ', text)
  return text.lower()

def stem(text):
  '''
  Stems the words
  '''
  final = ""
  for word in text.split():
    final += " " + porter.stem(word)
  return final[1:]

def remove_stopword(text):
  '''
  Remove stop-words
  '''
  final_text ="";
  for word in text.split():
    if word not in stopwrds:
      final_text+= " " + word

  return final_text[1:]

print(X[2])

# remove urls
remove_urls_vec = np.vectorize(remove_urls)
X[2] = remove_urls_vec(X[2])
print(X[2])

# remove stopwords
stopwrds = stopwords.words('english')
remove_stopword_vec = np.vectorize(remove_stopword)
X[2] = remove_stopword_vec(X[2])
print(X[2])

# stem
porter = PorterStemmer()
stem_vec = np.vectorize(stem)
X[2] = stem_vec(X[2])
print(X[2])

# impute missing values
X = imputing(X)
print(X[1])
print(X[0])

# tokenize
tokenized = [word_tokenize(sentence) for sentence in X[2]]

keywords = X[0]
locations = X[1]

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'
 'Forest fire near La Ronge Sask. Canada'
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
 ... 'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'
 'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.'
 'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d']
['our deeds are the reason of this  earthquake may allah forgive us all'
 'forest fire near la ronge sask  canada'
 'all residents asked to  shelter in place  are being notified by officers  no other evacuation or shelter in place orders are expected'
 ... 'm1 94  01 04 utc  5km s of volcano hawaii  '
 'police investigating after an e bike collided with a car in little portugal  e bike rider suffered serious non life threatening in

## Vectorization

## Creating a Vocab

In [10]:
print('Creating the vocab...')
vocab_set = set()
for doc in tokenized:
  for word in doc:
    vocab_set.add(word)
vocab = list(vocab_set)

print('Vocab:', list(zip(vocab, range(len(vocab)))))
print('Feature vector size: ', len(vocab))

Creating the vocab...
Vocab: [('selfavow', 0), ('orch', 1), ('30stm', 2), ('juic1', 3), ('remand', 4), ('railroad', 5), ('time2015', 6), ('azwx', 7), ('blackberri', 8), ('total', 9), ('priest', 10), ('3939', 11), ('hunhri', 12), ('leejasp', 13), ('savannahross', 14), ('2474', 15), ('luv', 16), ('amman', 17), ('displeas', 18), ('chip', 19), ('rout', 20), ('delta', 21), ('moscow', 22), ('floridian', 23), ('usar2015', 24), ('modi', 25), ('nathanfillion', 26), ('cricket', 27), ('dancer', 28), ('lockdown', 29), ('titania', 30), ('bestnaijamad', 31), ('soviet', 32), ('r', 33), ('forecast', 34), ('xv', 35), ('030', 36), ('leadership', 37), ('450', 38), ('hugomatz', 39), ('digitalhealth', 40), ('droid', 41), ('aan', 42), ('tomlinson', 43), ('boxer', 44), ('bldrcosheriff', 45), ('mediterran', 46), ('sfgate', 47), ('askh3cz', 48), ('exofficio', 49), ('northeast', 50), ('irishspi', 51), ('invent', 52), ('periwinkl', 53), ('bluetooth', 54), ('2327564d', 55), ('termin', 56), ('flight', 57), ('games

## TF-IDF Vectorization

In [11]:
import math

def freq_score(word, doc):
  return doc.count(word) / len(doc)


def tfidf_score(word, doc):
  tf = freq_score(word, doc)
  idf = idf_table[word]
  return tf * idf

np.set_printoptions(precision=2, linewidth=200)

print('Calculating the IDF table...')
doc_counts = dict()
for word in vocab:
  doc_counts[word] = 0
  for doc in tokenized:
    if word in doc:
      doc_counts[word] += 1
print('Doc counts:')
print(doc_counts)
idf_table = dict()
for word in vocab:
  idf = math.log10(len(tokenized)/doc_counts[word])
  idf_table[word] = idf
print('IDF table:')
print(idf_table)


print('Creating TF-IDF features...')
X = np.zeros((len(tokenized), len(vocab)), dtype=np.float32)
for doc_idx in range(len(tokenized)):
  doc = tokenized[doc_idx]
  for word_idx in range(len(vocab)):
    word = vocab[word_idx]
    cnt = tfidf_score(word, doc)
    X[doc_idx][word_idx] = cnt

Calculating the IDF table...
Doc counts:
{'selfavow': 1, 'orch': 1, '30stm': 1, 'juic1': 1, 'remand': 1, 'railroad': 1, 'time2015': 14, 'azwx': 1, 'blackberri': 5, 'total': 40, 'priest': 2, '3939': 1, 'hunhri': 1, 'leejasp': 1, 'savannahross': 1, '2474': 1, 'luv': 1, 'amman': 1, 'displeas': 1, 'chip': 2, 'rout': 10, 'delta': 2, 'moscow': 3, 'floridian': 1, 'usar2015': 1, 'modi': 4, 'nathanfillion': 1, 'cricket': 5, 'dancer': 2, 'lockdown': 1, 'titania': 1, 'bestnaijamad': 6, 'soviet': 3, 'r': 38, 'forecast': 9, 'xv': 2, '030': 2, 'leadership': 3, '450': 1, 'hugomatz': 1, 'digitalhealth': 1, 'droid': 3, 'aan': 1, 'tomlinson': 2, 'boxer': 4, 'bldrcosheriff': 1, 'mediterran': 8, 'sfgate': 1, 'askh3cz': 1, 'exofficio': 1, 'northeast': 3, 'irishspi': 1, 'invent': 1, 'periwinkl': 1, 'bluetooth': 4, '2327564d': 1, 'termin': 1, 'flight': 19, 'gamescom': 1, 'manifest': 2, 'highqualitybird': 1, 'ohio': 3, 'dive': 7, 'testimoni': 3, 'justjon': 1, 'madisonpa': 1, 'liveonkbak': 1, 'victorian': 1, '

# Naive Bayes

In [12]:
class MultinomialNaiveBayes:
  def __init__(self, nb_classes, nb_words, pseudocount):
    self.nb_classes = nb_classes # number of classes
    self.nb_words = nb_words # number of words in vocabulary in our case
    self.pseudocount = pseudocount # alpha parameter (smoothing)

  def fit(self, X, Y):
    '''
    Fits X and y on the model
    '''

    nb_examples = X.shape[0] # number of training examples

    # Calculating P(class) - priors
    # np.bincount returns the number of appearances of each number in the interval [0, max_number_in_the_list] in the list
    self.priors = np.bincount(Y) / nb_examples # P(class) = number_of_appearances / number_of_examples
    print('Priors:')
    print(self.priors)

    # Calculates the number of appearances of each word for each class
    occs = np.zeros((self.nb_classes, self.nb_words))
    for i in range(nb_examples):
      c = Y[i] # current example class
      for w in range(self.nb_words):
        cnt = X[i][w] # number of appearances of the word in the features
        occs[c][w] += cnt # number of appearances of the word for the class
    print('Occurences:')
    print(occs)

    # Calculating P(word_i | class) - likelihoods
    self.like = np.zeros((self.nb_classes, self.nb_words))
    for c in range(self.nb_classes):
      for w in range(self.nb_words):
        up = occs[c][w] + self.pseudocount # number of appearances of the word + alpha
        down = np.sum(occs[c]) + self.nb_words*self.pseudocount # total number of words in a class + total number of words (vocab size) * alpha
        self.like[c][w] = up / down # likelihood
    print('Likelihoods:')
    print(self.like)

  def predict(self, example):
    '''
    Predicts the class of the example (with log)
    '''
    # Calculating P(class | example) for each class
    probs = np.zeros(self.nb_classes)
    for c in range(self.nb_classes):
      prob = np.log(self.priors[c]) # using log for lower chances of overwlof
      for w in range(self.nb_words):
        cnt = example[w]
        prob += cnt * np.log(self.like[c][w]) # addition instead of multiplication, and log
      probs[c] = prob

    #print('\"Probabilites\" for a test (with log):')
    #print(probs)

    # Finding the class with the largest probability
    prediction = np.argmax(probs)
    return prediction

  def predict_multiply(self, example):
    '''
    Predicts the class of the example (without log)
    '''
    # Calculating P(class | example) for each class
    probs = np.zeros(self.nb_classes)
    for c in range(self.nb_classes):
      prob = self.priors[c] # no log
      for w in range(self.nb_words):
        cnt = example[w]
        prob *= self.like[c][w] ** cnt # multiplying and scaling
      probs[c] = prob

    #print('\"Probabilites\" for a test (without log):')
    #print(probs)

    # Finding the class with the largest probability
    prediction = np.argmax(probs)
    return prediction

In [13]:
def score(model, X_test, y_test):
  preds = []
  for i in range(X_test.shape[0]):
    preds.append(model.predict(X_test[i]))

  accuracy = (preds==y_test).mean()
  return accuracy

Finding the mean accuracy between three consecutive runs

In [14]:
accuracies = []

for i in range(3):
  # nasumicno promesaj
  n_samples = X.shape[0]
  np.random.seed(i) # random seed za reproduktabilnost
  indices = np.random.permutation(n_samples)
  X_shuff = X[indices]
  y_shuff = y[indices]

  # train-test split
  n_train_samples = int(0.8 * n_samples)
  X_train = X_shuff[:n_train_samples][:]
  X_test = X_shuff[n_train_samples:][:]
  y_train = y_shuff[:n_train_samples]
  y_test = y_shuff[n_train_samples:]

  model = MultinomialNaiveBayes(nb_classes=2, nb_words=len(vocab), pseudocount=1)
  model.fit(X_train, y_train)
  accuracies.append(score(model, X_test, y_test))

print(f'Mean accuracy: {sum(accuracies) / len(accuracies)}')

Priors:
[0.57 0.43]
Occurences:
[[0.3  0.   0.   ... 0.23 1.61 1.93]
 [0.   0.3  0.   ... 0.   0.81 0.  ]]
Likelihoods:
[[5.62e-05 4.33e-05 4.33e-05 ... 5.31e-05 1.13e-04 1.27e-04]
 [4.88e-05 6.33e-05 4.88e-05 ... 4.88e-05 8.84e-05 4.88e-05]]
Priors:
[0.57 0.43]
Occurences:
[[0.   0.   0.   ... 0.23 2.41 2.44]
 [0.   0.3  0.97 ... 0.   1.07 0.  ]]
Likelihoods:
[[4.32e-05 4.32e-05 4.32e-05 ... 5.31e-05 1.48e-04 1.49e-04]
 [4.88e-05 6.34e-05 9.62e-05 ... 4.88e-05 1.01e-04 4.88e-05]]
Priors:
[0.57 0.43]
Occurences:
[[0.   0.   0.   ... 0.23 1.72 1.84]
 [0.   0.3  0.97 ... 0.   1.07 0.  ]]
Likelihoods:
[[4.33e-05 4.33e-05 4.33e-05 ... 5.32e-05 1.18e-04 1.23e-04]
 [4.87e-05 6.33e-05 9.60e-05 ... 4.87e-05 1.01e-04 4.87e-05]]
Mean accuracy: 0.7957977675640183


## Cross-Validation (not used)

In [15]:
#def kfold_indices(X, k):
    #fold_size = len(X) // k
    #indices = np.arange(len(X))
    #folds = []
   # for i in range(k):
      #  test_indices = indices[i * fold_size: (i + 1) * fold_size]
      #  train_indices = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])
      #  folds.append((train_indices, test_indices))
    #return folds

#fold_indices = kfold_indices(X, 3)


In [16]:
#for train_indices, test_indices in fold_indices:
  #scores=[]
  #X_train, y_train = X[train_indices], y[train_indices]
  #X_test, y_test = X[test_indices], y[test_indices]

  # Train the model on the training data
  #model = MultinomialNaiveBayes(nb_classes=2, nb_words=len(vocab), pseudocount=1)
  #model.fit(X_train, y_train)

  # Calculate the accuracy score for this fold
  #fold_score = score(X_test, y_test)

  # Append the fold score to the list of scores
  #scores.append(fold_score)

#mean_accuracy = np.mean(scores)

In [17]:
##mean_accuracy

# Discussion

Convert tokenized sequence to numpy array

In [18]:
# Get the maximum length of tokenized sentences
max_len = max(len(tokens) for tokens in tokenized)

# Create an empty array with the appropriate shape
tokenized_arr = np.empty((len(tokenized), max_len), dtype=object)

# Fill the array with tokenized sentences
for i, tokens in enumerate(tokenized):
    tokenized_arr[i, :len(tokens)] = tokens

#tokenized_arr = tokenized_arr[indices]

tokenized_arr = np.where(tokenized_arr == None, "", tokenized_arr)

## Top 5 most frequent words in each class

In [19]:
zeros_indices = [i for i, value in enumerate(y) if value == 0]
ones_indices = [i for i, value in enumerate(y) if value == 1]

unique_elements_zeros, counts_zeros = np.unique(tokenized_arr[zeros_indices].flatten(), return_counts=True)
unique_elements_ones, counts_ones = np.unique(tokenized_arr[ones_indices].flatten(), return_counts=True)


zeros_dict = dict(zip(unique_elements_zeros, counts_zeros))
sorted_zeros_dict = dict(sorted(zeros_dict.items(), key=lambda item: item[1], reverse=True))
print(sorted_zeros_dict)

ones_dict = dict(zip(unique_elements_ones, counts_ones))
sorted_ones_dict = dict(sorted(ones_dict.items(), key=lambda item: item[1], reverse=True))
print(sorted_ones_dict)

top_5_zeros = list(sorted_zeros_dict.keys())[1:6]
top_5_ones = list(sorted_ones_dict.keys())[1:6]

{'': 69696, 'like': 308, 'get': 223, 'new': 171, 'go': 144, 'one': 139, 'love': 125, 'bodi': 119, '2': 114, 'bag': 112, 'time': 106, 'day': 102, 'video': 102, 'would': 101, 'scream': 100, 'make': 99, 'via': 99, 'want': 99, 'see': 98, 'peopl': 95, 'burn': 94, 'got': 93, 'know': 92, 'fire': 90, 'let': 90, 'look': 90, 'back': 88, 'come': 88, '3': 84, 'emerg': 84, 'full': 84, 'think': 82, 'wreck': 82, 'obliter': 79, 'feel': 77, 'fuck': 77, 'us': 77, 'say': 76, 'u': 76, 'youtub': 76, 'drown': 73, 'still': 73, 'need': 72, 'work': 72, 'world': 72, 'take': 69, 'good': 68, 'man': 66, 'year': 66, 'rt': 63, 'crush': 62, 'explod': 62, 'lol': 62, 'na': 62, 'life': 61, 'way': 61, 'destroy': 60, 'first': 59, 'news': 59, 'read': 59, 'even': 58, 'last': 58, 'surviv': 58, 'watch': 58, 'help': 56, '1': 55, '5': 55, 'best': 55, 'build': 55, 'injuri': 55, 'realli': 55, 'reddit': 55, 'thing': 54, 'fatal': 53, 'plan': 53, 'w': 53, 'bomb': 52, 'delug': 52, 'quarantin': 52, 'crash': 51, 'mani': 51, 'much': 51,

In [20]:
print(f'Top 5 most frequent words in tweets labeled as disasters: {top_5_ones}')
print(f'Top 5 most frequent words in tweets labeled as not disasters: {top_5_zeros}')

Top 5 most frequent words in tweets labeled as disasters: ['fire', 'bomb', 'kill', 'news', 'flood']
Top 5 most frequent words in tweets labeled as not disasters: ['like', 'get', 'new', 'go', 'one']


The 5 most frequent words in tweets with accidents make sense because they indicate potential accidents, and there is also the word news, which is often mentioned because accidents are often mentioned in the news. As for the 5 most common words in tweets that are not accidents, they are some common words in normal speech.


## Different Metric

Introducing a different metric $$LR(word) = ones(word) / zeros(word)$$
Where:\
$ones(word)$ is the number of appearances of the word in tweets labeled as disasters\
$zeros(word)$ is the number of appearances of the word in tweets labeled as not disasters.\
We will test this metric for the words that appear at least 10 times in both of the classes.

In [21]:
lr = dict()

for key in sorted_ones_dict.keys():
  if sorted_ones_dict[key] >= 10:
    if key in sorted_zeros_dict.keys() and sorted_zeros_dict[key] >=10:
      lr[key] = sorted_ones_dict[key] / sorted_zeros_dict[key]

sorted_lr = dict(sorted(lr.items(), key=lambda item: item[1], reverse=True))
top_5_lr = list(sorted_lr.keys())[:5]
bottom_5_lr = list(sorted_lr.keys())[-5:]

In [22]:
print(f'Top 5 highest LRs: {top_5_lr}')
print(f'Top 5 lowest LRs: {bottom_5_lr}')

Top 5 highest LRs: ['kill', 'warn', 'train', 'report', 'latest']
Top 5 lowest LRs: ['let', 'scream', 'obliter', 'love', 'full']


The LR metric gave better results. Here, instead of looking at how many times a word appeared, we look at how often/less often it appeared in accident tweets. This is how we got a better understanding of the words that often appear in both accident tweets and ordinary tweets. For example. the low LR value for terms like scream and obliter indicates that they are mentioned much more often in the context of non-accident tweets, the opposite is true for words like kill, warn and train.