# Homework 2 Programming Exercise

In [83]:
import pandas as pd
import numpy as np

In [84]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [85]:
print('Training data points:', len(train))
print('Test data points:', len(test))

Training data points: 7613
Test data points: 3263


There are 7613 training data points and 3263 test data points

In [86]:
len(train[train['target'] == 1]) / len(train)

0.4296597924602653

In [87]:
len(train[train['target'] == 0]) / len(train)

0.5703402075397347

\~43% of the training tweets are about real disasters and  \~57% of the training tweets are not about real disasters.

In [88]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [89]:
from sklearn.model_selection import train_test_split 

X = list(train.columns[:-1])

# Splits train.csv into training set (70%) and development set (30%)
X_train, X_dev, y_train, y_dev = train_test_split(train[X], train['target'], test_size=0.3, random_state=0)
X_train

Unnamed: 0,id,keyword,location,text
476,686,attack,#UNITE THE BLUE,@blazerfan not everyone can see ignoranceshe i...
4854,6913,mass%20murderer,,White people I know you worry tirelessly about...
4270,6066,heat%20wave,,Chilli heat wave Doritos never fail!
992,1441,body%20bagging,New Your,@BroseidonRex @dapurplesharpie I skimmed throu...
4475,6365,hostages,cuba,#hot C-130 specially modified to land in a st...
...,...,...,...,...
4931,7025,mayhem,"Manavadar, Gujarat",They are the real heroes... RIP Brave hearts.....
3264,4689,engulfed,USA,Car engulfed in flames backs up traffic at Par...
1653,2388,collapsed,"Alexandria, Egypt.",Great British Bake Off's back and Dorret's cho...
2607,3742,destroyed,USA,Black Eye 9: A space battle occurred at Star O...


In [90]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
from nltk.tag import pos_tag

# import nltk
# nltk.download()

# Lemmatize words based on part of speech (verbs, adjectives, and nouns)
def lemmatize(text):
    wnl = WordNetLemmatizer()
    word_tags = pos_tag(text.split())
    result_text = []
    for word_tag in word_tags:
        lemmatized_word = word_tag[0]
        # lemmatize verbs (e.g. ate -> eat)
        if 'VB' in word_tag[1]:
            lemmatized_word = wnl.lemmatize(word_tag[0], pos='v')
        # lemmatize adjectives (e.g. better -> good)
        elif 'JJ' in word_tag[1]:
            lemmatized_word = wnl.lemmatize(word_tag[0], pos='a')
        # lemmatize nouns (e.g. cookies -> cookie)
        elif 'NN' in word_tag[1]:
            lemmatized_word = wnl.lemmatize(word_tag[0], pos='n')
        result_text.append(lemmatized_word)
    return ' '.join(result_text)

X_train['text'] = X_train['text'].apply(lambda text: lemmatize(text))
X_dev['text'] = X_dev['text'].apply(lambda text: lemmatize(text))
X_train['text']

476     @blazerfan not everyone can see ignoranceshe b...
4854    White people I know you worry tirelessly about...
4270                 Chilli heat wave Doritos never fail!
992     @BroseidonRex @dapurplesharpie I skim through ...
4475    #hot C-130 specially modify to land in a stadi...
                              ...                        
4931    They be the real heroes... RIP Brave hearts......
3264    Car engulfed in flame back up traffic at Parle...
1653    Great British Bake Off's back and Dorret's cho...
2607    Black Eye 9: A space battle occur at Star O784...
2732    ???????????? @MikeParrActor absolutely devasta...
Name: text, Length: 5329, dtype: object

In [91]:
import re

stop_words = ['the', 'a', 'an', 'and', 'or', 'this', 'that', 'i', 'my', 'me', 'we', 'us', 'our', 'she', 'her', 
              'he', 'his', 'him', 'they', 'their', 'them', 'you', 'your', 'there', 'are', 'is', 'from', 'to',
              'will', 'can', 'cant', 'would', 'has', 'have', 'could', 'be', 'as', 'if', 'in', 'on', 'also', 'at', 
              'of', 'into', 'by', 'be', 'it', 'its', 'so', 'im', 'youre', 'theyre', 'hes', 'shes', 'were', 'was', 
              'not','but', 'no', 'never', 'with', 'really', 'do', 'for', 'about', 'what', 'how', 'who', 'just',
              'when', 'via', 'which', 'than']

def regex_stop_word(words):
    regex = r'\b'
    for i in range(len(words)):
        if i == len(words) - 1:
            regex += words[i] + r'\b'
        else:
            regex += words[i] + r'\b|\b'
    return regex


# Converts all the words to lowercase
X_train['text'] = X_train['text'].apply(lambda text: text.lower())
X_dev['text'] = X_dev['text'].apply(lambda text: text.lower())

# Removes URLs
X_train['text'] = X_train['text'].apply(lambda text: re.sub(r'http\S+', '', text))
X_dev['text'] = X_dev['text'].apply(lambda text: re.sub(r'http\S+', '', text))

# Removes user id
X_train['text'] = X_train['text'].apply(lambda text: re.sub(r'@(.*?)[\s]', ' ', text))
X_dev['text'] = X_dev['text'].apply(lambda text: re.sub(r'@(.*?)[\s]', ' ', text))

# Strips punctuations
X_train['text'] = X_train['text'].apply(lambda text: re.sub(r'[^\w\s]', '', text))
X_dev['text'] = X_dev['text'].apply(lambda text: re.sub(r'[^\w\s]', '', text))

# Strips the stop words (the, a, an, and, or)
X_train['text'] = X_train['text'].apply(lambda text: re.sub(regex_stop_word(stop_words), '', text))
X_dev['text'] = X_dev['text'].apply(lambda text: re.sub(regex_stop_word(stop_words), '', text))

In [92]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True, min_df=20)
vtz = vectorizer.fit(X_train['text'])
V_train = vtz.transform(X_train['text']).toarray()
V_dev = vtz.transform(X_dev['text']).toarray()

In [93]:
# without regularization terms

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

clf = LogisticRegression(penalty='none', max_iter=1000).fit(V_train, y_train)
y_train_predict = clf.predict(V_train)
results = metrics.classification_report(y_train, y_train_predict)
print(results)

y_dev_predict = clf.predict(V_dev)
results = metrics.classification_report(y_dev, y_dev_predict)
print(results)

              precision    recall  f1-score   support

           0       0.80      0.87      0.84      3004
           1       0.82      0.72      0.76      2325

    accuracy                           0.81      5329
   macro avg       0.81      0.80      0.80      5329
weighted avg       0.81      0.81      0.80      5329

              precision    recall  f1-score   support

           0       0.78      0.84      0.80      1338
           1       0.74      0.66      0.70       946

    accuracy                           0.76      2284
   macro avg       0.76      0.75      0.75      2284
weighted avg       0.76      0.76      0.76      2284



In [94]:
# with L1 regularization

from sklearn.linear_model import LogisticRegression

clf_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000).fit(V_train, y_train)
y_train_predict = clf_l1.predict(V_train)
results = metrics.classification_report(y_train, y_train_predict)
print(results)

y_dev_predict = clf_l1.predict(V_dev)
results = metrics.classification_report(y_dev, y_dev_predict)
print(results)

              precision    recall  f1-score   support

           0       0.79      0.89      0.84      3004
           1       0.83      0.69      0.75      2325

    accuracy                           0.80      5329
   macro avg       0.81      0.79      0.79      5329
weighted avg       0.81      0.80      0.80      5329

              precision    recall  f1-score   support

           0       0.77      0.86      0.81      1338
           1       0.76      0.64      0.70       946

    accuracy                           0.77      2284
   macro avg       0.77      0.75      0.76      2284
weighted avg       0.77      0.77      0.77      2284



In [95]:
# with L2 regularization

from sklearn.linear_model import LogisticRegression

clf_l2 = LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000).fit(V_train, y_train)
y_train_predict = clf_l2.predict(V_train)
results = metrics.classification_report(y_train, y_train_predict)
print(results)

y_dev_predict = clf_l2.predict(V_dev)
results = metrics.classification_report(y_dev, y_dev_predict)
print(results)

              precision    recall  f1-score   support

           0       0.79      0.88      0.83      3004
           1       0.82      0.70      0.76      2325

    accuracy                           0.80      5329
   macro avg       0.81      0.79      0.79      5329
weighted avg       0.80      0.80      0.80      5329

              precision    recall  f1-score   support

           0       0.77      0.85      0.81      1338
           1       0.75      0.64      0.69       946

    accuracy                           0.76      2284
   macro avg       0.76      0.75      0.75      2284
weighted avg       0.76      0.76      0.76      2284



In [96]:
param_dict = dict()
words = vtz.inverse_transform(clf_l1.coef_)[0]
for i in range(len(words)):
    param_dict[words[i]] = clf_l1.coef_[0][i]

sorted_dict = sorted(param_dict.items(), key=lambda x: x[1], reverse=True)
sorted_dict

[('mh370', 3.469511379374538),
 ('release', 3.3738492561960944),
 ('explode', 3.2527462658073847),
 ('real', 3.0375415712690406),
 ('flooding', 3.0086886871873837),
 ('evacuate', 2.6082943604290314),
 ('wreck', 2.60156401729194),
 ('40', 2.354820397417945),
 ('put', 2.3424630693290567),
 ('dont', 2.184642617020311),
 ('survive', 2.15513465807705),
 ('fire', 2.0603760622823315),
 ('care', 1.9653333084389681),
 ('hostage', 1.952064547543393),
 ('already', 1.7617067278927154),
 ('death', 1.7314606992997925),
 ('demolition', 1.7244450431994114),
 ('full', 1.7109908904321285),
 ('check', 1.6742057614474313),
 ('crash', 1.6496428257268485),
 ('site', 1.6424102887138423),
 ('collapse', 1.6343846381294267),
 ('three', 1.6297293439869922),
 ('city', 1.6175715666882833),
 ('search', 1.5809614279305781),
 ('night', 1.5610383490064137),
 ('content', 1.5332518902736232),
 ('nuclear', 1.5311303852893194),
 ('horrible', 1.4903922724023027),
 ('accident', 1.4883587542047154),
 ('video', 1.473491417717

In [97]:
# Bernoulli Naive Bayes

n = V_train.shape[0]  # number of tweets
d = V_train.shape[1]  # number of words in dataset
K = 2 # Class size (1 =  real disaster, 0 = not a real disaster)
alpha = 1 # Virtual Occurrences

psis = np.zeros([K,d])
phis = np.zeros([K])

for k in range(K):
    X_k = V_train[y_train == k]
    psis[k] = (np.sum(X_k, axis=0) + alpha) / (X_k.shape[0] + 2 * alpha)
    phis[k] = X_k.shape[0] / float(n)

    
def nb_predictions(x, psis, phis):
    """This returns class assignments and scores under the NB model.
    
    We compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y)
    """
    # adjust shapes
    n, d = x.shape
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    
    # clip probabilities to avoid log(0)
    psis = psis.clip(1e-14, 1-1e-14)
    
    # compute log-probabilities
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)
    logpyx = logpxy.sum(axis=2) + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

idx, logpyx = nb_predictions(V_dev, psis, phis)
print(idx[:100])

[0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0
 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 1 0]


In [99]:
# Accuracy
(idx==y_dev).mean()

# Find f1 score

0.7596322241681261