# Homework 2 Binary Classification on Text Data

## Part a: Download the data

In [110]:
import pandas as pd
import numpy as np

In [111]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [112]:
print('Training data points:', len(train))
print('Test data points:', len(test))

Training data points: 7613
Test data points: 3263


1) There are 7613 training data points and 3263 test data points

In [113]:
len(train[train['target'] == 1]) / len(train)

0.4296597924602653

In [114]:
len(train[train['target'] == 0]) / len(train)

0.5703402075397347

2) \~43% of the training tweets are about real disasters and  \~57% of the training tweets are about non-real disasters.

## Part b: Split the training data

In [115]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [116]:
from sklearn.model_selection import train_test_split 

X = list(train.columns[:-1])

# Splits train.csv into training set (70%) and development set (30%)
X_train, X_dev, y_train, y_dev = train_test_split(train[X], train['target'], test_size=0.3, random_state=0)
X_train

Unnamed: 0,id,keyword,location,text
476,686,attack,#UNITE THE BLUE,@blazerfan not everyone can see ignoranceshe i...
4854,6913,mass%20murderer,,White people I know you worry tirelessly about...
4270,6066,heat%20wave,,Chilli heat wave Doritos never fail!
992,1441,body%20bagging,New Your,@BroseidonRex @dapurplesharpie I skimmed throu...
4475,6365,hostages,cuba,#hot C-130 specially modified to land in a st...
...,...,...,...,...
4931,7025,mayhem,"Manavadar, Gujarat",They are the real heroes... RIP Brave hearts.....
3264,4689,engulfed,USA,Car engulfed in flames backs up traffic at Par...
1653,2388,collapsed,"Alexandria, Egypt.",Great British Bake Off's back and Dorret's cho...
2607,3742,destroyed,USA,Black Eye 9: A space battle occurred at Star O...


## Part c: Preprocess the data

In [117]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
from nltk.tag import pos_tag

# import nltk
# nltk.download()

# Lemmatize words based on part of speech (verbs, adjectives, and nouns)
def lemmatize(text):
    wnl = WordNetLemmatizer()
    word_tags = pos_tag(text.split())
    result_text = []
    for word_tag in word_tags:
        lemmatized_word = word_tag[0]
        # lemmatize verbs (e.g. ate -> eat)
        if 'VB' in word_tag[1]:
            lemmatized_word = wnl.lemmatize(word_tag[0], pos='v')
        # lemmatize adjectives (e.g. better -> good)
        elif 'JJ' in word_tag[1]:
            lemmatized_word = wnl.lemmatize(word_tag[0], pos='a')
        # lemmatize nouns (e.g. cookies -> cookie)
        elif 'NN' in word_tag[1]:
            lemmatized_word = wnl.lemmatize(word_tag[0], pos='n')
        result_text.append(lemmatized_word)
    return ' '.join(result_text)

X_train['text'] = X_train['text'].apply(lambda text: lemmatize(text))
X_dev['text'] = X_dev['text'].apply(lambda text: lemmatize(text))
test['text'] = test['text'].apply(lambda text: lemmatize(text))
X_train['text']

476     @blazerfan not everyone can see ignoranceshe b...
4854    White people I know you worry tirelessly about...
4270                 Chilli heat wave Doritos never fail!
992     @BroseidonRex @dapurplesharpie I skim through ...
4475    #hot C-130 specially modify to land in a stadi...
                              ...                        
4931    They be the real heroes... RIP Brave hearts......
3264    Car engulfed in flame back up traffic at Parle...
1653    Great British Bake Off's back and Dorret's cho...
2607    Black Eye 9: A space battle occur at Star O784...
2732    ???????????? @MikeParrActor absolutely devasta...
Name: text, Length: 5329, dtype: object

In [118]:
import re

stop_words = ['the', 'a', 'an', 'and', 'or', 'this', 'that', 'i', 'my', 'me', 'we', 'us', 'our', 'she', 'her', 
              'he', 'his', 'him', 'they', 'their', 'them', 'you', 'your', 'there', 'are', 'is', 'from', 'to',
              'will', 'can', 'cant', 'would', 'has', 'have', 'could', 'be', 'as', 'if', 'in', 'on', 'also', 'at', 
              'of', 'into', 'by', 'be', 'it', 'its', 'so', 'im', 'youre', 'theyre', 'hes', 'shes', 'were', 'was', 
              'not','but', 'no', 'never', 'with', 'really', 'do', 'for', 'about', 'what', 'how', 'who', 'just',
              'when', 'via', 'which', 'than', 'like']

def regex_stop_word(words):
    regex = r'\b'
    for i in range(len(words)):
        if i == len(words) - 1:
            regex += words[i] + r'\b'
        else:
            regex += words[i] + r'\b|\b'
    return regex


def preprocess_text(X):
    # Converts all the words to lowercase
    X = X.apply(lambda text: text.lower())
    
    # Removes URLs
    X = X.apply(lambda text: re.sub(r'http\S+', ' ', text))
    
    # Removes user id
    X = X.apply(lambda text: re.sub(r'@(.*?)[\s]', ' ', text))
    
    # Strips punctuations
    X = X.apply(lambda text: re.sub(r'[^\w\s]', '', text))
    
    # Strips stop words
    X = X.apply(lambda text: re.sub(regex_stop_word(stop_words), ' ', text))
    return X

X_train['text'] = preprocess_text(X_train['text'])
X_dev['text'] = preprocess_text(X_dev['text'])

## Part d: Bag of words model

In [119]:
from sklearn.feature_extraction.text import CountVectorizer

M = 10
vectorizer = CountVectorizer(binary=True, min_df=M)
vtz = vectorizer.fit(X_train['text'])
V_train = vtz.transform(X_train['text']).toarray()
V_dev = vtz.transform(X_dev['text']).toarray()

## Part e: Logistic Regression

In [120]:
# without regularization terms

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

print('F1 scores for logistic regression model without regularization')

clf = LogisticRegression(penalty='none', max_iter=1000).fit(V_train, y_train)
y_train_predict = clf.predict(V_train)
f1_train = f1_score(y_train, y_train_predict)
print('\tTraining data:', f1_train)

y_dev_predict = clf.predict(V_dev)
f1_dev = f1_score(y_dev, y_dev_predict)
print('\tDevelopment data:', f1_dev)

F1 scores for logistic regression model without regularization
	Training data: 0.8345965225144895
	Development data: 0.7204703367183325


In [121]:
# with L1 regularization

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

print('F1 scores for logistic regression model with L1 regularization')

clf_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000).fit(V_train, y_train)
y_train_predict = clf_l1.predict(V_train)
f1_train = f1_score(y_train, y_train_predict)
print('\tTraining data:', f1_train)

y_dev_predict = clf_l1.predict(V_dev)
f1_dev = f1_score(y_dev, y_dev_predict)
print('\tDevelopment data:', f1_dev)

F1 scores for logistic regression model with L1 regularization
	Training data: 0.803772716816195
	Development data: 0.7258426966292134


In [122]:
# with L2 regularization

from sklearn.linear_model import LogisticRegression

print('F1 scores for logistic regression model with L2 regularization')


clf_l2 = LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000).fit(V_train, y_train)
y_train_predict = clf_l2.predict(V_train)
f1_train = f1_score(y_train, y_train_predict)
print('\tTraining data:', f1_train)

y_dev_predict = clf_l2.predict(V_dev)
f1_dev = f1_score(y_dev, y_dev_predict)
print('\tDevelopment data:',f1_dev)

F1 scores for logistic regression model with L2 regularization
	Training data: 0.8083447959651535
	Development data: 0.7278797996661102


In [123]:
param_dict = dict()
words = vtz.inverse_transform(clf_l1.coef_)[0]
for i in range(len(words)):
    param_dict[words[i]] = clf_l1.coef_[0][i]

sorted_dict = sorted(param_dict.items(), key=lambda x: x[1], reverse=True)
sorted_dict

[('trouble', 3.6615610927685114),
 ('put', 3.5353164361630283),
 ('turkey', 3.31960013331611),
 ('internet', 3.2198444246925946),
 ('hail', 3.099384491219853),
 ('thunder', 2.396460528209275),
 ('nuclear', 2.2176845744109266),
 ('course', 2.1500767370172267),
 ('investigators', 2.1336672608787985),
 ('share', 2.0998036234354207),
 ('maybe', 2.0884835411585247),
 ('free', 2.06413708477806),
 ('hurricane', 2.0606821927699377),
 ('governor', 2.0466256217678334),
 ('land', 1.9961139882897678),
 ('war', 1.9667972813048158),
 ('after', 1.9416398620573951),
 ('desolate', 1.9216269864665803),
 ('virgin', 1.8852728213855834),
 ('visit', 1.82189159034019),
 ('drought', 1.7728405710687465),
 ('drowning', 1.7666504198854094),
 ('same', 1.7399547333156387),
 ('late', 1.7225434410762008),
 ('real', 1.706497212340071),
 ('explode', 1.5937148970807815),
 ('wake', 1.5670706098828107),
 ('massacre', 1.5633722956409142),
 ('damn', 1.5572455827699547),
 ('pakistani', 1.553201492319429),
 ('fully', 1.52687

## Part f: Bernoulli Naive Bayes

In [124]:
# Bernoulli Naive Bayes

def nb_predictions(x, psis, phis, K):
    # adjust shapes
    n, d = x.shape
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    
    # clip probabilities to avoid log(0)
    psis = psis.clip(1e-14, 1-1e-14)
    
    # compute log-probabilities
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)
    logpyx = logpxy.sum(axis=2) + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])


def Bernoulli_Naive_Bayes(xtrain, ytrain, xdev, K, alpha):
    n = xtrain.shape[0]  # number of tweets
    d = xtrain.shape[1]  # number of words in dataset
    psis = np.zeros([K,d])
    phis = np.zeros([K])

    for k in range(K):
        X_k = xtrain[ytrain == k]
        psis[k] = (np.sum(X_k, axis=0) + alpha) / (X_k.shape[0] + 2 * alpha)
        phis[k] = X_k.shape[0] / float(n)  

    return nb_predictions(xdev, psis, phis, K)[0]
    
idx = Bernoulli_Naive_Bayes(V_train, y_train, V_dev, K = 2, alpha = 1)
print(f1_score(idx, y_dev, average='micro'))

0.7859019264448336


## Part g: Model Comparison

model comparison here

## Part h: N-gram Model

In [125]:
# N-gram model

M2 = 10
vectorizer2 = CountVectorizer(binary=True, min_df=M2, ngram_range=(1,2))
vtz2 = vectorizer2.fit(X_train['text'])
V_train2 = vtz2.transform(X_train['text']).toarray()
V_dev2 = vtz2.transform(X_dev['text']).toarray()

features = vectorizer2.get_feature_names_out()
counter = 0
for feature in features:
    if ' ' in feature and counter < 10:
        print(feature)
        counter += 1

12000 nigerian
15 saudi
16yr old
2015 prebreak
40 family
70 year
add video
affect fatal
after waving
air ambulance


In [126]:
# number of 1-grams
len(vectorizer.get_feature_names_out())

1039

In [127]:
# number of 2-grams
len(features)

1260

Logistic regression with L2 is chosen because it has highest F-score in development set.

In [128]:
clf_2gram = LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000).fit(V_train2, y_train)
y_train_predict = clf_2gram.predict(V_train2)
f1_train = f1_score(y_train, y_train_predict)
print('\tTraining data:', f1_train)

y_dev_predict = clf_2gram.predict(V_dev2)
f1_dev = f1_score(y_dev, y_dev_predict)
print('\tDevelopment data:',f1_dev)

	Training data: 0.811407543698252
	Development data: 0.7268673355629878


In [129]:
idx_train = Bernoulli_Naive_Bayes(V_train2, y_train, V_train2, K = 2, alpha = 1)
print(f1_score(idx_train, y_train, average='micro'))

idx_dev = Bernoulli_Naive_Bayes(V_train2, y_train, V_dev2, K = 2, alpha = 1)
print(f1_score(idx_dev, y_dev, average='micro'))

0.8018389941827735
0.7898423817863398


## Part i: Determine performance with the test set 

In [130]:
test['text'] = preprocess_text(test['text'])
train['text'] = preprocess_text(train['text'])

M3 = 10
vectorizer3 = CountVectorizer(binary=True, min_df=M3, ngram_range=(1,2))
vtz3 = vectorizer3.fit(train['text'])
V_train3 = vtz3.transform(train['text']).toarray()
V_test = vtz3.transform(test['text']).toarray()
idx_test = Bernoulli_Naive_Bayes(V_train3, train['target'], V_test, K = 2, alpha = 1)

# Generates final df that is used for creating a csv file
final_df = pd.DataFrame({'id': test['id'], 'target': idx_test})
final_df.to_csv('disaster_predicted.csv', index=False) 

![output.png](output.png)