Link : https://www.kaggle.com/competitions/nlp-getting-started/overview/evaluation

## Blue Print

1. Check dataset
2. Cleaning
3. Preprocessing
4. Data Split (using stratified sampling)

## Error Function

$F_1 = 2\frac{precision * recall}{precision + recall}$ (1 is the best, 0 is the worst) where:
 
precision = $\frac{TP}{TP+FP}$, recall = $\frac{TP}{TP+FN}$

In [1]:
# sklearn.metrics.f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', 
# sample_weight=None, zero_division='warn')
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

## 1. Data Investigation

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
from gensim.models import Word2Vec
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_classification
# from nltk.tokenize import TweetTokenizer
# import re
from sklearn.naive_bayes import GaussianNB

warnings.filterwarnings(action = 'ignore')

In [3]:
# Load dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(train.shape, test.shape)
train.head()

(7613, 5) (3263, 4)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Types of values in each column
print(train.dtypes)

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object


In [5]:
print(train[train["target"] == 0]["text"].values[10])   # not a disaster tweet
print(train[train["target"] == 1]["text"].values[1])    # disaster tweet

No way...I can't eat that shit
Forest fire near La Ronge Sask. Canada


## 2. Data Cleaning

In [6]:
# Check percentage of missing values
print("Ratio of missing values in training dataset")
print("missing keyword: ", str(round(train["keyword"].isnull().sum()/train.shape[0], 2)))
print("missing location: ", str(round(train["location"].isnull().sum()/train.shape[0], 2)), "\n")

print("Ratio of missing values in testing dataset")
print("missing keyword: ", str(round(test["keyword"].isnull().sum()/test.shape[0], 2)))
print("missing location: ", str(round(test["location"].isnull().sum()/test.shape[0], 2)))

Ratio of missing values in training dataset
missing keyword:  0.01
missing location:  0.33 

Ratio of missing values in testing dataset
missing keyword:  0.01
missing location:  0.34


=> Since the test dataset also contains missing values in 'keyword' and 'location', we will drop these columns and use 'text' column only.

## 3. Preprocessing

In [7]:
# Drop irrelevant columns
train = train.drop(["keyword", "location"], axis=1)
test = test.drop(["keyword", "location"], axis=1)

train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Data Split
X_train, X_test, y_train, y_test = train_test_split(train.loc[:, "id":"text"], train.loc[:, ["target"]], 
                                    test_size=0.3, random_state=0, stratify=train.loc[:, ["target"]])

print(X_train.shape, X_test.shape)

(5329, 2) (2284, 2)


In [9]:
# Store "id"
id_train = train["id"]
id_test = test["id"]

In [10]:
# Get a set of stopwords
stops = set(stopwords.words('english'))

# Iterate through each sentence in the file
words_model = []
words = []

for n in range(X_train["text"].size):
    
    for i in sent_tokenize(X_train["text"].iat[n,]):
        temp = []

    # Tokenize the sentence into words
        for j in word_tokenize(i):
            # Remove stopwords
            if j.lower() not in stops:
                temp.append(j.lower())
                words.append(j.lower())     # list of words
 
        words_model.append(temp)
 

In [11]:
# Create Skip Gram model
model = Word2Vec(words_model, min_count = 1, vector_size = 100, window = 5, sg = 1)

In [12]:
# Training models
model.train(X_train["text"].to_list(), total_examples=X_train["text"].size, epochs=100)

(21907348, 53788100)

In [13]:
# Decompose sentences
def text_to_vec(text_df, model):

    vect_X = [0] * text_df.shape[0]

    for n in range(text_df["text"].size):
        vector_for_sentence = np.zeros((100,))
        # Decompose dataframe into sentences
        for s in sent_tokenize(text_df["text"].iat[n,]):
            # Decompose sentences into words
            for w in word_tokenize(s):
                if w.lower() in words:
                # if w.lower() in model.wv:
                    vector_for_sentence += model.wv[w.lower()]         # sum of word vectors
            vect_X[n] = vector_for_sentence     # replace to sentence vector

    return np.array(vect_X)

In [14]:
vect_Xtr = text_to_vec(X_train, model)
vect_Xte = text_to_vec(X_test, model)

## 4. Apply ML models

### 4-1. Logistic Regression

In [15]:
# Fit the data
reg_model = LogisticRegression()
reg_model.fit(vect_Xtr, y_train)

LogisticRegression()

In [16]:
yr_pred = reg_model.predict(vect_Xte)

error_reg = metrics.f1_score(y_test, yr_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')
# error_logreg = metrics.roc_auc_score(y_test, yl_pred1)   # AUC(area under curve) of ROC curve
print('Accuracy of Logistic Regression classifier with Skip Gram model on test set: {:.2f}'.format(error_reg))  # 0.60

Accuracy of Logistic Regression classifier with Skip Gram model on test set: 0.60


### 4-2. Linear SVC

In [17]:
# Fit the data
linearsvc = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
linearsvc.fit(vect_Xtr, y_train)

# print(linearsvc.named_steps['linearsvc'].coef_)
# print(linearsvc.named_steps['linearsvc'].intercept_)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [18]:
ys_pred = linearsvc.predict(vect_Xte)
error_svc = metrics.f1_score(y_test, ys_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')
                                
print('Accuracy of Linear SVC classifier on test set: {:.2f}'.format(error_svc))  # 0.61

Accuracy of Linear SVC classifier on test set: 0.61


### 4-3. Gaussian Naive Bayes

In [19]:
gnb = GaussianNB()
gnb.fit(vect_Xtr, y_train)

GaussianNB()

In [20]:
yg_pred = gnb.predict(vect_Xte)

error_gnb = metrics.f1_score(y_test, yg_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')
                                
print('Accuracy of Gaussian Naive Bayes on test set: {:.2f}'.format(error_gnb))  # 0.62

Accuracy of Gaussian Naive Bayes on test set: 0.62


## 5. Apply to the testing dataset

In [21]:
# Vectorize original train & test data set
vect_train = text_to_vec(train, model) 
vect_test = text_to_vec(test, model)   

In [22]:
# Fit
linearsvc.fit(vect_train, train['target'])
gnb.fit(vect_train, train['target'])

GaussianNB()

In [23]:
# Predict
pred_SVC = pd.DataFrame(linearsvc.predict(vect_test))
pred_NB = pd.DataFrame(gnb.predict(vect_test))

In [24]:
# Add 'id'
pred_SVC = pd.concat([id_test, pred_SVC], axis=1)
pred_SVC = pred_SVC.rename(columns={0: "target"})

pred_NB = pd.concat([id_test, pred_NB], axis=1)
pred_NB = pred_NB.rename(columns={0: "target"})

In [25]:
# Create csv file
submit_SVC = pred_SVC.to_csv('submission_SVC.csv', index=False) # score : 0.72571
submit_NB = pred_NB.to_csv('submission_NB.csv', index=False)    # score : 0.58320

--------------

## 6. How can we improve accuracy?

### 6-1. Improve preprocessing

In [26]:
# Convert a collection of text documents to a matrix of token counts
count_vectorizer = CountVectorizer(stop_words="english")
count_Xtr = count_vectorizer.fit_transform(X_train["text"].values).toarray()
count_Xte = count_vectorizer.transform(X_test["text"].values).toarray()

print(count_Xtr.shape, count_Xtr[10].shape, type(count_Xtr))

(5329, 16619) (16619,) <class 'numpy.ndarray'>


In [27]:
# Logistic Regression
reg_model_count = LogisticRegression()
reg_model_count.fit(count_Xtr, y_train)
cout_yr_pred = reg_model_count.predict(count_Xte)
error_reg_count = metrics.f1_score(y_test, cout_yr_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')
print('Accuracy of Logistic Regression classifier with Skip Gram model on test set: {:.2f}'.format(error_reg_count))    # 0.75

Accuracy of Logistic Regression classifier with Skip Gram model on test set: 0.75


In [28]:
# Linear SVC
svc_count = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
svc_count.fit(count_Xtr, y_train)
count_ys_pred = svc_count.predict(count_Xte)
error_svc_count = metrics.f1_score(y_test, count_ys_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')
print('Accuracy of Linear SVC classifier on test set: {:.2f}'.format(error_svc_count))  # 0.66

Accuracy of Linear SVC classifier on test set: 0.66


In [29]:
# Gaussian Naive Bayes
gnb_count = GaussianNB()
gnb_count.fit(count_Xtr, y_train)
count_yg_pred = gnb_count.predict(count_Xte)
error_gnb_count = metrics.f1_score(y_test, count_yg_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')                              
print('Accuracy of Gaussian Naive Bayes on test set: {:.2f}'.format(error_gnb_count))   #0.64


Accuracy of Gaussian Naive Bayes on test set: 0.64


In [30]:
# Vectorize original train & test data set
count_train = count_vectorizer.fit_transform(train["text"].values).toarray()
count_test = count_vectorizer.transform(test["text"].values).toarray()

# Fit
reg_model_count.fit(count_train, train['target'])

# Predict 
pred_REG = pd.DataFrame(reg_model_count.predict(count_test))

# Add 'id'
pred_REG = pd.concat([id_test, pred_REG], axis=1)
pred_REG = pred_REG.rename(columns={0: "target"})

# Create csv file
submit_REG = pred_REG.to_csv('submission_count_REG.csv', index=False)    # score : 0.80018

------

### 6-2. Improve ML model

In [None]:
# Add intercepts(ones)
Xtr1 = np.hstack([np.ones((vect_Xtr.shape[0],1)), vect_Xtr[:,:] ])   
ytr1 = y_train
print("Xtr1: " + str(Xtr1.shape), " Ytr1: "+ str(ytr1.shape))

In [None]:
# Classification (Logistic Regression)

# Make a prediction with weights
def predict(x, w):
	z = w.dot(x)
	return 1.0 / (1.0 + np.exp(-z))

# Estimate coefficients using stochastic gradient descent
def train_weights(X, y, l_epoch_span, epoch_size, weights, threshold=0.002):
  n, m    = X.shape   # n= , m=
  batch_size = 25

  for batch in range(epoch_size):  # batch = 0, 1, , ..., 49
    l_rate = l_epoch_span[batch]  # learning rate

    # Randomly select 25 numbers
    arr = np.arange(n)
    indices = np.random.choice(arr, size=batch_size)  

    sum_error = 0   # summed errors from each batch
    for b in range(batch_size):  # b = 0, 1, ..., 24
      ind = indices[b]
      prediction = predict(X[ind,:], weights)
      error = abs(prediction - y[ind])
      sum_error += error
      weights = weights - 1.00 * l_rate * (sum_error / batch_size) * X[ind,:]

    print('sum_error at batch #' + str(batch) + ' is ', str(sum_error))
  
    if sum_error<threshold:
      break

  return weights

In [None]:
epoch_size = 50
n_span = np.arange(epoch_size)
l_epoch_span = 1/((1+(2*n_span))**3)    # list of learning rates
init_weights = np.zeros((1,Xtr1.shape[1]))
weights = train_weights(Xtr1, ytr1, l_epoch_span, epoch_size, init_weights) 

In [None]:
# Validate
for i in range(Xtr1.shape[0]):
    Ypred1 = predict(Xtr1[i,:], weights)

print(Ypred1.size, type(Ypred1))