Link : https://www.kaggle.com/competitions/nlp-getting-started/overview/evaluation

## Blue Print

1. Check dataset
2. Cleaning
3. Preprocessing
4. Data Split (using stratified sampling)

## Error Function

$F_1 = 2\frac{precision * recall}{precision + recall}$ (1 is the best, 0 is the worst) where:
 
precision = $\frac{TP}{TP+FP}$, recall = $\frac{TP}{TP+FN}$

In [None]:
# sklearn.metrics.f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', 
# sample_weight=None, zero_division='warn')
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

## 1. Data Investigation

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
from gensim.models import Word2Vec
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

warnings.filterwarnings(action = 'ignore')

In [2]:
# Load dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(train.shape, test.shape)
train.head()

(7613, 5) (3263, 4)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Types of values in each column
print(train.dtypes)

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object


In [4]:
print(train[train["target"] == 0]["text"].values[10])       # not a disaster tweet
print(train[train["target"] == 1]["text"].values[1])  # disaster tweet

No way...I can't eat that shit
Forest fire near La Ronge Sask. Canada


## 2. Data Cleaning

In [5]:
# Check percentage of missing values
print("Ratio of missing values in training dataset")
print("missing keyword: ", str(round(train["keyword"].isnull().sum()/train.shape[0], 2)))
print("missing location: ", str(round(train["location"].isnull().sum()/train.shape[0], 2)), "\n")

print("Ratio of missing values in testing dataset")
print("missing keyword: ", str(round(test["keyword"].isnull().sum()/test.shape[0], 2)))
print("missing location: ", str(round(test["location"].isnull().sum()/test.shape[0], 2)))

Ratio of missing values in training dataset
missing keyword:  0.01
missing location:  0.33 

Ratio of missing values in testing dataset
missing keyword:  0.01
missing location:  0.34


=> Since the test dataset also contains missing values in 'keyword' and 'location', we will drop these columns and use 'text' column only.

## 3. Preprocessing

In [6]:
# Drop irrelevant columns
train = train.drop(["keyword", "location"], axis=1)
test = test.drop(["keyword", "location"], axis=1)

train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Data Split
X_train, X_test, y_train, y_test = train_test_split(train.loc[:, "id":"text"], train.loc[:, ["target"]], 
                                    test_size=0.3, random_state=0, stratify=train.loc[:, ["target"]])

print(X_train.shape, X_test.shape)

(5329, 2) (2284, 2)


In [8]:
# Store "id"
id_train = X_train["id"]
id_test = X_test["id"]
id_train

4812     6849
2080     2988
652       944
3528     5043
3007     4321
        ...  
7248    10378
7606    10866
1885     2708
3865     5496
3397     4864
Name: id, Length: 5329, dtype: int64

In [9]:
# Get a set of stopwords
stops = set(stopwords.words('english'))

# Iterate through each sentence in the file
words_model = []
words = []

for n in range(X_train["text"].size):
    
    for i in sent_tokenize(X_train["text"].iat[n,]):
        temp = []

    # Tokenize the sentence into words
        for j in word_tokenize(i):
            # Remove stopwords
            if j.lower() not in stops:
                temp.append(j.lower())
                words.append(j.lower())     # list of words
 
        words_model.append(temp)
 

In [10]:
# Create Skip Gram model
model = Word2Vec(words_model, min_count = 1, vector_size = 100, window = 5, sg = 1)

In [11]:
# Training models
model.train(X_train["text"].to_list(), total_examples=X_train["text"].size, epochs=100)

(21907038, 53788100)

In [12]:
# Decompose sentences
def text_to_vec(text_df, model):

    vect_X = [0] * text_df.shape[0]

    for n in range(text_df["text"].size):
        vector_for_sentence = np.zeros((100,))
        # Decompose dataframe into sentences
        for s in sent_tokenize(text_df["text"].iat[n,]):
            # Decompose sentences into words
            for w in word_tokenize(s):
                if w.lower() in words:
                # if w.lower() in model.wv:
                    vector_for_sentence += model.wv[w.lower()]         # sum of word vectors
            vect_X[n] = vector_for_sentence     # replace to sentence vector

    return np.array(vect_X)

In [13]:
vect_Xtr = text_to_vec(X_train, model)
vect_Xte = text_to_vec(X_test, model)

## 4. Apply ML models

### 4-1. Logistic Regression

In [14]:
# Fit the data
reg_model = LogisticRegression()
reg_model.fit(vect_Xtr, y_train)

LogisticRegression()

In [15]:
yr_pred = reg_model.predict(vect_Xte)

error_reg = metrics.f1_score(y_test, yr_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')
# error_logreg = metrics.roc_auc_score(y_test, yl_pred1)   # AUC(area under curve) of ROC curve
print('Accuracy of logistic regression classifier with Skip Gram model on test set: {:.2f}'.format(error_reg))  # 0.59

Accuracy of logistic regression classifier with Skip Gram model on test set: 0.59


### 4-2. Linear SVC

In [16]:
# Fit the data
linearsvc = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
linearsvc.fit(vect_Xtr, y_train)

# print(linearsvc.named_steps['linearsvc'].coef_)
# print(linearsvc.named_steps['linearsvc'].intercept_)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [17]:
ys_pred = linearsvc.predict(vect_Xte)
error_svc = metrics.f1_score(y_test, ys_pred, labels=None, pos_label=1, average='binary', 
                                sample_weight=None, zero_division='warn')
                                
print('Accuracy of linear SVC classifier on test set: {:.2f}'.format(error_svc))  # 0.63

Accuracy of linear SVC classifier on test set: 0.63


### 4-3. Naive Bayes

------------------------------------------

In [None]:
# Add intercepts(ones)
Xtr1 = np.hstack([np.ones((vect_Xtr1.shape[0],1)), vect_Xtr1[:,:] ])   
Ytr1 = y_train
print("Xtr1: " + str(Xtr1.shape), " Ytr1: "+ str(Ytr1.shape))

In [None]:
# Classification (Logistic Regression)

# Make a prediction with weights
def predict(x, w):
	z = w.dot(x)
	return 1.0 / (1.0 + np.exp(-z))

# Estimate coefficients using stochastic gradient descent
def train_weights(X, y, l_epoch_span, epoch_size, weights, threshold=0.002):
  n, m    = X.shape   # n= , m=
  batch_size = 25

  for batch in range(epoch_size):  # batch = 0, 1, , ..., 49
    l_rate = l_epoch_span[batch]  # learning rate

    # Randomly select 25 numbers
    arr = np.arange(n)
    indices = np.random.choice(arr, size=batch_size)  

    sum_error = 0   # summed errors from each batch
    for b in range(batch_size):  # b = 0, 1, ..., 24
      ind = indices[b]
      prediction = predict(X[ind,:], weights)
      error = abs(prediction - y[ind])
      sum_error += error
      weights = weights - 1.00 * l_rate * (sum_error / batch_size) * X[ind,:]

    print('sum_error at batch #' + str(batch) + ' is ', str(sum_error))
  
    if sum_error<threshold:
      break

  return weights

In [None]:
epoch_size = 50
n_span = np.arange(epoch_size)
l_epoch_span = 1/((1+(2*n_span))**3)    # list of learning rates
init_weights = np.zeros((1,Xtr1.shape[1]))
weights = train_weights(Xtr1, Ytr1, l_epoch_span, epoch_size, init_weights) 

In [None]:
# Validate
for i in range(Xtr1.shape[0]):
    Ypred1 = predict(Xtr1[i,:], weights)

print(Ypred1.size, type(Ypred1))

_____

In [None]:
# Convert a collection of text documents to a matrix of token counts
count_vectorizer = CountVectorizer(stop_words="english")
count_train = count_vectorizer.fit_transform(X_train["text"].values)
count_test = count_vectorizer.transform(X_test["text"].values)

print(count_train.shape, count_train[10].shape)

In [None]:
count_vectorizer.get_stop_words()

In [None]:
count_train[5000]

## 4. 

text preprocessing
- Word2Vec
- tweettoeknizer 
- tokenize -> lower -> Counter() and .most_common()
- remove stop words stopwords.word('english')? english_stops?
- Linear SVC
- Naive Bayes