# 1. Import Packages

In [3]:
import numpy as np 
import pandas as pd 
import nltk
import matplotlib.pyplot as plt
import random
import re
import string
import pickle
from nltk.corpus import stopwords # module for stop words
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [4]:
# # download the stop words
# nltk.download('stopwords')

# 2. Data Cleaning

## 2.1. Explore the dataset
Look at the number of training dataset that is with label 1 (related to disaster) versus lable 0 (not related to disaster)

In [5]:
# import train dataset
raw_df = pd.read_csv("/kaggle/input/nlpgettingstarted/train.csv")
test_df = pd.read_csv("/kaggle/input/nlpgettingstarted/test.csv")
# check the number of dataset with label 1 and 0 
print("number of disaster sample:", len(raw_df[raw_df["target"] == 1]))
print("number of not disaster sample:", len(raw_df[raw_df["target"] == 0]))

number of disaster sample: 3271
number of not disaster sample: 4342


## 2.2 Train/Validation Split
Since the test dataset is given, I am creating the train and validation sets to find the best $\theta$ that minimize the cost without overfitting
* Train: 80% of disaster sample + 80% of not disaster sample
* Validation: 20% of disaster sample + 20% of not disaster sample

In [6]:
# disaster sample
X_train_dis, X_val_dis, y_train_dis, y_val_dis = train_test_split(raw_df[raw_df["target"] == 1]['text'], 
                                                                    raw_df[raw_df["target"] == 1]['target'],
                                                                    test_size=0.20, random_state=42, shuffle=True)

# not disaster sample 
X_train_ndis, X_val_ndis, y_train_ndis, y_val_ndis = train_test_split(raw_df[raw_df["target"] == 0]['text'], 
                                                                    raw_df[raw_df["target"] == 0]['target'],
                                                                    test_size=0.20, random_state=42, shuffle=True)
X_train = pd.concat([X_train_dis, X_train_ndis], axis = 0).reset_index(drop = True).to_frame()
y_train = pd.concat([y_train_dis, y_train_ndis], axis = 0).reset_index(drop = True).to_frame()
train_df = pd.concat([X_train, y_train], axis = 1)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle dataset


X_val = pd.concat([X_val_dis, X_val_ndis], axis = 0).reset_index(drop = True).to_frame()
y_val = pd.concat([y_val_dis, y_val_ndis], axis = 0).reset_index(drop = True).to_frame()
val_df = pd.concat([X_val, y_val], axis = 1)
val_df = val_df.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle dataset


In [7]:
train_df.shape

(6089, 2)

## 2.2. Data Cleaning and Tokenizing
1. Remove the hash tag
2. Remove hyperlink
3. Remove any word that start with @
4. Tokenize the text
5. Remove stop words
6. Remove punctuation
7. Stemming

In [8]:
# Initialize tokenizer, which will make the string to be list and lowercase all the words
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)

# Initialize stemmer, which will be used to stem the word
stemmer = PorterStemmer()

In [9]:
def clean_tokenize(text):
    """Process text function.
    Input:
        text: the text of the tweet
    Output:
        clean_token: a list of words containing the processed tweet

    """
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    
    # remove hyperlink
    text = re.sub(r'https?://[^\s\n\r]+', '', text)

    # remove @
    text = re.sub('@.*? ', '', text)

    # remove ...
    text = re.sub('\.\.\.', '', text)

    # remove \x89
    text = re.sub(r'\x89', '', text)
    
    # remove could, would, should
    text = re.sub(r'\b(could|would|should)\b', '', text, flags=re.IGNORECASE)

    # Tokenize the text, which will also lowercase the word
    text_token = tokenizer.tokenize(text)

    # remove stop words, punctuation and stem the word
    clean_token = []
    for word in text_token:
        if (word not in stopwords.words('english') and  # remove stopwords
            word not in string.punctuation):   # remove punctuation
            # stemming
            clean_word = stemmer.stem(word)
            # remove not readable words and digit
            clean_word = re.sub(r'[^a-zA-Z]', ' ', clean_word)


            clean_token.append(clean_word)
    return clean_token


In [10]:
# clean training dataset

train_df['Remove_Hash_Link_At'] = train_df['text'].map(clean_tokenize)

# clean validation dataset
val_df['Remove_Hash_Link_At'] = val_df['text'].map(clean_tokenize)

# clean testing datast
test_df['Remove_Hash_Link_At'] = test_df['text'].map(clean_tokenize)

# Vanishing Gradient Return
1. Apply TfidfVectorizer

    a. TfidfVectorizer is a tool from scikit-learn that transforms a list of text documents into a matrix of TF-IDF features.

   
    b. TF-IDF = Term Frequency – Inverse Document Frequency

2. Scale the Dataset

    a. Upon the first logistic regression training, I had discovered my model face the vanishing gradient return, where the gradient vector is too small for the model to update the $\theta$ vector. To avoid this issue, I had decided to scale the data prior to training my logistic regression model.

3. Apply PCA



In [11]:
# Combine the text to fit into the TfidVectorizer
train_df["Remove_Hash_Link_At_Combine"] = train_df['Remove_Hash_Link_At'].apply(lambda tokens: ' '.join(tokens))

In [12]:
# Apply TfidVectorizer
vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(train_df["Remove_Hash_Link_At_Combine"])
train_tfidf = pd.DataFrame(train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
train_tfidf.index = train_df.index  # So we can merge later

In [13]:
# Scale the Data
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_tfidf)

In [14]:
# Apply the PCA
pca = PCA(n_components=0.50)
train_pca = pca.fit_transform(train_scaled)

train_pca_df = pd.DataFrame(train_pca, columns=[f"PCA_{i+1}" for i in range(train_pca.shape[1])], index=train_df.index)

In [15]:
train_df_fnl = pd.concat([train_df, train_pca_df], axis = 1)
train_df_fnl

Unnamed: 0,text,target,Remove_Hash_Link_At,Remove_Hash_Link_At_Combine,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,...,PCA_1177,PCA_1178,PCA_1179,PCA_1180,PCA_1181,PCA_1182,PCA_1183,PCA_1184,PCA_1185,PCA_1186
0,Do you feel deluged by low self-image? Take th...,0,"[feel, delug, low, self imag, take, quiz]",feel delug low self imag take quiz,-0.081771,-0.083898,-0.056561,-0.088035,-0.051834,0.619031,...,0.142783,0.481812,-0.215890,-0.066292,-1.136678,-0.831191,-0.172309,-0.685000,-0.729436,0.876607
1,I'm drowning in spirits to wash you out,0,"[drown, spirit, wash]",drown spirit wash,0.045133,-0.020062,-0.037846,-0.072852,-0.006315,-0.035116,...,1.602681,-0.135783,1.038651,0.838980,-0.052299,-0.343928,0.240409,0.174168,-0.916541,0.044347
2,Lunch for the crew is made. Night night it's b...,0,"[lunch, crew, made, night, night, long, day, p...",lunch crew made night night long day peac love...,-0.102266,-0.077306,-0.066704,-0.000926,-0.075748,-0.045665,...,-0.049220,0.246913,-1.593456,-0.722533,0.208101,-0.293919,0.462922,-0.457073,-0.458133,-0.173036
3,Baltimore City : I-95 NORTH AT MP 54.8 (FORT M...,1,"[baltimor, citi, , north, mp, , fort, mc...",baltimor citi north mp fort mchenri tu...,-0.113125,-0.128287,0.240315,-0.110209,-0.137487,-0.025825,...,8.335793,-4.285306,-2.890552,3.269782,5.830762,3.924183,-2.320754,-0.867953,-5.543731,2.258131
4,I concur. The longer you spend with your child...,0,"[concur, longer, spend, child, harm, mmk]",concur longer spend child harm mmk,-0.084877,-0.062786,-0.039423,-0.022109,-0.066308,-0.052131,...,-1.808606,-0.976440,-1.680733,1.124999,0.667636,-1.756042,2.074628,-3.507022,1.991084,0.194354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6084,If I could I would have been by at work but go...,0,"[work, got, injur, secur, concern, must, settl...",work got injur secur concern must settl tortur,-0.093739,-0.062913,-0.063302,-0.070698,-0.080632,-0.075403,...,2.813912,-0.180907,3.235155,0.377263,0.141711,2.957237,0.468470,-1.581295,-0.186921,-3.779953
6085,@UntamedDirewolf 'I... Wow. Alright.' Sansa sh...,0,"[wow, alright, sansa, shook, head, blink, rapi...",wow alright sansa shook head blink rapidli new...,-0.111144,-0.114540,-0.038543,-0.065219,-0.063372,-0.074911,...,-1.471872,0.355770,1.922146,-1.504906,3.387658,-2.724790,2.388234,-1.185526,-2.411593,-2.784889
6086,well it feels like im on fire.,0,"[well, feel, like, im, fire]",well feel like im fire,-0.089198,-0.071518,-0.053366,0.033320,-0.029724,-0.010348,...,-0.408594,0.188136,-0.051951,-0.089511,-0.257227,0.030366,-0.155967,-0.018267,-0.507889,0.162730
6087,We destroyed the #Zimmerman fan club on Twitte...,0,"[destroy, zimmerman, fan, club, twitter, oblit...",destroy zimmerman fan club twitter obliter ren...,-0.107102,0.015778,-0.077112,-0.110659,-0.117066,-0.072565,...,0.137081,0.112680,0.012523,0.199074,-0.641680,-0.328984,0.062118,-0.840016,0.947162,-2.301163


# Find the Top PCA Contributing Words

In [16]:
def keep_only_top_words(word_lst):
    return [word for word in word_lst if word in top_words]

In [17]:
train_df_fnl

Unnamed: 0,text,target,Remove_Hash_Link_At,Remove_Hash_Link_At_Combine,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,...,PCA_1177,PCA_1178,PCA_1179,PCA_1180,PCA_1181,PCA_1182,PCA_1183,PCA_1184,PCA_1185,PCA_1186
0,Do you feel deluged by low self-image? Take th...,0,"[feel, delug, low, self imag, take, quiz]",feel delug low self imag take quiz,-0.081771,-0.083898,-0.056561,-0.088035,-0.051834,0.619031,...,0.142783,0.481812,-0.215890,-0.066292,-1.136678,-0.831191,-0.172309,-0.685000,-0.729436,0.876607
1,I'm drowning in spirits to wash you out,0,"[drown, spirit, wash]",drown spirit wash,0.045133,-0.020062,-0.037846,-0.072852,-0.006315,-0.035116,...,1.602681,-0.135783,1.038651,0.838980,-0.052299,-0.343928,0.240409,0.174168,-0.916541,0.044347
2,Lunch for the crew is made. Night night it's b...,0,"[lunch, crew, made, night, night, long, day, p...",lunch crew made night night long day peac love...,-0.102266,-0.077306,-0.066704,-0.000926,-0.075748,-0.045665,...,-0.049220,0.246913,-1.593456,-0.722533,0.208101,-0.293919,0.462922,-0.457073,-0.458133,-0.173036
3,Baltimore City : I-95 NORTH AT MP 54.8 (FORT M...,1,"[baltimor, citi, , north, mp, , fort, mc...",baltimor citi north mp fort mchenri tu...,-0.113125,-0.128287,0.240315,-0.110209,-0.137487,-0.025825,...,8.335793,-4.285306,-2.890552,3.269782,5.830762,3.924183,-2.320754,-0.867953,-5.543731,2.258131
4,I concur. The longer you spend with your child...,0,"[concur, longer, spend, child, harm, mmk]",concur longer spend child harm mmk,-0.084877,-0.062786,-0.039423,-0.022109,-0.066308,-0.052131,...,-1.808606,-0.976440,-1.680733,1.124999,0.667636,-1.756042,2.074628,-3.507022,1.991084,0.194354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6084,If I could I would have been by at work but go...,0,"[work, got, injur, secur, concern, must, settl...",work got injur secur concern must settl tortur,-0.093739,-0.062913,-0.063302,-0.070698,-0.080632,-0.075403,...,2.813912,-0.180907,3.235155,0.377263,0.141711,2.957237,0.468470,-1.581295,-0.186921,-3.779953
6085,@UntamedDirewolf 'I... Wow. Alright.' Sansa sh...,0,"[wow, alright, sansa, shook, head, blink, rapi...",wow alright sansa shook head blink rapidli new...,-0.111144,-0.114540,-0.038543,-0.065219,-0.063372,-0.074911,...,-1.471872,0.355770,1.922146,-1.504906,3.387658,-2.724790,2.388234,-1.185526,-2.411593,-2.784889
6086,well it feels like im on fire.,0,"[well, feel, like, im, fire]",well feel like im fire,-0.089198,-0.071518,-0.053366,0.033320,-0.029724,-0.010348,...,-0.408594,0.188136,-0.051951,-0.089511,-0.257227,0.030366,-0.155967,-0.018267,-0.507889,0.162730
6087,We destroyed the #Zimmerman fan club on Twitte...,0,"[destroy, zimmerman, fan, club, twitter, oblit...",destroy zimmerman fan club twitter obliter ren...,-0.107102,0.015778,-0.077112,-0.110659,-0.117066,-0.072565,...,0.137081,0.112680,0.012523,0.199074,-0.641680,-0.328984,0.062118,-0.840016,0.947162,-2.301163


In [18]:
feature_names = vectorizer.get_feature_names_out()
n_feature_words = len(feature_names)

importance = np.sum(np.abs(pca.components_), axis=0)

top_indices = [i for i, val in enumerate(importance) if val > np.percentile(importance, 50)]
top_words = feature_names[top_indices]


In [19]:
n_feature_words

9851

In [22]:
train_df_fnl["top_words"] = train_df_fnl["Remove_Hash_Link_At"].apply(keep_only_top_words)
train_df_fnl["top_word_combine"] = train_df_fnl["top_words"].apply(lambda words: " ".join(words))
val_df_fnl = val_df.copy()
test_df_fnl = test_df.copy()
val_df_fnl["top_words"] = val_df["Remove_Hash_Link_At"].apply(keep_only_top_words)
val_df_fnl["top_word_combine"] = val_df_fnl["top_words"].apply(lambda words: " ".join(words))
test_df_fnl["top_words"]= test_df["Remove_Hash_Link_At"].apply(keep_only_top_words)
test_df_fnl["top_word_combine"] = test_df_fnl["top_words"].apply(lambda words: " ".join(words))

In [23]:
train_df_fnl

Unnamed: 0,text,target,Remove_Hash_Link_At,Remove_Hash_Link_At_Combine,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,...,PCA_1179,PCA_1180,PCA_1181,PCA_1182,PCA_1183,PCA_1184,PCA_1185,PCA_1186,top_words,top_word_combine
0,Do you feel deluged by low self-image? Take th...,0,"[feel, delug, low, self imag, take, quiz]",feel delug low self imag take quiz,-0.081771,-0.083898,-0.056561,-0.088035,-0.051834,0.619031,...,-0.215890,-0.066292,-1.136678,-0.831191,-0.172309,-0.685000,-0.729436,0.876607,"[feel, delug, low, take, quiz]",feel delug low take quiz
1,I'm drowning in spirits to wash you out,0,"[drown, spirit, wash]",drown spirit wash,0.045133,-0.020062,-0.037846,-0.072852,-0.006315,-0.035116,...,1.038651,0.838980,-0.052299,-0.343928,0.240409,0.174168,-0.916541,0.044347,[spirit],spirit
2,Lunch for the crew is made. Night night it's b...,0,"[lunch, crew, made, night, night, long, day, p...",lunch crew made night night long day peac love...,-0.102266,-0.077306,-0.066704,-0.000926,-0.075748,-0.045665,...,-1.593456,-0.722533,0.208101,-0.293919,0.462922,-0.457073,-0.458133,-0.173036,"[lunch, crew, made, night, night, long, day, p...",lunch crew made night night long day peac rescu
3,Baltimore City : I-95 NORTH AT MP 54.8 (FORT M...,1,"[baltimor, citi, , north, mp, , fort, mc...",baltimor citi north mp fort mchenri tu...,-0.113125,-0.128287,0.240315,-0.110209,-0.137487,-0.025825,...,-2.890552,3.269782,5.830762,3.924183,-2.320754,-0.867953,-5.543731,2.258131,"[citi, north, fort, mchenri, tunnel, bore, col...",citi north fort mchenri tunnel bore collis nor...
4,I concur. The longer you spend with your child...,0,"[concur, longer, spend, child, harm, mmk]",concur longer spend child harm mmk,-0.084877,-0.062786,-0.039423,-0.022109,-0.066308,-0.052131,...,-1.680733,1.124999,0.667636,-1.756042,2.074628,-3.507022,1.991084,0.194354,"[spend, child]",spend child
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6084,If I could I would have been by at work but go...,0,"[work, got, injur, secur, concern, must, settl...",work got injur secur concern must settl tortur,-0.093739,-0.062913,-0.063302,-0.070698,-0.080632,-0.075403,...,3.235155,0.377263,0.141711,2.957237,0.468470,-1.581295,-0.186921,-3.779953,"[work, got, injur, secur, must]",work got injur secur must
6085,@UntamedDirewolf 'I... Wow. Alright.' Sansa sh...,0,"[wow, alright, sansa, shook, head, blink, rapi...",wow alright sansa shook head blink rapidli new...,-0.111144,-0.114540,-0.038543,-0.065219,-0.063372,-0.074911,...,1.922146,-1.504906,3.387658,-2.724790,2.388234,-1.185526,-2.411593,-2.784889,"[shook, rapidli, new, inform, realli]",shook rapidli new inform realli
6086,well it feels like im on fire.,0,"[well, feel, like, im, fire]",well feel like im fire,-0.089198,-0.071518,-0.053366,0.033320,-0.029724,-0.010348,...,-0.051951,-0.089511,-0.257227,0.030366,-0.155967,-0.018267,-0.507889,0.162730,"[well, feel, like, fire]",well feel like fire
6087,We destroyed the #Zimmerman fan club on Twitte...,0,"[destroy, zimmerman, fan, club, twitter, oblit...",destroy zimmerman fan club twitter obliter ren...,-0.107102,0.015778,-0.077112,-0.110659,-0.117066,-0.072565,...,0.012523,0.199074,-0.641680,-0.328984,0.062118,-0.840016,0.947162,-2.301163,"[destroy, zimmerman, fan, twitter, renewsit, r...",destroy zimmerman fan twitter renewsit reduc s...


In [24]:
len(top_words)

4925

# 3. Build Word Dictionary
Use the training dataset to build the word dictionary. The word dictionary will use the ($word_i$, $label_i$) as key and the count of ($word_i$, $label_i$) occurrence as value

In [25]:
def build_word_dict(label_arr, token_word_arr):
    """Build frequencies.
    Input:
        token_word: a series of list of tokenized word
        label: a series of label that match the array of the list of tokenized word
    Output:
        freqs: a dictionary mapping each (word, label) pair to its frequency
    """
    word_dict = {}
    y_list = list(label_arr) # make array into list

    for label_idx in range(len(y_list)):

        for word in token_word_arr[label_idx]:
            word_dict[(word, y_list[label_idx])] = word_dict.get((word, y_list[label_idx]), 0) + 1
    return word_dict
        
word_dict = build_word_dict(train_df_fnl['target'], train_df_fnl["top_words"])    
  

    

In [26]:
print("number of words:", len(word_dict))
print("Output Example:", list(word_dict.items())[:10])

number of words: 6427
Output Example: [(('feel', 0), 64), (('delug', 0), 43), (('low', 0), 15), (('take', 0), 52), (('quiz', 0), 13), (('spirit', 0), 7), (('lunch', 0), 4), (('crew', 0), 2), (('made', 0), 28), (('night', 0), 31)]


# Extract Information

$$X_m = [x_0, x_1, x_2]
    =[1, \sum_{i=1}^{m} Freq(_i, 1), \sum_{i=1}^{m}  Freq(w_i,0)]$$

$x_0$: bias

$x_1$: number of postive label for this word from the dictionary

$x_2$: number of negative label for this word from the dictionary

$m$: number of training tweets

In [27]:
def extract_info(word_dict,token_word_arr):
    """Extract Information from Each Tweets.
    Input:
        token_word_arr: a series of list of tokenized word
        word_dic: the dictionary with (w, label) as key and frequency as value
    Output:
        1. a list of total frequency for each word in the tweet that associate with disaster from the dictionary
        2. a list of total frequency for each word in the tweet that associate with not disaster from the dictionary
    """
    cnt_dis_word = 0
    cnt_non_dis_word = 0
    dis_lst = []
    non_dis_lst = []

    # iterate all rows
    for idx in range(len(token_word_arr)):
        # iterate all words at the token_word_arr[idx]
        for word in token_word_arr[idx]:
            cnt_dis_word += word_dict.get((word, 1), 0)
            cnt_non_dis_word += word_dict.get((word, 0), 0)

        dis_lst.append(cnt_dis_word)
        non_dis_lst.append(cnt_non_dis_word)
    return dis_lst, non_dis_lst
    

        

In [28]:
# Train Dataset
train_df_fnl['disaster_score'], train_df_fnl['non_disaster_score']= extract_info(word_dict,train_df_fnl["top_words"])
train_df_fnl['bias'] = [1]*len(train_df_fnl) # add bias term
X_train, y_train = train_df_fnl[["bias", "disaster_score", 'non_disaster_score']], train_df_fnl['target']

# Validation Dataset
val_df_fnl['disaster_score'], val_df_fnl['non_disaster_score']= extract_info(word_dict,val_df_fnl['top_words'])
val_df_fnl['bias'] = [1]*len(val_df_fnl) # add bias term
X_val, y_val = val_df_fnl[["bias", "disaster_score", 'non_disaster_score']], val_df_fnl['target']

# Test Dataset
test_df_fnl['disaster_score'], test_df_fnl['non_disaster_score']= extract_info(word_dict,test_df_fnl['top_words'])
test_df_fnl['bias'] = [1]*len(test_df_fnl) # add bias term
X_test = test_df_fnl[["bias", "disaster_score", 'non_disaster_score']]

In [29]:
print(X_train.shape) #(m, 3)
print(X_val.shape) #(m, 3)


y_train = y_train.to_numpy().reshape(-1, 1)
y_val = y_val.to_numpy().reshape(-1, 1)
print(y_train.shape) #(m,1)
print(y_val.shape) # (m, 1)


(6089, 3)
(1524, 3)
(6089, 1)
(1524, 1)


# Overfitting
To avoid overfitting, I plan to implement the L2 Regression
$$\lambda*\frac{1}{2}\mathbf{\theta}^T\mathbf{\theta}$$

# Logistic Regression

* Sigmoid Function
$$ h(z) = \frac{1}{1+\exp^{-z}} $$
$$z = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 + ... \theta_N x_N = x^Tz$$

* Loss Function
$$J = \frac{-1}{m} \times \left(\mathbf{y}^T \cdot log(\mathbf{h}) + \mathbf{(1-y)}^T \cdot log(\mathbf{1-h}) \right)$$
    * By Taking Derivative of Log of Likelihood: $$\nabla_{\theta_j}J(\theta) = \frac{1}{m} \sum_{i=1}^m(h^{(i)}-y^{(i)})x^{(i)}_j $$
* Loss Function with Ridge Regression
$$J = \frac{-1}{m} \times \left(\mathbf{y}^T \cdot log(\mathbf{h}) + \mathbf{(1-y)}^T \cdot log(\mathbf{1-h}) \right)+\frac{\lambda}{2m}\mathbf{\theta}^T\mathbf{\theta}$$
    * By Taking Derivative of Log of Likelihood: $$\nabla_{\theta_j}J(\theta) = \frac{1}{m} (\sum_{i=1}^m(h^{(i)}-y^{(i)})x^{(i)}_j + \lambda \theta_j )$$
* Gradient Descent
$$\theta_j = \theta_j - \alpha \times \nabla_{\theta_j}J(\theta) $$
$$\mathbf{\theta} = \mathbf{\theta} - \frac{\alpha}{m} \times \left( \mathbf{x}^T \cdot \left( \mathbf{h-y} \right) \right)$$

$\alpha$: learning rate



In [30]:
def sigmoid(z):
    """Calculate sigmoid.
    Input: a digit
    Output: the sigmoid of the digit
    """   
    z = np.clip(z, -500, 500)  # Prevent overflow
    return 1/(1+np.exp(-z))

In [31]:
def accuracy(pred_label, label_y):
    """Calculate accuracy.
    Input: predicted Y and actual Y value
    Output: a list contains 0 or 1 if the predicted Y is the same as label_y
    """  
    match_list = (pred_label == label_y).astype(int)
    accuracy_percentage = sum(match_list)/ len(match_list)
    return accuracy_percentage


In [32]:
def gradDescent(alpha, theta, train_X, train_Y, val_X, val_Y,iteration, lam):
    """Calculate sigmoid.
    Input: a digit
    Output: the sigmoid of the digit
    """   
    # initialize the train and validation loss list, also the theta
    train_loss = []
    val_loss = []
    train_accuracy = []
    val_accuracy = []
    theta = np.zeros((3,1))

    # optimal_iter = 0

    # number of examples
    train_m = len(train_Y)  
    val_m = len(val_Y)
    
    
    for itera in range(iteration):

        # calculate the predicted y
        z = np.dot(train_X,theta) #(m,1)

        # put the predicted y into sigmoid
        sig = sigmoid(z) #(m,1)

        # Compute the gradient
        gradient = (1/train_m) * np.dot(train_X.T, (sig - train_Y)) #(3,1)
        gradient += (lam / train_m) * np.r_[[[0]], theta[1:]]  # exclude bias


        # update theta
        theta = theta - gradient * alpha #(3,1)


        # calculate the training loss
        train_z = np.dot(train_X,theta)        
        train_sig = sigmoid(train_z)
        train_sig = np.clip(train_sig, 1e-10, 1 - 1e-10)  # Prevent log(0)
        train_loss_val = (-1/train_m)* (train_Y.T @ np.log(train_sig)+(1-train_Y).T@ np.log(1-train_sig))[0,0]+(lam / (2 * train_m)) * np.sum(theta[1:] ** 2)
        train_loss.append(train_loss_val)

        # predict the training label
        pred_label_train = (train_sig >= 0.5).astype(int)
        # calculate the training accuray
        train_accuracy.append(accuracy_score(pred_label_train,train_Y))
        
        
        # calculate the validation loss
        val_z = np.dot(val_X,theta)
        val_sig = sigmoid(val_z)
        val_sig = np.clip(val_sig, 1e-10, 1 - 1e-10)  # Prevent log(0)
        val_loss_val = (-1/val_m)* (val_Y.T @ np.log(val_sig)+(1-val_Y).T@ np.log(1-val_sig))[0,0]
        val_loss.append(val_loss_val)   
        
        # predict the validation label
        pred_label_val = (val_sig  >= 0.5).astype(int)
        # calculate the training accuray
        val_accuracy.append(accuracy_score(pred_label_val,val_Y))
        


        
        # # update the theta optimal theta

        # curr_train_val_loss = np.log(train_loss_val) + np.log(val_loss_val)
        
        # if curr_train_val_loss < min_train_val_loss:
        #     theta_optimal = theta
        #     min_train_val_loss = curr_train_val_loss
        #     optimal_iter = itera

    
    return theta, train_loss, val_loss, train_accuracy, val_accuracy
    
    
    
    
    
    


In [33]:
# y_axis[int(5500/500)]

In [34]:
theta = np.zeros((3,1))
alpha = 0.0001
iteration = 1000
lam = 0.0001
theta, train_loss, val_loss, train_accuracy, val_accuracy = gradDescent(alpha,theta,X_train,y_train, X_val, y_val,iteration, lam)

In [35]:
print(theta)
print(np.mean(train_accuracy))
print(np.mean(val_accuracy))
print(train_accuracy)

[[ 6.08402788e-04]
 [-1.07658292e+01]
 [-1.26001344e+01]]
0.5108356051896863
0.5108372703412073
[0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.42962719658400395, 0.570372803415996, 0.57037280

In [154]:
# import matplotlib.pyplot as plt
# iterations = range(1, iteration + 1)
# plt.figure(figsize=(20, 5))
# plt.plot(iterations, train_accuracy, label='Train Accuracy', color='blue')
# plt.plot(iterations, val_accuracy, label='Validation Accuracy', color='orange')
# plt.xlabel('Iteration')
# plt.ylabel('Accuracy')
# plt.title('Train vs Test Loss over Iterations')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.show()

# Naive Bayes

In [36]:
vectorizer = CountVectorizer()
train_X_nb = vectorizer.fit_transform(train_df_fnl["top_word_combine"])
val_X_nb = vectorizer.transform(val_df_fnl["top_word_combine"])

clf = MultinomialNB(alpha=1.0)
clf.fit(train_X_nb, train_df_fnl["target"])

train_pred = clf.predict(train_X_nb)
train_accuracy = accuracy_score(train_df_fnl['target'], train_pred)

val_pred = clf.predict(val_X_nb)
val_accuracy = accuracy_score(val_df_fnl['target'], val_pred)

print(train_accuracy)
print(val_accuracy)

0.8467728691082279
0.7703412073490814


Reference: [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview)