## Notebook Preparation

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, NuSVC
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from bs4 import BeautifulSoup
from sklearn.model_selection import GridSearchCV
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk import ngrams
from nltk.corpus import stopwords

In [2]:
#define useful functions
def train_model(X,y, model, show_figures=True): 
    y_pred = cross_val_predict(model, X, y, cv=5)
    score = f1_score(y,y_pred)
    print("F1 Score = {0:.4f}".format(score))
    if show_figures:
        model.fit(X, y)
    return model, score

In [3]:
#read the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
#we will start off by looking at the text variable only
X_train = train[['text', 'target', 'id']].copy()
y_train = np.array(train['target']) #convert to an array for model
X_test = test[['text', 'id']].copy()

## Baseline Models - Before Cleaning

In my first attempt at a model, I would like to see the performance of a few simple models before cleaning the data. By understanding the performance of these models, I will understand the impact of my data cleaning steps. I will also try two tokenizers - CountVectorizer and TFIDF, to see which one performs better. I have disregarded the `keyword` variable in this analysis.

In [6]:
X_train.head()

Unnamed: 0,text,target,id
0,Our Deeds are the Reason of this #earthquake M...,1,1
1,Forest fire near La Ronge Sask. Canada,1,4
2,All residents asked to 'shelter in place' are ...,1,5
3,"13,000 people receive #wildfires evacuation or...",1,6
4,Just got sent this photo from Ruby #Alaska as ...,1,7


### Naive Bayes 

In [7]:
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text'].values)
X_test_vect = vectorizer.transform(X_test['text'].values)

In [8]:
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6753


In [10]:
baselines_df = pd.DataFrame(columns=['Baseline Model', 'F1 Score'])
baselines_df.loc[len(baselines_df)] = ['Baseline NB - CountVectorizer', round(score,4)]
baselines_df

results_df = pd.DataFrame(columns=['NB Model - Cleaning Steps', 'F1 Score'])
results_df.loc[len(results_df)] = ['Baseline NB', round(score,4)]

In [11]:
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text'].values)
X_test_vect = vectorizer.transform(X_test['text'].values)

In [12]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6445


In [13]:
baselines_df.loc[len(baselines_df)] = ['Baseline NB - TFIDF', round(score,4)]
baselines_df

Unnamed: 0,Baseline Model,F1 Score
0,Baseline NB - CountVectorizer,0.6753
1,Baseline NB - TFIDF,0.6445


### SVM

In [14]:
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text'].values)
X_test_vect = vectorizer.transform(X_test['text'].values)

In [15]:
svm = LinearSVC()
svm.fit(X_train_vect, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [16]:
svm_fitted_model, score = train_model(X_train_vect, y_train, svm)

F1 Score = 0.5985


In [17]:
baselines_df.loc[len(baselines_df)] = ['Baseline SVM - CV', round(score,4)]
baselines_df

Unnamed: 0,Baseline Model,F1 Score
0,Baseline NB - CountVectorizer,0.6753
1,Baseline NB - TFIDF,0.6445
2,Baseline SVM - CV,0.5985


In [18]:
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text'].values)
X_test_vect = vectorizer.transform(X_test['text'].values)

In [19]:
svm_fitted_model, score = train_model(X_train_vect, y_train, svm)

F1 Score = 0.6215


In [20]:
baselines_df.loc[len(baselines_df)] = ['Baseline SVM - TFIDF', round(score,4)]
baselines_df

Unnamed: 0,Baseline Model,F1 Score
0,Baseline NB - CountVectorizer,0.6753
1,Baseline NB - TFIDF,0.6445
2,Baseline SVM - CV,0.5985
3,Baseline SVM - TFIDF,0.6215


### Logistic Regression

In [21]:
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text'].values)
X_test_vect = vectorizer.transform(X_test['text'].values)

In [22]:
log_reg = LogisticRegression()
log_reg.fit(X_train_vect, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
LogReg_fitted_model, score = train_model(X_train_vect, y_train, log_reg)



F1 Score = 0.6284


In [24]:
baselines_df.loc[len(baselines_df)] = ['Baseline Log Reg - CV', round(score,4)]
baselines_df

Unnamed: 0,Baseline Model,F1 Score
0,Baseline NB - CountVectorizer,0.6753
1,Baseline NB - TFIDF,0.6445
2,Baseline SVM - CV,0.5985
3,Baseline SVM - TFIDF,0.6215
4,Baseline Log Reg - CV,0.6284


In [25]:
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text'].values)
X_test_vect = vectorizer.transform(X_test['text'].values)

In [26]:
LogReg_fitted_model, score = train_model(X_train_vect, y_train, log_reg)

F1 Score = 0.6412




In [27]:
baselines_df.loc[len(baselines_df)] = ['Baseline Log Reg - TFIDF', round(score,4)]
baselines_df

Unnamed: 0,Baseline Model,F1 Score
0,Baseline NB - CountVectorizer,0.6753
1,Baseline NB - TFIDF,0.6445
2,Baseline SVM - CV,0.5985
3,Baseline SVM - TFIDF,0.6215
4,Baseline Log Reg - CV,0.6284
5,Baseline Log Reg - TFIDF,0.6412


Naive Bayes seems to be the best model for this dataset. Surprisingly, Count Vectorizer performs better than TF-IDF in the baseline model. This could be due to the fact that since there is a character limit on tweets, users are more likely to condense their message to important words and possibly skip out on stopwords that appear frequently. Additionally, since these are emergency tweets, there will be some common frequent words like alarm, panic, accident. Although these words may appear frequently, we do not want them dicounted by TF-IDF.

## Data Cleaning

Now that we have an understanding of the baseline performance of these models, we will implement some cleaning steps to assess the impact on performance.

### Remove punctuation

In [28]:
#remove punctuation
X_train['text_cleaned'] = X_train['text'].str.replace(r'[^\w\s]+', '')
X_test['text_cleaned'] = X_test['text'].str.replace(r'[^\w\s]+', '')

In [29]:
X_train.head()

Unnamed: 0,text,target,id,text_cleaned
0,Our Deeds are the Reason of this #earthquake M...,1,1,Our Deeds are the Reason of this earthquake Ma...
1,Forest fire near La Ronge Sask. Canada,1,4,Forest fire near La Ronge Sask Canada
2,All residents asked to 'shelter in place' are ...,1,5,All residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",1,6,13000 people receive wildfires evacuation orde...
4,Just got sent this photo from Ruby #Alaska as ...,1,7,Just got sent this photo from Ruby Alaska as s...


In [30]:
#vectorize
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text_cleaned'].values)
X_test_vect = vectorizer.transform(X_test['text_cleaned'].values)

In [31]:
#apply model
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6708


In [33]:
results_df.loc[len(results_df)] = ['Punctuation Removed', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708


It seems like punctuation removal is negatively impacting the performance of our model, however through many rounds of trial and error, I've noticed that combining the data cleaning is better for the model. Next I will remove digits.

### Remove digits

In [34]:
#remove digits
X_train['text_cleaned'] = X_train['text_cleaned'].str.replace('\d+', '')
X_test['text_cleaned'] = X_test['text_cleaned'].str.replace('\d+', '')

In [35]:
#vectorize again
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text_cleaned'].values)
X_test_vect = vectorizer.transform(X_test['text_cleaned'].values)

In [36]:
#apply simple model again
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6716


In [38]:
results_df.loc[len(results_df)] = ['Punct. & Digits Removed', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708
2,Punct. & Digits Removed,0.6716


The model has slightly improved by removing digits from the text. In the case of tweets, removing digits is very important since many tweets contain locational and descriptive information that contain numbers. For example - "An accident has been report on I-90" or "13,000 have died in...". While important for a human, this information is meaningless for the model. Next I will remove stop words manually.

### Remove stop words

In [39]:
X_train["text_cleaned_no_SW"] = X_train["text_cleaned"].str.split()
X_test["text_cleaned_no_SW"] = X_test["text_cleaned"].str.split()

In [40]:
stop = stopwords.words('english')
X_train["text_cleaned_no_SW"] = X_train["text_cleaned_no_SW"].apply(lambda lst: [x for x in lst if x not in stop])
X_test["text_cleaned_no_SW"] = X_test["text_cleaned_no_SW"].apply(lambda lst: [x for x in lst if x not in stop])

In [41]:
X_train['text_cleaned_no_SW'] = X_train['text_cleaned_no_SW'].apply(' '.join)
X_test['text_cleaned_no_SW'] = X_test['text_cleaned_no_SW'].apply(' '.join)

In [42]:
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text_cleaned_no_SW'].values)
X_test_vect = vectorizer.transform(X_test['text_cleaned_no_SW'].values)

In [43]:
#apply simple model again
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6614


In [45]:
results_df.loc[len(results_df)] = ['Punct, Digits & SW Removed - manually', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708
2,Punct. & Digits Removed,0.6716
3,"Punct, Digits & SW Removed - manually",0.6614


Once again, the performance has decreased when removing stop words. We know that there must be some words in this document that are repeated frequently, however they might be different than the typical enlgish stopwords. I will assess the most common words in each class, that will give a good indication of what the stopwords could be in our case.

#### Identify stopwords for each class

In [46]:
#separate the classes into different dataframes for further assessment
X_train_0 = X_train[X_train['target'] == 0]
X_train_1 = X_train[X_train['target'] == 1]

In [47]:
#calculate word frequencies for each class
stopwords_0 = pd.Series(np.concatenate([x.split() for x in X_train_0['text_cleaned']])).value_counts()[:30].index.tolist()
stopwords_1 = pd.Series(np.concatenate([x.split() for x in X_train_1['text_cleaned']])).value_counts()[:30].index.tolist()

In [48]:
print(stopwords_0)

['the', 'a', 'to', 'I', 'and', 'of', 'in', 'you', 'is', 'for', 'my', 'on', 'with', 'that', 'it', 'The', 'be', 'like', 'this', 'me', 'by', 'have', 'at', 'was', 'your', 'are', 'Im', 'just', 'amp', 'so']


In [49]:
print(stopwords_1)

['the', 'in', 'of', 'a', 'to', 'and', 'on', 'for', 'is', 'I', 'at', 'The', 'by', 'from', 'A', 'that', 'with', 'was', 'it', 'are', 'after', 'as', 'have', 'fire', 'via', 'this', 'over', 'you', 'California', 'amp']


It seems that there is not much difference in the most frequent words that appear in both classes. Mostly all are stop words except for two words that appear in class 1: _via_ and _fire_. With this analysis, however, we are sure that there are not many frequent, meaningful words that will indicate the class of a tweet.

Based on the results of all my cleaning steps, it may seem like the basic cleaning steps do not apply to this dataset, however after a few Kaggle submissions I've realized that these steps improve my model, although incrementally. When it comes to NLP analysis of tweets, text cleaning does not yield significant results most likely because there do not exist many punctuation marks, stopwords and other language nuances in tweets. This is probably due to the fact that tweets are subject to a 140 character limit, therefore users condense their messages. Upon analysis of the tweets, I've noticed that users usually share content in their tweets, which introduces a link starting with _http_. My next data cleaning step will be to remove links.

#### Remove http

In [50]:
#convert string into list
X_train['text_cleaned_no_SW'] = X_train['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())
X_test['text_cleaned_no_SW'] = X_test['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())

In [51]:
#remove words starting with http
X_train['text_cleaned_no_SW'] = X_train['text_cleaned_no_SW'].apply(lambda lst: [x for x in lst if not x.startswith("http")])
X_test['text_cleaned_no_SW'] = X_test['text_cleaned_no_SW'].apply(lambda lst: [x for x in lst if not x.startswith("http")])

In [52]:
X_train['text_cleaned_no_SW'] = X_train['text_cleaned_no_SW'].apply(' '.join).str.lower()
X_test['text_cleaned_no_SW'] = X_test['text_cleaned_no_SW'].apply(' '.join).str.lower()

In [53]:
#vectorize
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text_cleaned_no_SW'].values)
X_test_vect = vectorizer.transform(X_test['text_cleaned_no_SW'].values)

In [54]:
#apply simple model 
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
nb_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6675


In [56]:
results_df.loc[len(results_df)] = ['Punct, Digits, SW & Links Removed', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708
2,Punct. & Digits Removed,0.6716
3,"Punct, Digits & SW Removed - manually",0.6614
4,"Punct, Digits, SW & Links Removed",0.6675


Removing links has slightly increased the performance of the model. As mentioned earlier, although these steps do not make much difference in the train set, these steps are very impactful in the test set. When working with the train set, it is important to note that case-specific data cleaning steps, such as removing words starting with 'http', generally will improve the model although the score may not indicate it.

#### Apply stemming - Porter and Snowball Stemmer

##### Snowball

In [57]:
#convert string into list
X_train['snowball_stemmed'] = X_train['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())
X_test['snowball_stemmed'] = X_test['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())

In [58]:
stemmer = SnowballStemmer("english")
X_train['snowball_stemmed'] = X_train['snowball_stemmed'].apply(lambda lst: [stemmer.stem(x) for x in lst])
X_test['snowball_stemmed'] = X_test['snowball_stemmed'].apply(lambda lst: [stemmer.stem(x) for x in lst])

In [59]:
X_train['snowball_stemmed'] = X_train['snowball_stemmed'].apply(' '.join)
X_test['snowball_stemmed'] = X_test['snowball_stemmed'].apply(' '.join)

In [60]:
X_train.head()

Unnamed: 0,text,target,id,text_cleaned,text_cleaned_no_SW,snowball_stemmed
0,Our Deeds are the Reason of this #earthquake M...,1,1,Our Deeds are the Reason of this earthquake Ma...,our deeds reason earthquake may allah forgive u,our deed reason earthquak may allah forgiv
1,Forest fire near La Ronge Sask. Canada,1,4,Forest fire near La Ronge Sask Canada,forest fire near la ronge sask canad,forest fire near la rong sask cana
2,All residents asked to 'shelter in place' are ...,1,5,All residents asked to shelter in place are be...,all residents asked shelter place notified off...,all resid ask shelter place notifi offic no ev...
3,"13,000 people receive #wildfires evacuation or...",1,6,people receive wildfires evacuation orders in...,people receive wildfires evacuation orders cal...,peopl receiv wildfir evacu order californ
4,Just got sent this photo from Ruby #Alaska as ...,1,7,Just got sent this photo from Ruby Alaska as s...,just got sent photo ruby alaska smoke wildfire...,just got sent photo rubi alaska smoke wildfir ...


In [61]:
#vectorize
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['snowball_stemmed'].values)
X_test_vect = vectorizer.transform(X_test['snowball_stemmed'].values)

In [62]:
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [63]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6635


In [64]:
results_df.loc[len(results_df)] = ['Punct, Digits, SW & Links Removed - Snowball', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708
2,Punct. & Digits Removed,0.6716
3,"Punct, Digits & SW Removed - manually",0.6614
4,"Punct, Digits, SW & Links Removed",0.6675
5,"Punct, Digits, SW & Links Removed - Snowball",0.6635


##### Porter

In [65]:
#convert string into list
X_train['porter_stemmed'] = X_train['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())
X_test['porter_stemmed'] = X_test['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())

In [66]:
stemmer = PorterStemmer()
X_train['porter_stemmed'] = X_train['porter_stemmed'].apply(lambda lst: [stemmer.stem(x) for x in lst])
X_test['porter_stemmed'] = X_test['porter_stemmed'].apply(lambda lst: [stemmer.stem(x) for x in lst])

In [67]:
X_train['porter_stemmed'] = X_train['porter_stemmed'].apply(' '.join)
X_test['porter_stemmed'] = X_test['porter_stemmed'].apply(' '.join)

In [68]:
#vectorize
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['porter_stemmed'].values)
X_test_vect = vectorizer.transform(X_test['porter_stemmed'].values)

In [69]:
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [70]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6648


In [71]:
results_df.loc[len(results_df)] = ['Punct, Digits, SW & Links Removed - Porter', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708
2,Punct. & Digits Removed,0.6716
3,"Punct, Digits & SW Removed - manually",0.6614
4,"Punct, Digits, SW & Links Removed",0.6675
5,"Punct, Digits, SW & Links Removed - Snowball",0.6635
6,"Punct, Digits, SW & Links Removed - Porter",0.6648


It seems that both forms of stemming decrease the performance of our model slightly. Let's see if lemmatization works better

### Lemmatization

In [72]:
lemmatizer = WordNetLemmatizer()
X_train['text_Lemmatized'] = X_train['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())
X_test['text_Lemmatized'] = X_test['text_cleaned_no_SW'].apply(lambda x: x[0:-1].split())

In [73]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [74]:
X_train['text_Lemmatized'] = X_train['text_Lemmatized'].apply(lambda lst: [lemmatizer.lemmatize(x, get_wordnet_pos(x)) for x in lst])
X_test['text_Lemmatized'] = X_test['text_Lemmatized'].apply(lambda lst: [lemmatizer.lemmatize(x, get_wordnet_pos(x)) for x in lst])

In [75]:
X_train['text_Lemmatized'] = X_train['text_Lemmatized'].apply(' '.join)
X_test['text_Lemmatized'] = X_test['text_Lemmatized'].apply(' '.join)

In [76]:
#vectorize
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train['text_Lemmatized'].values)
X_test_vect = vectorizer.transform(X_test['text_Lemmatized'].values)

In [77]:
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [78]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6673


In [79]:
results_df.loc[len(results_df)] = ['Punct, Digits, SW & Links Removed - Lemmatized', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708
2,Punct. & Digits Removed,0.6716
3,"Punct, Digits & SW Removed - manually",0.6614
4,"Punct, Digits, SW & Links Removed",0.6675
5,"Punct, Digits, SW & Links Removed - Snowball",0.6635
6,"Punct, Digits, SW & Links Removed - Porter",0.6648
7,"Punct, Digits, SW & Links Removed - Lemmatized",0.6673


Lemmatization seems to be a better approach than stemming, most likely because with lemmatization, the meaning of the word isn't lost. For example, in the snowball stemmer, the word "residents" turned into "resid" which has no meanind the English language. With lemmatization, on the other hand, "residents" would turn into "resident", which has the same meaning and it is more easily recognized by the model. Next I will apply bigrams to the model.

#### Apply ngrams - create a column of bigrams for assessment, apply ngram_range parameter in vectorizer

In [80]:
# #convert string into list
# X_train['bigrams'] = X_train['text_cleaned'].apply(lambda x: x[0:-1].split())
# X_test['bigrams'] = X_test['text_cleaned'].apply(lambda x: x[0:-1].split())

In [81]:
# #create a new column for bigrams
# X_train['bigrams'] = X_train['bigrams'].apply(lambda row: list(ngrams(row, 2)))
# X_test['bigrams'] = X_test['bigrams'].apply(lambda row: list(ngrams(row, 2)))

In [82]:
# X_train['bigrams'] = X_train['bigrams'].apply(', '.join)
# X_test['bigrams'] = X_test['bigrams'].apply(', '.join)

In [83]:
#vectorize
vectorizer = CountVectorizer(ngram_range = (1,2))
X_train_vect = vectorizer.fit_transform(X_train['text_cleaned_no_SW'].values)
X_test_vect = vectorizer.transform(X_test['text_cleaned_no_SW'].values)

In [84]:
NB_model = MultinomialNB()
NB_model.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
NB_fitted_model, score = train_model(X_train_vect, y_train, NB_model)

F1 Score = 0.6653


In [86]:
results_df.loc[len(results_df)] = ['Punct, Digits, Links, SW Removed - Bigrams', round(score,4)]
results_df

Unnamed: 0,NB Model - Cleaning Steps,F1 Score
0,Baseline NB,0.6753
1,Punctuation Removed,0.6708
2,Punct. & Digits Removed,0.6716
3,"Punct, Digits & SW Removed - manually",0.6614
4,"Punct, Digits, SW & Links Removed",0.6675
5,"Punct, Digits, SW & Links Removed - Snowball",0.6635
6,"Punct, Digits, SW & Links Removed - Porter",0.6648
7,"Punct, Digits, SW & Links Removed - Lemmatized",0.6673
8,"Punct, Digits, Links, SW Removed - Bigrams",0.6653


Once again,  we see that bigrams has slightly lowered the performance, however this step proved to be helpful when applied to the test set. 

## Conclusion

After multiple rounds of trial and error, I achieved my highest score on Kaggle by applying the following data cleaning steps:
1. Remove punctuation
2. Remove digits
3. Remove stopwords
4. Remove links
5. Use bigrams

To summarize, one thing that was important to keep in mind during the assignment is that the results in the train set can be misleading, therefore it is important to make submissions to Kaggle to get an idea of the "right path" for this dataset. Another important lesson is that when dealing with unordinary textual data, such as tweets, additional cleaning steps beyond basic cleaning is required for the model to handle the data. 

## Username and score

- Username: nmatta72
- Score: 0.80674

## BERT - Attempt

I've tried implementing a BERT model, however this proved to be quite challenging in terms of installations and inconsistencies with my version of Python. Below is my code to initialize a BERT model

In [87]:
# conda install tensorflow
# pip install --upgrade tensorflow-hub
# conda install pytorch-cpu torchvision-cpu -c pytorch
# pip install pytorch_pretrained_bert
# pip install keras
# conda install -c pytorch torchvision cudatoolkit=10.1 pytorch

In [88]:
# import tensorflow as tf
# from tensorflow.keras.layers import Dense, Input
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.models import Model
# from tensorflow.keras.callbacks import ModelCheckpoint
# import tensorflow_hub as hub

In [89]:
# import sys
# import numpy as np
# import random as rn
# import torch
# from pytorch_pretrained_bert import BertModel
# from torch import nn
# from pytorch_pretrained_bert import BertTokenizer
# from keras.preprocessing.sequence import pad_sequences
# from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from torch.optim import Adam
# from torch.nn.utils import clip_grad_norm_
# from IPython.display import clear_output

In [90]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], X_train['text_cleaned']))
# test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], X_test['text_cleaned']))

In [91]:
# train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
# test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))

In [92]:
# train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
# test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")

In [93]:
# y_train = y_train

In [94]:
# train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
# test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

In [95]:
# bert = BertModel.from_pretrained('bert-base-uncased')

In [96]:
# class BertBinaryClassifier(nn.Module):
#     def __init__(self, dropout=0.1):
#         super(BertBinaryClassifier, self).__init__()

#         self.bert = BertModel.from_pretrained('bert-base-uncased')

#         self.dropout = nn.Dropout(dropout)
#         self.linear = nn.Linear(768, 1)
#         self.sigmoid = nn.Sigmoid()
    
#     def forward(self, tokens, masks=None):
#         _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
#         dropout_output = self.dropout(pooled_output)
#         linear_output = self.linear(dropout_output)
#         proba = self.sigmoid(linear_output)
#         return proba

In [97]:
# device = torch.device("cpu")

In [98]:
# bert_clf = BertBinaryClassifier()
# bert_clf = bert_clf.cpu()

In [99]:
# train_tokens_tensor = torch.tensor(train_tokens_ids)
# train_y_tensor = torch.tensor(y_train.reshape(-1, 1)).float()

# test_tokens_tensor = torch.tensor(test_tokens_ids)
# # test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

# train_masks_tensor = torch.tensor(train_masks)
# test_masks_tensor = torch.tensor(test_masks)

In [100]:
# train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
# train_sampler = RandomSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=4)

In [101]:
# param_optimizer = list(bert_clf.sigmoid.named_parameters()) 
# optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [102]:
# optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [103]:
# for epoch_num in range(10):
#     bert_clf.train()
#     train_loss = 0
#     for step_num, batch_data in enumerate(train_dataloader):
#         token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
#         print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
#         logits = bert_clf(token_ids, masks)
        
#         loss_func = nn.BCELoss()

#         batch_loss = loss_func(logits, labels)
#         train_loss += batch_loss.item()
        
        
#         bert_clf.zero_grad()
#         batch_loss.backward()
        

#         clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
#         optimizer.step()
        
#         clear_output(wait=True)
#         print('Epoch: ', epoch_num + 1)
#         print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

In [104]:
# bert_clf = BertBinaryClassifier()
# optimizer = Adam(bert_clf.parameters(), lr=3e-6)
# bert_clf.train()
# for epoch_num in range(10):
#     for step_num, batch_data in enumerate(train_dataloader):
#         token_ids, labels = tuple(t for t in batch_data)
#         probas = bert_clf(token_ids)
#         loss_func = nn.BCELoss()
#         batch_loss = loss_func(probas, labels)
#         bert_clf.zero_grad()
#         batch_loss.backward()
#         optimizer.step()

In [105]:
# def build_model(bert_layer, max_len=512):
#     input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
#     segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

#     _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
#     clf_output = sequence_output[:, 0, :]
#     out = Dense(1, activation='sigmoid')(clf_output)
    
#     model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
#     model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
#     return model

In [106]:
# %%time
# module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# bert_layer = hub.KerasLayer(module_url, trainable=True)

In [107]:
# vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)