# Beginning Work on Twitter Disaster Kaggle Competition

In [1]:
import nltk
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB## for word embedding
import gensim
import gensim.downloader as gensim_api
## for deep learning
# from tensorflow.keras import models, layers, preprocessing as kprocessing
# from tensorflow.keras import backend as K
# ## for bert language model
# import transformers
import warnings
warnings.filterwarnings("ignore")

We start by loading the competition data. Further below we will define a function that takes in a column with raw text data and returns a cleaned version of that column.

In [2]:
# Forward slashes resolve on Windows, Linux and macOS alike, whereas the
# previous "..\\data\\..." form only worked on Windows.
# NOTE(review): the names look swapped relative to the files - `train` is read
# from test.csv and `test` from train.csv. Later cells rely on this binding
# (they read the `target` column from `test`), so it is kept unchanged here;
# renaming needs a notebook-wide change.
train = pd.read_csv("../data/test.csv")
test = pd.read_csv("../data/train.csv")

In [3]:
# Preview the first rows of the frame loaded above.
train.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
# Column dtypes and non-null counts, to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


Notice that the dataframe has 3263 rows, but `keyword` and `location` have only 3237 and 2158 non-null values respectively, so both columns contain missing values.

In [5]:
# Distinct keyword values; notice NaN shows up in the result, i.e. some rows have no keyword.
train.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [6]:
# Distinct location values; again NaN shows up in the result, i.e. some rows have no location.
train.location.unique()

array([nan, 'London', "Niall's place | SAF 12 SQUAD |", ...,
       'Acey mountain islanddåÇTorontoåÈ', 'los angeles',
       'Brussels, Belgium'], dtype=object)

Eventually we will need to figure out what to do with these missing values, but for now let's focus on cleaning up the text!

In [7]:
# Distinct ids. NOTE(review): unique() on its own does not show that ids never
# repeat - confirm with `train.id.is_unique` before relying on that.
train.id.unique()

array([    0,     2,     3, ..., 10868, 10874, 10875], dtype=int64)

In [8]:
# Column labels currently present in the frame.
train.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

Found a list of contractions in a Stack Overflow post. This dictionary will be used to convert contractions to their expanded forms.

In [9]:
# Lookup table: lower-case English contraction -> expanded form.
# Source: http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = {
    "ain't":       "am not",
    "aren't":      "are not",
    "can't":       "cannot",
    "can't've":    "cannot have",
    "'cause":      "because",
    "could've":    "could have",
    "couldn't":    "could not",
    "couldn't've": "could not have",
    "didn't":      "did not",
    "doesn't":     "does not",
    "don't":       "do not",
    "hadn't":      "had not",
    "hadn't've":   "had not have",
    "hasn't":      "has not",
    "haven't":     "have not",
    "he'd":        "he would",
    "he'd've":     "he would have",
    "he'll":       "he will",
    "he's":        "he is",
    "how'd":       "how did",
    "how'll":      "how will",
    "how's":       "how is",
    "i'd":         "i would",
    "i'll":        "i will",
    "i'm":         "i am",
    "i've":        "i have",
    "isn't":       "is not",
    "it'd":        "it would",
    "it'll":       "it will",
    "it's":        "it is",
    "let's":       "let us",
    "ma'am":       "madam",
    "mayn't":      "may not",
    "might've":    "might have",
    "mightn't":    "might not",
    "must've":     "must have",
    "mustn't":     "must not",
    "needn't":     "need not",
    "oughtn't":    "ought not",
    "shan't":      "shall not",
    "sha'n't":     "shall not",
    "she'd":       "she would",
    "she'll":      "she will",
    "she's":       "she is",
    "should've":   "should have",
    "shouldn't":   "should not",
    "that'd":      "that would",
    "that's":      "that is",
    "there'd":     "there had",
    "there's":     "there is",
    "they'd":      "they would",
    "they'll":     "they will",
    "they're":     "they are",
    "they've":     "they have",
    "wasn't":      "was not",
    "we'd":        "we would",
    "we'll":       "we will",
    "we're":       "we are",
    "we've":       "we have",
    "weren't":     "were not",
    "what'll":     "what will",
    "what're":     "what are",
    "what's":      "what is",
    "what've":     "what have",
    "where'd":     "where did",
    "where's":     "where is",
    "who'll":      "who will",
    "who's":       "who is",
    "won't":       "will not",
    "wouldn't":    "would not",
    "you'd":       "you would",
    "you'll":      "you will",
    "you're":      "you are",
}

# This function will be used to do 4 things at once:
 1) It will turn any contractions that appear above into their respective expanded forms, i.e. "won't" will become "will not"

2) It will use a regular expression to remove any unwanted characters from our text. In this case unwanted characters are punctuation and any special characters such as '@'

3) It will remove any stop words that appear in our text. Here the stop words were pulled from a pre-defined list of stopwords from NLTK

4) Lastly it will tokenize our text. Tokenization is the process of taking a body of text and converting it into lists of strings.

In [10]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''Return the NLTK English stopword set, reading the corpus only on first use.'''
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = set(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_text(text, remove_stopwords = True):
    '''Lower-case, expand contractions, strip links/markup/punctuation and tokenize.

    Parameters
    ----------
    text : str
        Raw text of one tweet. Any non-string value (for example a NaN from a
        missing cell) yields an empty token list instead of raising.
    remove_stopwords : bool, default True
        When True, words found in the NLTK English stopword list are dropped.

    Returns
    -------
    list of str
        Tokens produced by nltk.WordPunctTokenizer from the cleaned text.
    '''
    if not isinstance(text, str):
        return []

    # Convert words to lower case; map the typographic apostrophe to the plain
    # one so that the keys of `contractions` can match.
    text = text.lower().replace("\u2019", "'")

    # Replace contractions with their longer forms
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Format words and remove unwanted characters.
    # Only the link itself is removed: the earlier pattern `https?:\/\/.*`
    # also swallowed every word that followed a link on the same line.
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'\<a href', ' ', text)
    # Must run before the punctuation pass, which strips the "/" this pattern needs.
    text = re.sub(r'<br />', ' ', text)
    # Replace with a space so the words on either side are not glued together.
    text = re.sub(r'&amp;', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    # Tokenize each word
    return nltk.WordPunctTokenizer().tokenize(text)

In [11]:
# Run the cleaner over every tweet; each cell of the new column holds that tweet's token list.
train['text_cleaned'] = [clean_text(tweet) for tweet in train.text]
test['text_cleaned'] = [clean_text(tweet) for tweet in test.text]

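# Lemmatizing is the process of taking a word like swimming and converting it to its dictionary root word. So swimming becomes swim, eating becomes eat, and swam becomes swim. Note that the lemmatizer call below does not pass a part of speech, and in the output further down verb forms such as "happened" and "fleeing" are left unchanged while plural nouns such as "cities" become "city". For more information click here: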
https://towardsdatascience.com/lemmatization-in-natural-language-processing-nlp-and-machine-learning-a4416f69a7b6

In [12]:
def lemmatized_words(df):
    '''Add a `lemmatized_text` column built from the token lists in `text_cleaned`.

    The frame is modified in place and nothing is returned. Each token is
    passed to WordNetLemmatizer.lemmatize on its own, without a part-of-speech
    argument, so the lemmatizer's default part of speech applies.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `text_cleaned` column whose cells are lists of tokens.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    df['lemmatized_text'] = [
        [lemmatizer.lemmatize(token) for token in tokens]
        for tokens in df.text_cleaned
    ]



In [13]:
# Lemmatize both frames in place (adds the `lemmatized_text` column to each).
for frame in (train, test):
    lemmatized_words(frame)

In [14]:
# Join each lemmatized token list back into one space-separated string.
train["text_str"] = train["lemmatized_text"].map(" ".join)
test["text_str"] = test["lemmatized_text"].map(" ".join)

In [15]:
# Notice our dataframe now has three new columns: text_cleaned, lemmatized_text and text_str.
train.head()

Unnamed: 0,id,keyword,location,text,text_cleaned,lemmatized_text,text_str
0,0,,,Just happened a terrible car crash,"[happened, terrible, car, crash]","[happened, terrible, car, crash]",happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","[heard, earthquake, different, cities, stay, s...","[heard, earthquake, different, city, stay, saf...",heard earthquake different city stay safe ever...
2,3,,,"there is a forest fire at spot pond, geese are...","[forest, fire, spot, pond, geese, fleeing, acr...","[forest, fire, spot, pond, goose, fleeing, acr...",forest fire spot pond goose fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfire]",apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kills, 28, china, taiwan]","[typhoon, soudelor, kill, 28, china, taiwan]",typhoon soudelor kill 28 china taiwan


In [16]:
# Rows whose text contains the substring. The earlier `str.find(...) > 0` test
# skipped tweets that start with the word: find() reports index 0 there and
# uses -1 (not 0) to signal "absent". `na=False` keeps missing text out of the mask.
# We will use these explicit words to add an additional column to our dataframe.
train.loc[train.text.str.contains('fuck', regex=False, na=False)]

Unnamed: 0,id,keyword,location,text,text_cleaned,lemmatized_text,text_str
231,750,avalanche,,if this fucking is true i will be decapitated ...,"[fucking, true, decapitated, throw, head, aval...","[fucking, true, decapitated, throw, head, aval...",fucking true decapitated throw head avalanche
290,937,blaze,"Cleveland, OH",I never got paid to give a fuck..we might as w...,"[never, got, paid, give, fuck, might, well, bl...","[never, got, paid, give, fuck, might, well, bl...",never got paid give fuck might well blaze anot...
344,1115,blew%20up,"Coos Bay, OR",We were fucking around on Google maps at work ...,"[fucking, around, google, maps, work, pulled, ...","[fucking, around, google, map, work, pulled, b...",fucking around google map work pulled boise bl...
353,1140,blight,,@colemcfadyean fuck off cole ??????,"[colemcfadyean, fuck, cole]","[colemcfadyean, fuck, cole]",colemcfadyean fuck cole
377,1210,blizzard,That place,If blizzard did another 12 month sub thing and...,"[blizzard, another, 12, month, sub, thing, gav...","[blizzard, another, 12, month, sub, thing, gav...",blizzard another 12 month sub thing gave next ...
411,1330,blown%20up,Scout Team,If you bored as shit don't nobody fuck wit you...,"[bored, shit, nobody, fuck, wit, busy, yo, shi...","[bored, shit, nobody, fuck, wit, busy, yo, shi...",bored shit nobody fuck wit busy yo shit get bl...
414,1339,blown%20up,somewhere or other,I don't understand how White Iverson by Post M...,"[understand, white, iverson, post, malone, blo...","[understand, white, iverson, post, malone, blo...",understand white iverson post malone blown fuc...
481,1564,bomb,shanghai,Bruh this sex on the beach??is bomb as fuck yo,"[bruh, sex, beach, bomb, fuck, yo]","[bruh, sex, beach, bomb, fuck, yo]",bruh sex beach bomb fuck yo
584,1902,burning,daily ? 18 ? ?,RT: A real burn book entry about CA: why the f...,"[rt, real, burn, book, entry, ca, fuck, place,...","[rt, real, burn, book, entry, ca, fuck, place,...",rt real burn book entry ca fuck place always b...
666,2168,catastrophic,,If a å£1 rise in wages is going to have such a...,"[å, £, 1, rise, wages, going, catastrophic, im...","[å, £, 1, rise, wage, going, catastrophic, imp...",å £ 1 rise wage going catastrophic impact wage...


# This useful package will be used to check the probability of profanity in our text

In [None]:
#from profanity_check import predict, predict_prob

In [None]:
#predict_prob(['go to hell, you scum'])

## That was some basic cleaning, lemmatizing, and tokenizing: Now let us try to prepare our model

In [None]:
# Create a Vectorizer Object
vectorizer = CountVectorizer()

# Learn the vocabulary and encode the documents in one pass
X_varied = vectorizer.fit_transform(test.text_str)

# Summarize the vocabulary instead of printing every entry: the full mapping
# has one item per distinct word and floods the cell output.
vocabulary = vectorizer.vocabulary_
print("Vocabulary size: ", len(vocabulary))
print("Sample of vocabulary: ", dict(list(vocabulary.items())[:10]))

# Summarizing the Encoded Texts. Only a small slice is densified; calling
# toarray() on the whole sparse matrix allocates documents x vocabulary cells.
print("Encoded Document shape:", X_varied.shape)
print("First 5 encoded rows:")
print(X_varied[:5].toarray())

In [None]:
# Gold labels as a 1-D Series. The earlier `test[['target']]` passed a
# one-column DataFrame, whereas the classifiers below are documented to take
# y of shape (n_samples,).
y_varied = test['target']

# logistic regression
logit_clf = LogisticRegression()
logit_clf.fit(X_varied, y_varied)
y_logit = logit_clf.predict(X_varied)

# svm
svm_clf = SVC()
svm_clf.fit(X_varied, y_varied)
y_svm = svm_clf.predict(X_varied)


# scoring (naive, not cross-validated): measured on the very rows the models
# were fitted on, so these numbers are optimistic
print(f"Logit score: {logit_clf.score(X_varied, y_varied):.3f}")
print(f"SVM score:   {svm_clf.score(X_varied, y_varied):.3f}")

# scoring (with cross validation)
print(f"Logit score, cross validated: {np.mean(cross_val_score(logit_clf, X_varied, y_varied)):.3f}")
print(f"SVM score, cross validated:   {np.mean(cross_val_score(svm_clf, X_varied, y_varied)):.3f}")

In [None]:
# for n in range(1,10):
#     #knn
#     print(f"Number of clusters is {n}")
#     knn_clf = KNeighborsClassifier(n_neighbors=n) # instantiate
#     knn_clf.fit(X_varied, y_varied)               # fit
#     y_knn = knn_clf.predict(X_varied)             # predict
#
#     #scoring (naive, not cross-validated)
#     print(f"kNN score:   {knn_clf.score(X_varied, y_varied):.3f}")
#
#     #scoring (with cross validation)
#     print(f"kNN score, cross validated:   {np.mean(cross_val_score(knn_clf, X_varied, y_varied)):.3f}\n")

# Now that we have used a count vectorizer to transform our corpus, let us try a TF-IDF vectorizer

In [None]:
vectorizer = TfidfVectorizer()

X_varied = vectorizer.fit_transform(test.text_str)

# Keep and show a sample of the learned terms: as a bare mid-cell expression
# the result of get_feature_names_out() was computed and then thrown away.
feature_names = vectorizer.get_feature_names_out()
print(feature_names[:10])

print(X_varied.shape)

In [None]:
# Gold labels as a 1-D Series. The earlier `test[['target']]` passed a
# one-column DataFrame, whereas the classifiers below are documented to take
# y of shape (n_samples,).
y_varied = test['target']

# logistic regression
logit_clf = LogisticRegression()
logit_clf.fit(X_varied, y_varied)
y_logit = logit_clf.predict(X_varied)

# svm
svm_clf = SVC()
svm_clf.fit(X_varied, y_varied)
y_svm = svm_clf.predict(X_varied)

# scoring (naive, not cross-validated): measured on the very rows the models
# were fitted on, so these numbers are optimistic
print(f"Logit score: {logit_clf.score(X_varied, y_varied):.3f}")
print(f"SVM score:   {svm_clf.score(X_varied, y_varied):.3f}")

# scoring (with cross validation)
print(f"Logit score, cross validated: {np.mean(cross_val_score(logit_clf, X_varied, y_varied)):.3f}")
print(f"SVM score, cross validated:   {np.mean(cross_val_score(svm_clf, X_varied, y_varied)):.3f}\n")



In [None]:
# for n in range(7, 15): # it seens like 11 is the best score for our model
#     #knn
#     print(f"Number of clusters is {n}")
#     knn_clf = KNeighborsClassifier(n_neighbors=n)  # instantiate
#     knn_clf.fit(X_varied, y_varied)  # fit
#     y_knn = knn_clf.predict(X_varied)  # predict
#
#     #scoring (naive, not cross-validated)
#     print(f"kNN score:   {knn_clf.score(X_varied, y_varied):.3f}")
#
#     #scoring (with cross validation)
#     print(f"kNN score, cross validated:   {np.mean(cross_val_score(knn_clf, X_varied, y_varied)):.3f}\n")

In [None]:
# Classifiers to test
classifiers = {
    'kNN': KNeighborsClassifier(n_neighbors = 11),
    'Logit':LogisticRegression(),
    # Fixed seed so that re-running the notebook reproduces the tree's scores.
    'Tree': DecisionTreeClassifier(random_state=0),
    'Bayes': MultinomialNB()
}

# Store cross-validation results in a dictionary keyed by classifier name
scores1 = {
    name: cross_validate( # perform cross-validation
        clf, # classifier object
        X_varied, # feature matrix
        y_varied, # gold labels
        cv=10, #number of folds
        scoring=['accuracy', 'recall', 'f1'] # scoring methods
    )
    for name, clf in classifiers.items()
}

# Let's try the same approach but without removing stopwords

In [None]:
# Vectorized lower-casing; unlike apply(lambda x: x.lower()) it leaves a
# missing value as NaN instead of raising on it.
train['text_lower'] = train.text.str.lower()
test['text_lower'] = test.text.str.lower()
test.columns

In [None]:
# Build the vectorizer in this cell rather than re-fitting whichever object an
# earlier cell happened to leave in `vectorizer`; in top-to-bottom order that
# was the TfidfVectorizer, so the same kind is created here.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(test['text_lower'])
# 1-D labels: the classifiers below are documented to take y of shape (n_samples,).
y = test["target"]
# NOTE(review): the vectorizer is fitted on all rows before cross-validation,
# so each validation fold also contributes to the vocabulary/IDF weights -
# consider moving it into a Pipeline.

# Classifiers to test
classifiers = {
    'kNN': KNeighborsClassifier(n_neighbors = 11),
    'Logit':LogisticRegression(),
    # Fixed seed so that re-running the notebook reproduces the tree's scores.
    'Tree': DecisionTreeClassifier(random_state=0),
    'Bayes': MultinomialNB()
}

# Store cross-validation results in a dictionary keyed by classifier name
scores2 = {
    name: cross_validate( # perform cross-validation
        clf, # classifier object
        X, # feature matrix
        y, # gold labels
        cv=10, #number of folds
        scoring=['accuracy', 'recall', 'f1'] # scoring methods
    )
    for name, clf in classifiers.items()
}

In [None]:
# Examine the performance of our simple classifiers

def compare_scores(scores_dict):
    '''
    Takes a dictionary of cross_validate scores.
    Returns a color-coded Pandas dataframe that summarizes those scores.
    '''
    df = pd.DataFrame(scores_dict).T.applymap(np.mean).style.background_gradient(cmap='RdYlGn')
    return df
# Compare cross-validation scores

In [None]:
# Mean cross-validation metrics for the runs collected in scores1.
compare_scores(scores1)

In [None]:
# Mean cross-validation metrics for the runs collected in scores2.
compare_scores(scores2)

# Time for a different approach: Word Embeddings

In [None]:
#nlp = gensim_api.load("word2vec-google-news-300")

corpus = train["text"]

## create list of lists of unigrams
# One whitespace-split word list per document.
lst_corpus = [document.split() for document in corpus]

## detect bigrams and trigrams
# gensim 4 takes the delimiter as str while gensim 3.x takes bytes; choose the
# form the installed version expects so the cell runs on either.
phrase_delimiter = " " if int(gensim.__version__.split(".")[0]) >= 4 else " ".encode()

bigrams_detector = gensim.models.phrases.Phrases(lst_corpus,
                                                 delimiter=phrase_delimiter, min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
# The trigram model is trained on the corpus after bigrams have been merged.
trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[lst_corpus],
                                                  delimiter=phrase_delimiter, min_count=5, threshold=10)
trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)