<a href="https://colab.research.google.com/github/micheusch/sagemaker/blob/main/quick_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#hide
# # import warnings
# warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np


In [None]:
from google.colab import drive
# Mount Google Drive so the CSV below can be read from it.
drive.mount('/content/drive')
# NOTE(review): the CSV path is relative, so it presumably relies on the
# working directory being /content (the mount point's parent) -- confirm.
data = pd.read_csv('drive/MyDrive/Colab Notebooks/data/disaster_tweets.csv')
# Preview the first rows of the raw frame.
data.head()

Mounted at /content/drive


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [None]:
# Number of distinct values in each column.
data.nunique()

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

#### 1. Data prep

In [None]:
import re
import spacy

def remove_at_hash(sent):
    """Lower-case ``sent`` and delete every '@' and '#' character from it."""
    lowered = sent.lower()
    return re.sub(r'@|#', r'', lowered)

def remove_sites(sent):
    """Return ``sent`` lower-cased with URL tokens removed.

    A URL token is 'http' followed by any run of non-whitespace characters.
    Only that token is deleted; the words after it are kept. (The previous
    pattern ``http.*`` was greedy and discarded everything from the first URL
    up to the end of the line.)
    """
    return re.sub(r'http\S*', r'', sent.lower())

def remove_punct(sent):
    """Return the lower-cased word tokens of ``sent`` joined by single spaces.

    A token is a maximal run of regex "word" characters (letters, digits,
    underscore); everything else acts as a separator and is dropped.
    NOTE(review): for ``str`` patterns Python's word class is Unicode-aware
    unless ``re.ASCII`` is passed, so non-English letters are presumably kept
    as well -- confirm this is intended.
    """
    words = re.findall(r'\w+', sent.lower())
    return ' '.join(words)

def spacy_cleaning(doc):
    """Return ``doc`` lower-cased, lemmatised and stripped of stop words.

    The text is run through the module-level spaCy pipeline ``nlp``; the
    lemma of every token not flagged as a stop word is kept, and the lemmas
    are joined with single spaces.
    """
    kept_lemmas = []
    for token in nlp(doc.lower()):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

In [None]:
# Load the English pipeline by its full package name: the bare 'en' shortcut
# is no longer resolvable in spaCy v3+. NER and the parser are disabled
# because spacy_cleaning only reads token.lemma_ and token.is_stop.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# text_simple: regex-only cleaning (strip @/#, then URLs, then punctuation).
data['text_simple'] = data['text'].apply(lambda x: remove_punct(remove_sites(remove_at_hash(x))))
# text_spacy: lemmatised text with stop words removed.
data['text_spacy'] = data['text'].apply(spacy_cleaning)

#### n-grams

In [None]:
from gensim.models.phrases import Phrases, Phraser

def _trigram_join(token_lists, min_count=30):
    """Merge frequent bigrams, then trigrams, and return space-joined texts.

    Parameters
    ----------
    token_lists : list of list of str
        One token list per document.
    min_count : int
        Passed to both ``Phrases`` models as their ``min_count``.

    Returns
    -------
    list of str
        One string per input document, tokens joined by single spaces after
        the bigram pass and then the trigram pass have been applied.
    """
    bigram = Phraser(Phrases(token_lists, min_count=min_count))
    bigram_tokens = bigram[token_lists]
    # Second pass over the bigram-merged corpus yields trigrams.
    trigram = Phraser(Phrases(bigram_tokens, min_count=min_count))
    return [' '.join(tokens) for tokens in trigram[bigram_tokens]]


# The same phrase pipeline is applied to the raw text and to both cleaned
# variants; each source column gets its own freshly fitted phrase models.
for source_col, target_col in (
    ('text', 'text_trigram'),
    ('text_simple', 'text_trigram_simple'),
    ('text_spacy', 'text_trigram_spacy'),
):
    # Raw-string pattern: '\s' in a plain literal is an invalid escape sequence.
    tokenized = [re.split(r'\s+', tweet) for tweet in data[source_col]]
    data[target_col] = _trigram_join(tokenized)

#### train/test split

In [None]:
from sklearn.model_selection import train_test_split
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state pins the shuffle.
train, valid = train_test_split(data, random_state=24)

# Positional 0..n-1 index on both splits; later cells address rows with
# .loc[row_number, ...].
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

# Persist text/label CSVs; they are re-read by load_dataset in the
# transformers section.
train[['text','target']].rename(columns={'target':'label'}).to_csv('drive/MyDrive/Colab Notebooks/data/disaster_tweets_train.csv', index=False)
valid[['text','target']].rename(columns={'target':'label'}).to_csv('drive/MyDrive/Colab Notebooks/data/disaster_tweets_val.csv', index=False)

# Per-class views of the training split, each with its own 0..n-1 index.
disasters = train[train['target'] == 1].reset_index()
not_disasters = train[train['target'] == 0].reset_index()

# Kept as the LAST expression so the notebook actually displays it; in the
# middle of the cell its value was silently discarded.
train.shape, valid.shape, data.shape

In [None]:
# Distinct values per column among training tweets with target == 1.
disasters.nunique()

level_0                2450
index                  2450
id                     2450
keyword                 220
location               1197
text                   2418
target                    1
text_simple            2130
text_spacy             2417
text_trigram           2417
text_trigram_simple    2130
text_trigram_spacy     2416
dtype: int64

In [None]:
# Distinct values per column among training tweets with target == 0.
not_disasters.nunique()

level_0                3259
index                  3259
id                     3259
keyword                 216
location               1668
text                   3239
target                    1
text_simple            3069
text_spacy             3238
text_trigram           3239
text_trigram_simple    3069
text_trigram_spacy     3238
dtype: int64

#### TF-IDF

In [None]:
from collections import defaultdict
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Raw tweet strings for each class of the training split.
disaster_tweets = disasters['text'].tolist()
not_disaster_tweets = not_disasters['text'].tolist()

# Whitespace tokenisation: one token list per tweet.
disaster_tweets_split = [tweet.split() for tweet in disaster_tweets]
not_disaster_tweets_split = [tweet.split() for tweet in not_disaster_tweets]

In [None]:
# Per-class token counts.
disaster_tweets_word_frequency = defaultdict(int)
not_disaster_tweets_word_frequency = defaultdict(int)

for tokenized, frequency in (
    (disaster_tweets_split, disaster_tweets_word_frequency),
    (not_disaster_tweets_split, not_disaster_tweets_word_frequency),
):
    for tokens in tokenized:
        for token in tokens:
            frequency[token] += 1

# Keep only tokens seen more than once within their own class.
disaster_tweets_split = [
    [token for token in tokens if disaster_tweets_word_frequency[token] > 1]
    for tokens in disaster_tweets_split
]
not_disaster_tweets_split = [
    [token for token in tokens if not_disaster_tweets_word_frequency[token] > 1]
    for tokens in not_disaster_tweets_split
]

In [None]:
# One gensim Dictionary (token -> id) per class, built from that class's
# filtered token lists, followed by the matching bag-of-words corpus.
disaster_tweets_dct = Dictionary(disaster_tweets_split)
disaster_tweets_corpus = list(map(disaster_tweets_dct.doc2bow, disaster_tweets_split))

not_disaster_tweets_dct = Dictionary(not_disaster_tweets_split)
not_disaster_tweets_corpus = list(map(not_disaster_tweets_dct.doc2bow, not_disaster_tweets_split))

In [None]:
# Fit one TF-IDF model per class, each on that class's own bag-of-words corpus.
disaster_tweets_tfidf = TfidfModel(disaster_tweets_corpus)
not_disaster_tweets_tfidf = TfidfModel(not_disaster_tweets_corpus)

# Re-weight each class corpus with its own TF-IDF model.
disaster_tweets_tfidf_vectors = disaster_tweets_tfidf[disaster_tweets_corpus]
not_disaster_tweets_tfidf_vectors = not_disaster_tweets_tfidf[not_disaster_tweets_corpus]

# One similarity index per class; the prediction loop queries both indexes
# with each validation tweet.
disaster_tweets_similarity = MatrixSimilarity(disaster_tweets_tfidf_vectors)
not_disaster_tweets_similarity = MatrixSimilarity(not_disaster_tweets_tfidf_vectors)

In [None]:
# Whitespace-tokenise the validation tweets.
valid_tweets = valid['text'].tolist()
valid_tweets_split = [tweet.split() for tweet in valid_tweets]

# Token counts over the validation split itself.
valid_tweets_word_frequency = defaultdict(int)
for tokens in valid_tweets_split:
    for token in tokens:
        valid_tweets_word_frequency[token] += 1

# Drop tokens that appear only once in the validation split.
valid_tweets_split = [
    [token for token in tokens if valid_tweets_word_frequency[token] > 1]
    for tokens in valid_tweets_split
]

In [None]:
# Start every validation row at class 0; the loop below flips rows to 1.
valid['prediction'] = np.zeros(len(valid), dtype='int')

In [None]:
# A training tweet "votes" for its class when its similarity to the
# validation tweet exceeds this cut-off.
SIMILARITY_THRESHOLD = 0.1

predictions = []
for tweet in valid_tweets_split:
    # Project the validation tweet into each class's own vocabulary and
    # TF-IDF weighting before querying that class's similarity index.
    disaster_query = disaster_tweets_tfidf[disaster_tweets_dct.doc2bow(tweet)]
    not_disaster_query = not_disaster_tweets_tfidf[not_disaster_tweets_dct.doc2bow(tweet)]

    # Each index lookup returns one similarity score per training tweet of
    # that class; count how many clear the threshold.
    disaster_tally = int((disaster_tweets_similarity[disaster_query] > SIMILARITY_THRESHOLD).sum())
    not_disaster_tally = int((not_disaster_tweets_similarity[not_disaster_query] > SIMILARITY_THRESHOLD).sum())

    # Predict "disaster" only on a strict majority of votes; ties stay 0.
    predictions.append(int(disaster_tally > not_disaster_tally))

# Single column assignment instead of one DataFrame .loc write per row.
valid['prediction'] = predictions

KeyboardInterrupt: ignored

In [None]:
# Score the predictions currently stored in valid['prediction'].
y_true, y_pred = valid['target'], valid['prediction']
accuracy = accuracy_score(y_true, y_pred)
F1 = f1_score(y_true, y_pred)
accuracy, F1

#### Word2Vec

In [None]:
#collapse
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import time

In [None]:
#collapse
def replace_unknowns(search_texts, min_count):
    """
    Replace words that occur fewer than ``min_count`` times with 'UNK'.

    Every text is lower-cased and split on runs of whitespace before
    counting, so the returned texts are lower-cased as well (apart from the
    'UNK' placeholder) and their tokens are joined by single spaces.

    Parameters
    ----------
    search_texts : iterable of str
        The texts to process (e.g. a list or a pandas Series of strings).
        Word counts are taken across all of them together. Pass a
        collection of strings, not one bare string: a bare string would be
        iterated character by character.
    min_count : int
        The minimum number of occurrences (inclusive) a word needs across
        ``search_texts`` in order NOT to be replaced with 'UNK'.

    Returns
    -------
    list
        One string per input text, with every word that occurs fewer than
        ``min_count`` times replaced by 'UNK'.

    """
    # Lower-case and tokenise: capitalisation is only an orthographic
    # convention, so 'A' and 'a' are counted as the same word.
    texts = [re.split(r'\s+', text.lower()) for text in search_texts]

    # Corpus-wide occurrence count of every token.
    vocab_counts = defaultdict(int)
    for text in texts:
        for word in text:
            vocab_counts[word] += 1

    # Words frequent enough to keep. The bound is inclusive (>=) so that a
    # word seen exactly ``min_count`` times survives, as documented. A set
    # gives O(1) membership tests in the replacement pass below (a list made
    # that pass quadratic in the vocabulary size).
    vocab = {word for word, count in vocab_counts.items() if count >= min_count}

    return [
        ' '.join(word if word in vocab else 'UNK' for word in text)
        for text in texts
    ]

In [None]:
#collapse
# Add the three UNK-replaced columns to each frame. Word counts are taken
# within each frame separately, because replace_unknowns only sees the
# column it is given.
for frame in (valid, disasters, not_disasters):
    frame['text_count_5'] = replace_unknowns(frame['text_trigram'], 5)
    frame['text_simple_5'] = replace_unknowns(frame['text_trigram_simple'], 5)
    frame['text_spacy_5'] = replace_unknowns(frame['text_trigram_spacy'], 5)

In [None]:
# Same UNK replacement on the full data set (counts taken over all tweets).
for source_col, target_col in (
    ('text_trigram', 'text_count_5'),
    ('text_trigram_simple', 'text_simple_5'),
    ('text_trigram_spacy', 'text_spacy_5'),
):
    data[target_col] = replace_unknowns(data[source_col], 5)


In [None]:
# Clear the TF-IDF predictions before the Word2Vec run. errors='ignore'
# keeps the cell re-runnable: without it a second execution (or skipping the
# TF-IDF section) raises KeyError because the column is already absent.
valid = valid.drop(columns=['prediction'], errors='ignore')

In [None]:
# Untrained Word2Vec model; vocabulary and weights are built in the next cell.
# NOTE(review): gensim's docs reportedly require a single worker thread (plus
# a fixed PYTHONHASHSEED) for fully reproducible runs, so seed=24 together
# with workers=4 presumably does not pin the result -- confirm.
model = Word2Vec(min_count=5, sample=1e-3, workers=4, seed=24)

In [None]:
# Training corpus: the UNK-replaced tweets of the full data set, split on
# single spaces.
tweets = [tweet.split(' ') for tweet in data['text_count_5']]

model.build_vocab(tweets)
model.train(tweets, total_examples=model.corpus_count, epochs=30)

(2007377, 3383040)

In [None]:
# NOTE(review): with replace=True this presumably normalises the word vectors
# in place (so the model cannot be trained further), and init_sims is
# reported as deprecated in gensim 4 -- confirm against the installed version.
model.wv.init_sims(replace=True)

In [None]:
# Start every validation row at class 0; the loop below flips rows to 1.
valid['prediction'] = np.zeros(len(valid), dtype='int')

In [None]:
start_time = time.time()

# A training tweet "votes" for its class when its n_similarity to the
# validation tweet exceeds this cut-off.
W2V_SIMILARITY_THRESHOLD = 0.7

# Tokenise every training tweet ONCE, up front. These token lists do not
# depend on the validation row, so re-splitting them inside the loop repeated
# the same work for every validation tweet.
tokenized_disaster_tweets = [
    re.split(r'\s+', tweet) for tweet in disasters['text_count_5']
]
tokenized_not_disaster_tweets = [
    re.split(r'\s+', tweet) for tweet in not_disasters['text_count_5']
]

predictions = []
for valid_tweet in valid['text_count_5']:
    tokenized_valid_tweet = re.split(r'\s+', valid_tweet) # split on all whitespace characters

    # Number of training tweets of each class that are similar enough.
    disaster_count = sum(
        model.wv.n_similarity(tokenized_valid_tweet, tokens) > W2V_SIMILARITY_THRESHOLD
        for tokens in tokenized_disaster_tweets
    )
    not_disaster_count = sum(
        model.wv.n_similarity(tokenized_valid_tweet, tokens) > W2V_SIMILARITY_THRESHOLD
        for tokens in tokenized_not_disaster_tweets
    )

    # Predict "disaster" only on a strict majority of votes; ties stay 0.
    predictions.append(int(disaster_count > not_disaster_count))

# Single column assignment instead of one DataFrame .loc write per row.
valid['prediction'] = predictions

end_time = time.time()
print(f'Runtime: {(end_time - start_time) / 60.0} mins')

Runtime: 23.706074607372283 mins


In [None]:
# Inspect the derived text columns and the Word2Vec predictions.
valid.head()

Unnamed: 0,index,id,keyword,location,text,target,text_simple,text_spacy,text_trigram,text_trigram_simple,text_trigram_spacy,text_count_5,text_simple_5,text_spacy_5,prediction
0,3068,4402,electrocute,,Kids got Disney version of the game Operation ...,0,kids got disney version of the game operation ...,kid get disney version game operation 2 aa bat...,Kids got Disney version of the game Operation ...,kids got disney version of the game operation ...,kid get disney version game operation 2 aa bat...,UNK got UNK UNK of the game UNK only 2 UNK UNK...,kids got UNK UNK of the game UNK only 2 UNK UN...,kid get UNK version game UNK 2 UNK UNK ? UNK o...,0
1,3148,4522,emergency,"Indianapolis, IN",UPDATE: Indiana State Police reopening I-65 ne...,1,update indiana state police reopening i 65 nea...,update : indiana state police reopen i-65 near...,UPDATE: Indiana State Police reopening I-65 ne...,update indiana state police reopening i 65 nea...,update : indiana state police reopen i-65 near...,update: UNK state police UNK UNK near UNK UNK ...,update UNK state police UNK i UNK near UNK UNK...,update : UNK state police UNK UNK near UNK UNK...,0
2,3139,4511,emergency,Phoenix,God forbid anyone in my family knows how to an...,0,god forbid anyone in my family knows how to an...,god forbid family know answer phone . need new...,God forbid anyone in my family knows how to an...,god forbid anyone in my family knows how to an...,god forbid family know answer phone . need new...,god UNK UNK in my family UNK how to UNK a UNK ...,god UNK UNK in my family UNK how to UNK a phon...,god UNK family know UNK phone . need new emerg...,0
3,7485,10707,wreck,"Alabama, USA",First wreck today. So so glad me and mom are o...,0,first wreck today so so glad me and mom are ok...,wreck today . glad mom okay . lot bad . happy ...,First wreck today. So so glad me and mom are o...,first wreck today so so glad me and mom are ok...,wreck today . glad mom okay . lot bad . happy ...,first wreck UNK so so UNK me and UNK are UNK U...,first wreck today so so UNK me and UNK are UNK...,wreck today . UNK UNK UNK . lot bad . UNK UNK ...,0
4,6023,8608,seismic,Somalia,Exploration takes seismic shift in Gabon to So...,0,exploration takes seismic shift in gabon to so...,exploration take seismic shift gabon somalia -...,Exploration takes seismic shift in Gabon to So...,exploration takes seismic shift in gabon to so...,exploration take seismic shift gabon somalia -...,UNK UNK seismic UNK in UNK to UNK - UNK UNK UN...,UNK UNK seismic UNK in UNK to UNK UNK UNK,UNK take seismic UNK UNK UNK - UNK ( UNK ) UNK...,0


In [None]:
# Score the predictions currently stored in valid['prediction'].
y_true, y_pred = valid['target'], valid['prediction']
accuracy = accuracy_score(y_true, y_pred)
F1 = f1_score(y_true, y_pred)
accuracy, F1

(0.6313025210084033, 0.26875)

#### Transformers

In [None]:
!pip install transformers

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

# Rebinds `model` (until now the Word2Vec model) to a TensorFlow DistilBERT
# classifier with two labels.
# NOTE(review): `model` is rebound again by the AutoModelForSequenceClassification
# cell further down before this TF model is used anywhere in the visible
# notebook -- this load looks like a leftover; confirm before removing.
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:

# Tokenizer for the same checkpoint name as the model; read as a global by
# tokenize_function.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
!pip install datasets

In [None]:
# from datasets import load_dataset
# raw_datasets = load_dataset("imdb")

In [None]:
from datasets import load_dataset
# Re-read the text/label CSVs written in the train/test split cell.
# Each result is later indexed with ['train'], i.e. every file is loaded as
# its own dataset collection whose only split is named 'train'.
train_ds = load_dataset('csv', data_files='drive/MyDrive/Colab Notebooks/data/disaster_tweets_train.csv')
valid_ds = load_dataset('csv', data_files='drive/MyDrive/Colab Notebooks/data/disaster_tweets_val.csv')

In [None]:
def tokenize_function(examples):
    """Tokenise the "text" field of ``examples`` with the global ``tokenizer``.

    The tokenizer is called with padding="max_length" and truncation=True,
    and its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Tokenise both dataset collections in batches.
# NOTE(review): small_train_dataset / small_eval_dataset are not referenced
# again in the visible notebook; the same mapping is repeated further down
# for tf_train_ds / tf_eval_ds -- possibly a leftover.
small_train_dataset = train_ds.map(tokenize_function, batched=True)
small_eval_dataset = valid_ds.map(tokenize_function, batched=True)

In [None]:
# tf_train_dataset = train_ds.remove_columns(["text"]).with_format("tensorflow")
# tf_eval_dataset = valid_ds.remove_columns(["text"]).with_format("tensorflow")
# NOTE(review): these are built from the untokenised train_ds / valid_ds (the
# tokenised results were bound to other names), and neither variable is used
# again in the visible notebook -- possibly a leftover.
tf_train_dataset = train_ds.with_format("tensorflow")
tf_eval_dataset  = valid_ds.with_format("tensorflow")

In [None]:
def tokenize_function(examples):
    """Tokenise the "text" field of ``examples`` with the global ``tokenizer``.

    The tokenizer is called with padding="max_length" and truncation=True,
    and its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Tokenise in batches and take the single 'train' split of each collection.
# Despite the tf_ prefix, these are what the Trainer cell below receives as
# train_dataset / eval_dataset.
tf_train_ds = train_ds.map(tokenize_function, batched=True)['train']
tf_eval_ds = valid_ds.map(tokenize_function, batched=True)['train']

In [None]:
from transformers import AutoModelForSequenceClassification

# Rebinds `model` once more (it previously held the Word2Vec model, then the
# TF classifier); this is the object handed to Trainer below.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
from transformers import TrainingArguments

# Only the output directory name is given; everything else is left at the
# library defaults (the recorded run logged 3 epochs and a per-device batch
# size of 8, and wrote its checkpoints under test_trainer/).
training_args = TrainingArguments("test_trainer")

In [None]:
from transformers import Trainer

# Wire the classifier, the training arguments and the tokenised splits
# together; no metric function is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tf_train_ds,
    eval_dataset=tf_eval_ds,
)


In [None]:
# Fine-tune the classifier. The recorded run below was interrupted manually
# (KeyboardInterrupt) after the step-1000 checkpoint had been saved.
trainer.train()

***** Running training *****
  Num examples = 5709
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2142


Step,Training Loss
500,0.4702
1000,0.3842


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin


KeyboardInterrupt: ignored