In [1]:
import sklearn
import numpy
import pandas
import nltk
import regex as re
import sklearn.ensemble
import sklearn.neural_network
import sklearn.model_selection
import pickle

In [2]:
# Read in training data
training_data = pandas.read_csv("Data/train.csv")

In [3]:
# Print head and tail to make sure that the dataset is loaded in correctly
training_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Get basic information on dataset 
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Check to see if there are duplicated rows 
duplicated_data = (training_data[training_data.duplicated(keep=False)])
print(duplicated_data)

Empty DataFrame
Columns: [id, keyword, location, text, target]
Index: []


In [6]:
# Explore the keyword tag
training_data[training_data["keyword"].isnull()==False]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7578,10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0


In [7]:
# Explore the location tag
training_data[training_data["location"].isnull()==False]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0


In [8]:
# Drop the keyword and location tags, don't appear to provide relevant information for differntiating real and fake disasters 
training_data = training_data.drop(columns=["keyword", "location"])
# Verify that the columns have been dropped 
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7613 non-null   int64 
 1   text    7613 non-null   object
 2   target  7613 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 178.6+ KB


In [9]:
# Define methods needed for preprocessing tweets
# This code froms https://www.kaggle.com/amackcrane/python-version-of-glove-twitter-preprocess-script
FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = "<hashtag> {} <allcaps>".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps> " # amackcrane added trailing space


def process(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\w+", hashtag)  # amackcrane edit
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    #text = re_sub(r"([A-Z]){2,}", allcaps)  # moved below -amackcrane

    # amackcrane additions
    text = re_sub(r"([a-zA-Z<>()])([?!.:;,])", r"\1 \2")
    text = re_sub(r"\(([a-zA-Z<>]+)\)", r"( \1 )")
    text = re_sub(r"  ", r" ")
    text = re_sub(r" ([A-Z]){2,} ", allcaps)
    
    return text


In [10]:
# Define test string
text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"

# Define twitter tokenizer
tokenizer = nltk.tokenize.TweetTokenizer()

# Test that preprocessing and tokenizing works 
processed_text = process(text)
text = tokenizer.tokenize(processed_text)
print(processed_text)
print()
print(text)

I test  <allcaps> al <elong> kinds of <hashtag> hashtags and <hashtag> hashtags <allcaps> , <user> and <number> ( <url> ) . w / <heart> <smile> haha ! <repeat>

['I', 'test', '<allcaps>', 'al', '<elong>', 'kinds', 'of', '<hashtag>', 'hashtags', 'and', '<hashtag>', 'hashtags', '<allcaps>', ',', '<user>', 'and', '<number>', '(', '<url>', ')', '.', 'w', '/', '<heart>', '<smile>', 'haha', '!', '<repeat>']


In [11]:
# Convert the text and targets to lists for easier processing
tweets = training_data["text"].values.tolist()
labels = training_data["target"].values.tolist()

In [14]:
# Import glove embeddings, and create an embedding dictionary
embedding_dictionary = {}

# Use twitter trained embeddings with 200 dim embedding vector
with open('Data/glove.twitter.27B.200d.txt', 'r', encoding="utf-8") as encodings:
    # Iterate through file
    # Each line is formatted as word, embedding vector
    # Split lines using the fom
    average_vec = numpy.zeros(200)
    line_count = 0
    for line in encodings:
        values = line.split(' ')
        word = values[0]
        vector = numpy.asarray(values[1:], "float32")
        embedding_dictionary[word] = vector
        line_count += 1
        average_vec += vector
    # Create average of all embeddings to represent unknown tokens
    average_vec = average_vec / line_count
    embedding_dictionary["<unknown>"] = average_vec

In [15]:
# Process and tokenize all sentences so they are properly formatted for glove embeddings 
tokenized_tweets = []
for tweet in tweets:
    processed_tweet = process(tweet)
    tokenized_tweets.append(tokenizer.tokenize(processed_tweet))

# Check that tokenization worked
tokenized_tweets

[['Our',
  'Deeds',
  'are',
  'the',
  'Reason',
  'of',
  'this',
  '<hashtag>',
  'earthquake',
  'May',
  'allah',
  '<allcaps>',
  'Forgive',
  'us',
  'all'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask', '.', 'Canada'],
 ['All',
  'residents',
  'asked',
  'to',
  "'",
  'shelter',
  'in',
  'place',
  "'",
  'are',
  'being',
  'notified',
  'by',
  'officers',
  '.',
  'No',
  'other',
  'evacuation',
  'or',
  'shelter',
  'in',
  'place',
  'orders',
  'are',
  'expected'],
 ['<number>',
  'people',
  'receive',
  '<hashtag>',
  'wildfires',
  'evacuation',
  'orders',
  'in',
  'California'],
 ['Just',
  'got',
  'sent',
  'this',
  'photo',
  'from',
  'Ruby',
  '<hashtag>',
  'Alaska',
  'as',
  'smoke',
  'from',
  '<hashtag>',
  'wildfires',
  'pours',
  'into',
  'a',
  'school'],
 ['<hashtag>',
  'Rocky',
  'Fire',
  'Update',
  '=',
  '>',
  'California',
  'Hwy',
  '.',
  '<number>',
  'closed',
  'in',
  'both',
  'directions',
  'due',
  'to',
  'Lake',
  'Co

In [16]:
# Covert the tokenized sentences to a feature vector
# This is done by taking an average of the embeddings for the tokens in the tweet
tweets_as_embeddings = []
not_in_dict = 0
for tweet in tokenized_tweets:
    tweet_embedding = numpy.zeros(200)
    for token in tweet:
        if token in embedding_dictionary:
            tweet_embedding = tweet_embedding + embedding_dictionary[token]
        else:
            tweet_embedding = tweet_embedding + embedding_dictionary["<unknown>"]

            
    tweet_embedding = tweet_embedding / len(tweet)
    tweets_as_embeddings.append(tweet_embedding)

# Convert the list to a numpy array for cleaness
tweets_as_embeddings = numpy.array(tweets_as_embeddings)

# Check that the conversion worked
tweets_as_embeddings

array([[ 0.11760515,  0.08210948,  0.09769516, ..., -0.09296505,
         0.02571119,  0.08509148],
       [-0.12978727, -0.07557999, -0.09554107, ..., -0.08212609,
         0.11236199, -0.00320835],
       [-0.02803857,  0.0344715 , -0.14127986, ...,  0.028314  ,
         0.07909081,  0.00812336],
       ...,
       [ 0.11238382, -0.07432353,  0.07280683, ...,  0.04205782,
        -0.13706134,  0.43532688],
       [-0.0170595 , -0.08372771, -0.08439747, ...,  0.18867598,
         0.15529744, -0.01096035],
       [-0.02523925,  0.02564444, -0.02052576, ..., -0.02517713,
        -0.10594992,  0.07157612]])

In [17]:
# Break training data into a training set and a validation set 
validation_data = tweets_as_embeddings[0:1000]
validation_labels = labels[0:1000]
training_tweets = tweets_as_embeddings[1000:]
training_labels = labels[1000:]

In [None]:
# Perform grid search to find best possible hyperparams for random forest
random_forest = sklearn.ensemble.RandomForestClassifier(oob_score=True)
params = {
    "n_estimators": [100,200,300,400,500,750,1000],
    "criterion": ['gini','entropy'],
    "max_features": ["auto","sqrt","log2",20,50,70,100,150,200]
}
grid_search = sklearn.model_selection.GridSearchCV(estimator=random_forest, param_grid=params, cv=10, n_jobs=12)
grid_search.fit(tweets_as_embeddings,labels)
best_model = grid_search.best_estimator_

In [27]:
# Score the model
best_model.score(tweets_as_embeddings,labels)

0.9841061342440562

In [26]:
# Write model to file
with open("best_random_forest.pickle",'wb') as model:
    pickle.dump(best_model,model)

In [21]:
# Import test set, and perform the same processing as training set to embedd the tweets
test_data = pandas.read_csv('Data/test.csv')
test_data = test_data.drop(columns=["keyword", "location"])

# Convert the text and targets to lists for easier processing for test set
test_tweets = test_data["text"].values.tolist()

# Preproces and tokenize test data
tokenized_tweets_test = []
for tweet in test_tweets:
    processed_tweet = process(tweet)
    tokenized_tweets_test.append(tokenizer.tokenize(processed_tweet))

# Covert the tokenized sentences to an embedding vector
# This is done by taking an average of the embeddings for the tokens in the tweet
tweets_as_embeddings_test = []
for tweet in tokenized_tweets_test:
    tweet_embedding = numpy.zeros(200)
    for token in tweet:
        if token in embedding_dictionary:
            tweet_embedding = tweet_embedding + embedding_dictionary[token]
        else:
            tweet_embedding = tweet_embedding + embedding_dictionary['<unknown>']

            
    tweet_embedding = tweet_embedding / len(tweet)
    tweets_as_embeddings_test.append(tweet_embedding)

tweets_as_embeddings_test = numpy.array(tweets_as_embeddings_test)

tweets_as_embeddings_test

array([[ 0.062859  ,  0.01061584, -0.19109777, ...,  0.12521795,
         0.12256117,  0.04277267],
       [ 0.06882608, -0.02544418, -0.05030517, ..., -0.09823683,
        -0.00775425,  0.25495916],
       [ 0.01680457, -0.01223924,  0.05610262, ..., -0.04014433,
        -0.11645214,  0.04329538],
       ...,
       [-0.058275  , -0.01203   ,  0.04802833, ..., -0.15971233,
         0.01915166,  0.11188667],
       [ 0.1429977 ,  0.0633    , -0.0857314 , ..., -0.202848  ,
        -0.2739963 ,  0.2823256 ],
       [ 0.11229323, -0.0535166 , -0.01932584, ..., -0.12111883,
        -0.04422525,  0.42631332]])

In [22]:
# Create a new pandas frame with the same ids 
submission = pandas.DataFrame(data=test_data['id'], columns=['id'])

# Predict the label for each test case 
predicted_labels = best_model.predict(tweets_as_embeddings_test)

# Assign predicted labels to the ids in submission dataset and output to csv
submission['target'] = predicted_labels

# Write the submission to a csv file
submission.to_csv("submission_4.csv",index=False)