In [45]:
import pandas as pd

In [46]:
train_data = pd.read_csv("input/train.csv", index_col = "id")
train_data.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [47]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   7552 non-null   object
 1   location  5080 non-null   object
 2   text      7613 non-null   object
 3   target    7613 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 297.4+ KB


In [48]:
test_data = pd.read_csv("input/test.csv", index_col="id")
test_data.head()

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [49]:
disaster_tweet_example = train_data[train_data['target']==1]['text'].values
disaster_tweet_example

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [50]:
non_disaster_tweet_example = train_data[train_data['target']==0]['text'].values
non_disaster_tweet_example

array(["What's up man?", 'I love fruits', 'Summer is lovely', ...,
       'These boxes are ready to explode! Exploding Kittens finally arrived! gameofkittens #explodingkittens\x89Û_ https://t.co/TFGrAyuDC5',
       'Sirens everywhere!',
       'I just heard a really loud bang and everyone is asleep great'],
      dtype=object)

Text preprocessing :
1. Tokenization
2. Remove stop words
3. Word Embeddings

1. Tokenization:
Here I have two options TweetTokenizer and WordTokenizer, I choose to use WordTokenizer, as it separate # from the tag in hashtags.
So, in an example above we have #earthquake, with WordTokenizer we'll get #, and earthquake

Reasoning:
In this case the target, depends on the words used in the tweet, so we the word "earthquake" will provide more value rather than the hashtag. 

In [51]:
import nltk
from nltk.tokenize import word_tokenize
train_data['tokenized_text'] = train_data['text'].apply(word_tokenize)

train_data.head()

Unnamed: 0_level_0,keyword,location,text,target,tokenized_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #, ea..."
4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]"
5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac..."
6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfires, evacua..."
7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #, ..."


2. Remove Punctuations
Let's remove punctuation using isalnum()

In [52]:
def remove_punctuation(arr) :
    for w in arr:
        if(not w.isalnum()):
            arr.remove(w)
    return arr

In [53]:
train_data['no_punc_tokens'] = train_data['tokenized_text'].apply(remove_punctuation)

In [54]:
train_data.head()

Unnamed: 0_level_0,keyword,location,text,target,tokenized_text,no_punc_tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, earth...","[Our, Deeds, are, the, Reason, of, this, earth..."
4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]"
5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, in, place, are, be...","[All, residents, asked, to, in, place, are, be..."
6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfires, evacuation, order...","[people, receive, wildfires, evacuation, order..."
7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, Ala...","[Just, got, sent, this, photo, from, Ruby, Ala..."


3. Word Embeddings

Let's try out with simplest word embedding CountVectorizer

In [55]:
from sklearn import feature_extraction 

count_vectorizer = feature_extraction.text.CountVectorizer(stop_words='english', tokenizer=word_tokenize)

In [56]:
training_vectors = count_vectorizer.fit_transform(train_data['text'])

# print(training_vectors)

In [57]:
test_vectors = count_vectorizer.transform(test_data['text'])

In [58]:
training_vectors[1].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Applying ML model to just the vectors, not considering Keyword and Location at the moment

In [59]:
from sklearn import linear_model, model_selection
model1 = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(model1, training_vectors, train_data['target'], cv=5, scoring='accuracy')

scores

array([0.70124754, 0.60669731, 0.63164806, 0.6346912 , 0.70367937])

In [60]:
train_data['target'].shape

(7613,)

In [61]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(training_vectors, train_data['target'], test_size=0.2)

In [44]:
from sklearn.ensemble import RandomForestClassifier

randomForestModel = RandomForestClassifier()

randomForestModel.fit(train_X, train_y)
train_mean_accuracy = randomForestModel.score(train_X, train_y)
val_mean_accuracy = randomForestModel.score(val_X, val_y)

print("Mean Accuracy of training: ", train_mean_accuracy)
print("Mean Accuracy of validation: ", val_mean_accuracy)

    

Mean Accuracy of training:  0.9919425468558416
Mean Accuracy of validation:  0.7547268907563025


In [45]:
model1.fit(training_vectors, train_data['target'])
submission = pd.read_csv("input/submission.csv")
submission['target'] = model1.predict(test_vectors)
submission.to_csv("output/submission1.csv", index=False)