In [1]:
import pandas as pd
from nlp_disaster.methods.formatting import clean_stopwords, clean_puncts

import json

In [2]:
import os
# Directory layout used throughout the notebook. All paths are relative to
# the working directory the kernel was started in.
DATA_PATH = "./data/"          # holds formatting.json
INPUT_PATH = "./data/input/"   # holds train.csv, test.csv, sample_submission.csv
OUT_PATH = "./out/"            # receives submission.csv

# Create the output directory up front. makedirs(..., exist_ok=True) is a
# no-op when the directory already exists, creates missing parent
# directories, and avoids the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(OUT_PATH, exist_ok=True)

In [18]:
# Load both competition splits and replace every missing value (NaN) with an
# empty string, so later cells can test for absence with `!= ""`.
train_df, test_df = (
    pd.read_csv(csv_path).fillna("")
    for csv_path in (f"{INPUT_PATH}train.csv", f"{INPUT_PATH}test.csv")
)

In [19]:
# Peek at the first five training rows to confirm the columns that were loaded.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [20]:
# Load the formatting configuration that is later passed to clean_puncts as
# its second argument. JSON is UTF-8 by specification, so the encoding is
# pinned here instead of relying on the platform default, which is not UTF-8
# on every OS/locale.
with open(f"{DATA_PATH}formatting.json", encoding="utf-8") as infile:
    FORMATTING = json.load(infile)

# Removing Punctuations

Tweets contain hashtags and @ symbols. These are removed to aid the analysis.

In [21]:
# Run every raw training tweet through clean_puncts (configured by FORMATTING)
# and store the cleaned string next to the original text.
train_df["no-punct"] = train_df["text"].map(
    lambda tweet: clean_puncts(tweet, FORMATTING)
)

In [22]:
# Preview the first rows that carry a keyword. Missing keywords were replaced
# by "" when the CSV was loaded, so "not empty" means "has a keyword".
train_df[train_df["keyword"].ne("")].head()

Unnamed: 0,id,keyword,location,text,target,no-punct
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,Wholesale Markets ablaze http://t.co/lHYXEOHY6C
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,We always try to bring the heavy. metal RT htt...
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,AFRICANBAZE: Breaking news:Nigeria flag set ab...
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,Crying out for more! Set me ablaze
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...


In [42]:
# Apply the same clean_puncts step to the test tweets so that train and test
# text go through identical preprocessing.
test_df["no-punct"] = test_df["text"].map(
    lambda tweet: clean_puncts(tweet, FORMATTING)
)

# Removing Stopwords

Stopwords are common words such as 'a', 'the', 'and'. These words add little meaning to the tweet itself. They act like noise in image processing, so they are removed.

The function ```clean_stopwords``` removes these stopwords using the ```nltk``` library.

In [23]:
# Pass each punctuation-cleaned tweet through clean_stopwords and keep the
# result (a list of tokens per tweet, as the preview below shows).
train_df["filtered-text"] = train_df["no-punct"].map(clean_stopwords)

As you can see the ```filtered-text``` column contains the filtered keywords. 

You might also notice that the words are stored in a list instead of a string. This is a process called Tokenization. 

Tokenization splits sentences into discrete words called tokens. This is analogous to converting continuous data to discrete data. Converting a continuous string to tokens aids processing, as we can now apply mathematical methods to each token separately.

In [24]:
# Same keyword-bearing preview as before, now including the filtered-text
# column so the tokenised output can be compared with the cleaned string.
train_df[train_df["keyword"].ne("")].head()

Unnamed: 0,id,keyword,location,text,target,no-punct,filtered-text
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,Wholesale Markets ablaze http://t.co/lHYXEOHY6C,"[Wholesale, Markets, ablaze, http, :, //t.co/l..."
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,We always try to bring the heavy. metal RT htt...,"[We, always, try, bring, heavy, ., metal, RT, ..."
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,AFRICANBAZE: Breaking news:Nigeria flag set ab...,"[AFRICANBAZE, :, Breaking, news, :, Nigeria, f..."
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,Crying out for more! Set me ablaze,"[Crying, !, Set, ablaze]"
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,"[On, plus, side, LOOK, AT, THE, SKY, LAST, NIG..."


# Keyword and Location Column

In [25]:
# Show the first ten rows with a non-empty keyword to inspect the keyword and
# location columns side by side.
train_df[train_df["keyword"].ne("")].head(10)

Unnamed: 0,id,keyword,location,text,target,no-punct,filtered-text
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,Wholesale Markets ablaze http://t.co/lHYXEOHY6C,"[Wholesale, Markets, ablaze, http, :, //t.co/l..."
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,We always try to bring the heavy. metal RT htt...,"[We, always, try, bring, heavy, ., metal, RT, ..."
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1,AFRICANBAZE: Breaking news:Nigeria flag set ab...,"[AFRICANBAZE, :, Breaking, news, :, Nigeria, f..."
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,Crying out for more! Set me ablaze,"[Crying, !, Set, ablaze]"
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,"[On, plus, side, LOOK, AT, THE, SKY, LAST, NIG..."
36,54,ablaze,Pretoria,@PhDSquares #mufc they've built so much hype a...,0,mufc they've built so much hype around new acq...,"[mufc, 've, built, much, hype, around, new, ac..."
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1,INEC Office in Abia Set Ablaze - http://t.co/3...,"[INEC, Office, Abia, Set, Ablaze, -, http, :, ..."
38,56,ablaze,,Barbados #Bridgetown JAMAICA ÛÒ Two cars set ...,1,Barbados Bridgetown JAMAICA ÛÒ Two cars set a...,"[Barbados, Bridgetown, JAMAICA, ÛÒ, Two, cars..."
39,57,ablaze,Paranaque City,Ablaze for you Lord :D,0,Ablaze for you Lord :D,"[Ablaze, Lord, :, D]"
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0,Check these out: http://t.co/rOI2NSmEJJ http:/...,"[Check, :, http, :, //t.co/rOI2NSmEJJ, http, :..."


In [26]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [27]:
# Token-count vectorizer with scikit-learn's default settings; it is fitted
# on the training text in the next cell.
count_vectorizer = feature_extraction.text.CountVectorizer()

In [60]:
# Learn the vocabulary from the training tweets only, then encode the test
# tweets with that same vocabulary (transform, not fit_transform) so both
# matrices share one column layout.
# NOTE: the model is trained on the "no-punct" strings; the "filtered-text"
# token lists computed earlier are not used here.
train_vector = count_vectorizer.fit_transform(train_df["no-punct"])
test_vector = count_vectorizer.transform(test_df["no-punct"])

In [61]:
# Ridge classifier with default hyper-parameters, used as the baseline model.
clf = linear_model.RidgeClassifier()

In [62]:
# 3-fold cross-validated F1 of the classifier on the count features.
# NOTE: train_vector was built with a vocabulary fitted on all training rows,
# so the held-out fold's words are already part of the feature space.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)

In [63]:
# Per-fold F1 scores from the 3-fold cross-validation.
scores

array([0.58604156, 0.56188605, 0.63933626])

In [64]:
# Refit on the complete training set; the cross-validation above was only for
# scoring. The fitted estimator is the cell's displayed value.
clf.fit(X=train_vector, y=train_df["target"])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [68]:
# Load the submission template; its "target" column is overwritten with the
# model's predictions further down.
sample_submission = pd.read_csv(f"{INPUT_PATH}sample_submission.csv")

In [69]:
# Check the test frame: it carries the cleaned "no-punct" column that was
# vectorized above.
test_df.head()

Unnamed: 0,id,keyword,location,text,no-punct
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","Heard about earthquake is different cities, st..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. Spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan


In [70]:
# Predictions are positional: row i of test_vector comes from row i of
# test_df. Verify that the submission template lists the same ids in the same
# order before attaching the labels; otherwise predictions would silently be
# written against the wrong tweet ids.
assert sample_submission["id"].tolist() == test_df["id"].tolist(), (
    "sample_submission ids do not line up with test_df ids"
)
sample_submission["target"] = clf.predict(test_vector)

In [72]:
# Sanity-check the submission frame: one id column plus the predicted target.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [73]:
# Write the submission file into OUT_PATH; index=False keeps the DataFrame
# index out of the CSV so only the id and target columns are written.
sample_submission.to_csv(f"{OUT_PATH}submission.csv", index=False)