In [1]:
#importing relevant libraries
import numpy as np 
import pandas as pd 
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

In [2]:
#importing data into dataframes
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
train_df.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


61 missing keywords and 2533 missing locations

## Steps in Model  
    1.remove urls    
    2.remove emojis      
    3.remove contractions 
    4.remove punctuations      
    5.tokenize and lemmatize      
    6.Vectorize dataset to feed into model  
    7.train model and evaluate model accuracy      
    8.generate submission file  

In [3]:
#1. remove urls
import re
def remove_URL(text):
    urls = re.compile(r'https?://\S+|www\.\S+')
    text_without_https =  urls.sub(r'',text)
    url = re.compile(r'http?://\S+|www\.\S+')
    return url.sub(r'',text_without_https)
train_df['text']=train_df['text'].apply(remove_URL)
test_df['text']=test_df['text'].apply(remove_URL)

In [4]:
#2. remove emojis
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
train_df['text']=train_df['text'].apply(remove_emoji)
test_df['text']=test_df['text'].apply(remove_emoji)

In [5]:
#3. remove contractions
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"thx"   : "thanks"
}

def remove_contractions(text):
    temp = text.split()
    final_text =''
    for wrd in temp:
        if wrd.lower() in contractions:
            replaced_word = contractions[wrd.lower()] + ' '
        else:
            replaced_word = wrd + ' '
        final_text  = final_text + replaced_word
    return final_text.strip()

train_df['text']=train_df['text'].apply(remove_contractions)
test_df['text']=test_df['text'].apply(remove_contractions)

In [6]:
#4. remove punctuation
import string
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)
train_df['text']=train_df['text'].apply(remove_punct)
test_df['text']=test_df['text'].apply(remove_punct)

In [7]:
#5. tokenize and lemmatize
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def tokenize_and_lemmatize(text):    
    text_tokens = word_tokenize(text.lower())
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
    word_list = [lemmatizer.lemmatize(w) for w in tokens_without_sw]
    return word_list
train_df['text_tokens']=train_df['text'].apply(tokenize_and_lemmatize)
test_df['text_tokens']=test_df['text'].apply(tokenize_and_lemmatize)

In [9]:
#reconstruct clean sentence from tokens
def reconstruct_sentence(tokens):
    sentence = ''
    for i in tokens:
        sentence = sentence + ' ' + i
    return sentence.strip()
train_df['clean_text']= train_df['text_tokens'].apply(reconstruct_sentence)
test_df['clean_text']= test_df['text_tokens'].apply(reconstruct_sentence)
#checking first few records
train_df.head(5)

Unnamed: 0,id,keyword,location,text,target,text_tokens,clean_text
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1,"[deed, reason, earthquake, may, allah, forgive...",deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask Canada,1,"[forest, fire, near, ronge, sask, canada]",forest fire near ronge sask canada
2,5,,,All residents asked to shelter in place are be...,1,"[resident, asked, shelter, place, notified, of...",resident asked shelter place notified officer ...
3,6,,,13000 people receive wildfires evacuation orde...,1,"[13000, people, receive, wildfire, evacuation,...",13000 people receive wildfire evacuation order...
4,7,,,Just got sent this photo from Ruby Alaska as s...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi...",got sent photo ruby alaska smoke wildfire pour...


In [10]:
#6. converting data into TFID vectors and also using bigrams as they add more context when words are combined
tfid_vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2))
train_vectors_tfid = tfid_vectorizer.fit_transform(train_df["clean_text"])
test_vectors_tfid = tfid_vectorizer.transform(test_df["clean_text"])

In [12]:
#7. Training model using RgidgeClassifier
clf = linear_model.RidgeClassifier()
scores = model_selection.cross_val_score(clf, train_vectors_tfid, train_df["target"], cv=5, scoring="f1")
print("average score of model is {}".format(np.mean(scores)))

average score of model is 0.5484058791442074


In [13]:
clf.fit(train_vectors_tfid, train_df["target"])
pred=clf.predict(test_vectors_tfid)
print(tfid_vectorizer.vocabulary_)



In [14]:
#8. generating submission file  
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission['target']=clf.predict(test_vectors_tfid)
sample_submission.to_csv('submission0403_bigram.csv',index=False)

### Summary
This was my first foray into natural language processing and this was a model that was pretty straight forward. I think it can be used as a good baseline to reference as I start using more advance machine learning algorithims and techniques.  
Using this I acheived a score of 0.80263 