In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
train = pd.read_csv('train.csv').fillna('undef')
test = pd.read_csv('test.csv').fillna('undef').drop('id', axis=1)
X = train
y = train['target']

In [3]:
def separate_second_tag(df):
    col1 = []
    col2 = []
    for x in df['keyword']:
        try:
            smth = x.split("%20")
            col2.append(smth[1])
            col1.append(smth[0])
        except:
            col2.append("undef")
            col1.append(x)
    df.drop('keyword', axis=1, inplace = True)
    df['keyword_one'] = col1
    df['keyword_two'] = col2
    
    return df

In [4]:
X = separate_second_tag(X)
test = separate_second_tag(test)

In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

mispell_dict = {"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"couldnt" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"doesnt" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"havent" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"shouldnt" : "should not",
"that's" : "that is",
"thats" : "that is",
"there's" : "there is",
"theres" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"theyre":  "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"}

In [6]:
def replace_typical_misspell(text):
    mispellings_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    def replace(match):
        return mispell_dict[match.group(0)]
    return mispellings_re.sub(replace, text)

def text_cleaning(ln):
    corpus = []
    wordnet = WordNetLemmatizer()
    for text in ln:
        text = replace_typical_misspell(text)
        text = ' '.join(text.split('.'))
        text = re.sub(r'\s+', ' ', re.sub('[^A-Za-z0-9]', ' ', text.strip().lower())).strip()
        text = re.sub(r'\W+', ' ', text.strip().lower()).strip()
        text = [wordnet.lemmatize(word) for word in text.split(" ") if word not in set(stopwords.words('english'))]
        text = ' '.join(text)
        corpus.append(text)
        pass
    return corpus

In [7]:
X['text'] = text_cleaning(list(X['text']))
test['text'] = text_cleaning(list(test['text']))

In [8]:
X['location'] = text_cleaning(list(X['location']))
test['location'] = text_cleaning(list(test['location']))

In [9]:
#location
import spacy
nlp = spacy.load('en_core_web_lg')

def location_detection(ln):
    bins = []
    for text in ln:
        doc = nlp(text)
        entities = []
        for ent in doc.ents:
            entities.append(ent)
        if len(entities)>0:
            bins.append(1)
        else:
            bins.append(0)
    return bins

X['location_legit'] = location_detection(X['location'])
test['location_legit'] = location_detection(test['location'])

In [10]:
disaster_tweets =' '.join(train[train['target'] == 1]['text'].tolist())
non_disaster_tweets = ' '.join(train[train['target'] == 0]['text'].tolist())

In [11]:
import nltk
def return_top_words(text,n = 10):
    allWords = nltk.tokenize.word_tokenize(text)
    allWordExceptStopDist = nltk.FreqDist(w.lower() for w in allWords if w not in set(stopwords.words('english')))  
    mostCommontuples= allWordExceptStopDist.most_common(n)
    mostCommon = [tupl[0] for tupl in mostCommontuples]
    return mostCommon

In [12]:
top_50_disaster_words = return_top_words(disaster_tweets,50)
top_50_nondisaster_words = return_top_words(non_disaster_tweets,50)

In [13]:
top_500_disaster_words = return_top_words(disaster_tweets,500)
top_500_nondisaster_words = return_top_words(non_disaster_tweets,500)
top_disaster_exclusive = list(set(top_500_disaster_words[0]).difference(set(top_500_nondisaster_words[0])))
top_nondisaster_exclusive = list(set(top_500_nondisaster_words[0]).difference(set(top_500_disaster_words[0])))

In [14]:
total_vocab = top_disaster_exclusive + top_nondisaster_exclusive

In [15]:
for word in total_vocab:
    X['Is_'+word+'_present'] = X['text'].apply(lambda x: (word in x)*1)
    test['Is_'+word+'_present'] = test['text'].apply(lambda x: (word in x)*1)

In [16]:
X

Unnamed: 0,id,location,text,target,keyword_one,keyword_two,location_legit
0,1,undef,deed reason earthquake may allah forgive u,1,undef,undef,0
1,4,undef,forest fire near la ronge sask canada,1,undef,undef,0
2,5,undef,resident asked shelter place notified officer ...,1,undef,undef,0
3,6,undef,13 000 people receive wildfire evacuation orde...,1,undef,undef,0
4,7,undef,got sent photo ruby alaska smoke wildfire pour...,1,undef,undef,0
...,...,...,...,...,...,...,...
7608,10869,undef,two giant crane holding bridge collapse nearby...,1,undef,undef,0
7609,10870,undef,aria ahrary thetawniest control wild fire cali...,1,undef,undef,0
7610,10871,undef,m1 94 01 04 utc 5km volcano hawaii http co zdt...,1,undef,undef,0
7611,10872,undef,police investigating e bike collided car littl...,1,undef,undef,0


In [17]:
train

Unnamed: 0,id,location,text,target,keyword_one,keyword_two,location_legit
0,1,undef,deed reason earthquake may allah forgive u,1,undef,undef,0
1,4,undef,forest fire near la ronge sask canada,1,undef,undef,0
2,5,undef,resident asked shelter place notified officer ...,1,undef,undef,0
3,6,undef,13 000 people receive wildfire evacuation orde...,1,undef,undef,0
4,7,undef,got sent photo ruby alaska smoke wildfire pour...,1,undef,undef,0
...,...,...,...,...,...,...,...
7608,10869,undef,two giant crane holding bridge collapse nearby...,1,undef,undef,0
7609,10870,undef,aria ahrary thetawniest control wild fire cali...,1,undef,undef,0
7610,10871,undef,m1 94 01 04 utc 5km volcano hawaii http co zdt...,1,undef,undef,0
7611,10872,undef,police investigating e bike collided car littl...,1,undef,undef,0


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(ngram_range=(1, 3),
                         binary=True,
                         max_features = 5000,
                         smooth_idf=False)
X_tfidf = tf_idf.fit_transform(X['text'])
test_tfidf = tf_idf.transform(test['text'])

In [19]:
tf_location = TfidfVectorizer(ngram_range = (1,2),
                              binary = True,
                              max_features = 1500,
                              smooth_idf = False)
location_X_tfidf = tf_location.fit_transform(X['location'])
location_test_tfidf = tf_location.transform(test['location'])

In [20]:
train_data = pd.concat([X,
                        pd.DataFrame(X_tfidf.toarray(),
                                     columns = ['text_contains_'+ str(text) for text in tf_idf.get_feature_names()]),
                        pd.DataFrame(location_X_tfidf.toarray(),
                                     columns = ['location_contains_'+str(text) for text in tf_location.get_feature_names()])],axis = 1)
test_data = pd.concat([test,
                       pd.DataFrame(test_tfidf.toarray(),
                                    columns = ['text_contains_'+ str(text) for text in tf_idf.get_feature_names()]),
                       pd.DataFrame(location_test_tfidf.toarray(),
                                    columns = ['location_contains_'+str(text) for text in tf_location.get_feature_names()])],axis = 1)

In [21]:
def create_vec(dataframe):
    texts = dataframe['text'].tolist()
    vectors = []
    for doc in nlp.pipe(texts):
        vectors.append(list(doc.vector))
    df = pd.DataFrame(vectors,columns = ['vec_'+str(i) for i in range(300)])
    return df
vec_train = create_vec(train_data)
vec_test = create_vec(test_data)
train_data = pd.concat([train_data,vec_train],axis = 1)
test_data = pd.concat([test_data,vec_test],axis = 1)

train_data = train_data.drop(['keyword_one', 'keyword_two','location','text'],axis = 1)
test_data = test_data.drop(['keyword_one', 'keyword_two','location','text'],axis = 1)