In [6]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from NLTK; text_preprocessing reads this module-level
# name to filter tokens.
stop_words = stopwords.words('english')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## Read in Data

In [7]:
# Load the three CSV files from the local data/ directory (paths are relative
# to the notebook's working directory).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

## EDA

In [8]:
# Class balance of the target column: absolute counts side by side with the
# normalized share (a fraction in [0, 1]) of each class.
pd.DataFrame({
    'count': train['target'].value_counts(),
    'percentage': train['target'].value_counts(normalize=True),
})

Unnamed: 0,count,percentage
0,4342,0.57034
1,3271,0.42966


In [9]:
# Bar chart of how many rows fall into each target class.
train["target"].value_counts().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x128657e10>

Give keywords distinct numbers

In [10]:
def map_keywords(series):
    """Assign an integer code to every distinct value in ``series``.

    Codes start at 0 and follow the order in which ``series.unique()``
    returns the values.

    Parameters
    ----------
    series : object exposing ``.unique()`` (called here with a DataFrame column)
        The values to encode.

    Returns
    -------
    dict
        Maps each distinct value to its integer code.
    """
    return {value: code for code, value in enumerate(series.unique())}

# Build the keyword -> code mapping from the training set only, then apply the
# same mapping to both frames so a given keyword gets the same code in each.
train_keyword_map = map_keywords(train.keyword)

train['keyword_num'] = train['keyword'].map(train_keyword_map)
# Series.map with a dict yields NaN for values missing from the dict, so any
# test keyword that never occurs in train ends up as NaN here.
test['keyword_num'] = test['keyword'].map(train_keyword_map)

In [11]:
def text_preprocessing(data):
    """Turn a Series of raw strings into lists of lemmatized tokens.

    Every entry goes through the following steps, in this order:

    1. strip surrounding whitespace and lowercase,
    2. delete runs of digits,
    3. delete every character in ``string.punctuation``,
    4. tokenize with NLTK's ``word_tokenize``,
    5. drop tokens that appear in the module-level ``stop_words``,
    6. lemmatize each remaining token as a verb (``pos='v'``).

    Parameters
    ----------
    data : pandas.Series of str
        One document per row. Each entry must be a string, because
        ``.strip()`` is called on it directly.

    Returns
    -------
    pandas.Series
        Result of ``data.apply``; each value is a list of token strings.
    """
    # Loop-invariant helpers are built once here instead of once per row.
    digit_run = re.compile(r'\d+')
    drop_punctuation = str.maketrans('', '', string.punctuation)
    # A set gives constant-time membership checks; stop_words itself is a list.
    stop_set = set(stop_words)
    lemmatizer = WordNetLemmatizer()

    def clean(text):
        """Apply the full cleaning pipeline to a single string."""
        text = text.strip().lower()
        text = digit_run.sub('', text)
        text = text.translate(drop_punctuation)
        # Stop words are filtered before lemmatization, so the comparison is
        # made against the surface form of each token.
        return [
            lemmatizer.lemmatize(token, pos='v')
            for token in word_tokenize(text)
            if token not in stop_set
        ]

    # Single pass over the Series rather than one pass per cleaning step.
    return data.apply(clean)

In [12]:
# Store the cleaned token lists next to the raw text in both frames.
train['pro_text'] = text_preprocessing(train.text)
test['pro_text'] = text_preprocessing(test.text)

In [13]:
# Spot-check the newly added keyword_num and pro_text columns.
train.head()

Unnamed: 0,id,keyword,location,text,target,keyword_num,pro_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,0,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,0,"[residents, ask, shelter, place, notify, offic..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0,"[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0,"[get, send, photo, ruby, alaska, smoke, wildfi..."


## TFIDF Vectorizer
Matrix of token counts with TF-IDF transformation

In [22]:
vectorizer = TfidfVectorizer()
# Fit the vocabulary and IDF weights on the training text only.
# NOTE(review): the raw `text` column is vectorized here; the `pro_text` column
# produced by text_preprocessing is not used -- confirm which one is intended.
vector = vectorizer.fit_transform(train["text"])
# toarray() returns a plain ndarray, whereas todense() returns np.matrix.
vector = vector.toarray()
# Append the numeric keyword code as one extra feature column. The raw
# `keyword` column holds labels and missing values, which would turn the whole
# feature matrix into object dtype.
vector = np.concatenate(
    (vector, train["keyword_num"].to_numpy().reshape(-1, 1)), axis=1
)
print(vector.shape)

# Reuse the vectorizer fitted on train (transform, not fit_transform) so the
# test columns line up with the training columns.
vector_test = vectorizer.transform(test["text"])
vector_test = vector_test.toarray()
# NOTE(review): keyword_num is NaN for any test keyword that never occurs in
# train -- verify there are none before fitting a model on this matrix.
vector_test = np.concatenate(
    (vector_test, test["keyword_num"].to_numpy().reshape(-1, 1)), axis=1
)
print(vector_test.shape)

(7613, 21638)
(3263, 21638)
