In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
# Load train.csv into a DataFrame and preview the first 8 rows
# (columns shown in the output: id, keyword, location, text, target).
dataset = pd.read_csv('train.csv')
dataset.head(8)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1


In [3]:
# Count how many rows carry each value of the `target` label column.
dataset.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [4]:
 dataset.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
# Replace every missing value with an empty string so the later string
# operations on `keyword` / `text` never see NaN.
dataset = dataset.fillna("")

In [6]:
from typing import List
import string

def tokenize(text: str) -> List[str]:
    """Split ``text`` into tokens by delegating to ``nltk.word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by NLTK for ``text``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens


def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so words separated only
    by punctuation are fused together (``"a,b"`` becomes ``"ab"``). Characters
    outside ``string.punctuation`` (letters, digits, whitespace, non-ASCII
    symbols) are kept unchanged.
    """
    # Translation table that maps each ASCII punctuation character to "delete".
    drop_table = str.maketrans("", "", string.punctuation)
    return text.translate(drop_table)

In [7]:
# Normalise the `text` column in one pass: lower-case, strip ASCII punctuation,
# then tokenize. The column is overwritten with token lists, so this cell
# cannot be re-run on its own output (lists have no `.lower()`).
dataset['text'] = dataset['text'].apply(
    lambda raw_text: tokenize(remove_punctuation(str(raw_text.lower())))
)

dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, residents, asked, to, shelter, in, place...",1
3,6,,,"[13000, people, receive, wildfires, evacuation...",1
4,7,,,"[just, got, sent, this, photo, from, ruby, ala...",1


In [8]:
def _split_keyword(keyword):
    """Split a '%20'-separated keyword string into a list of words.

    An empty string yields ``[]`` (plain ``"".split('%20')`` would give
    ``['']``). Values that are not strings -- e.g. a list produced by an
    earlier run of this cell -- are returned unchanged so the cell can be
    re-executed safely.
    """
    if not isinstance(keyword, str):
        return keyword
    return keyword.split('%20') if keyword else []


dataset['keyword'] = dataset['keyword'].apply(_split_keyword)

# errors='ignore' keeps the cell re-runnable once `id` has already been removed.
dataset = dataset.drop(columns=['id'], errors='ignore')

In [9]:
stopwords = nltk.corpus.stopwords.words("english")
# Set view of the list above: membership tests become a hash lookup instead of
# a linear scan of the whole stop-word list for every token.
_stopword_set = frozenset(stopwords)


def remove_stopwords(words: List[str]) -> List[str]:
    """Return ``words`` without the entries found in the English stop-word list.

    Matching is exact and case-sensitive. The order of the remaining words,
    including duplicates, is preserved.
    """
    return [word for word in words if word not in _stopword_set]


dataset['text'] = dataset['text'].apply(remove_stopwords)

dataset.head()

Unnamed: 0,keyword,location,text,target
0,[],,"[deeds, reason, earthquake, may, allah, forgiv...",1
1,[],,"[forest, fire, near, la, ronge, sask, canada]",1
2,[],,"[residents, asked, shelter, place, notified, o...",1
3,[],,"[13000, people, receive, wildfires, evacuation...",1
4,[],,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1


In [10]:
stemmer = nltk.stem.PorterStemmer()


def do_stemming(words: List[str]) -> List[str]:
    """Return the stem of every word in ``words`` (via ``stemmer``), keeping order."""
    return list(map(stemmer.stem, words))


# Overwrite the token lists with their stemmed counterparts.
dataset['text'] = dataset['text'].apply(do_stemming)

dataset.head()

Unnamed: 0,keyword,location,text,target
0,[],,"[deed, reason, earthquak, may, allah, forgiv, us]",1
1,[],,"[forest, fire, near, la, rong, sask, canada]",1
2,[],,"[resid, ask, shelter, place, notifi, offic, ev...",1
3,[],,"[13000, peopl, receiv, wildfir, evacu, order, ...",1
4,[],,"[got, sent, photo, rubi, alaska, smoke, wildfi...",1


In [11]:
lemmatizer = nltk.stem.WordNetLemmatizer()


def do_lemmatization(words: List[str]) -> List[str]:
    """Return the lemma of every word in ``words`` (via ``lemmatizer``), keeping order."""
    lemmas = []
    for word in words:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


# Lemmatization is defined but not applied: the calls below are intentionally
# left commented out, so `text` keeps the stemmed tokens from the previous cell.
# dataset['text'] = dataset['text'].apply(lambda x: do_lemmatization(x))

# dataset.head()

In [12]:
# Re-join each token list into a single space-separated string; this is the
# form handed to the vectorizer further down.
dataset['text_str'] = dataset['text'].apply(lambda tokens: " ".join(map(str, tokens)))
dataset['text_str'].head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3    13000 peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text_str, dtype: object

In [13]:
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words features: fit the vocabulary on the training strings and
# count token occurrences per row.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset['text_str'])
# toarray() already returns a freshly allocated dense ndarray; the previous
# np.array(...) wrapper made a second full copy of the whole matrix.
X_train = X.toarray()
X_train.shape

(7613, 19325)

In [14]:
# Training labels: the `target` column, row-aligned with X_train.
y_train = dataset['target']
y_train.shape

(7613,)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Logistic-regression baseline fitted on the full training matrix
# (fit() returns the estimator itself, so it can be chained).
lr = LogisticRegression(random_state=25).fit(X_train, y_train)

pred_lr = lr.predict(X_train)

# NOTE(review): predictions are made on the same rows the model was fitted on,
# so `accuracy` is a training-set score, not an estimate of performance on
# unseen data.
accuracy = accuracy_score(y_true=y_train, y_pred=pred_lr)
accuracy

0.9621699724156049

In [16]:
from sklearn.ensemble import RandomForestClassifier


# Random-forest model fitted on the full training matrix
# (fit() returns the estimator itself, so it can be chained).
rf = RandomForestClassifier(random_state=24).fit(X_train, y_train)

pred_rf = rf.predict(X_train)

# NOTE(review): predictions are made on the same rows the model was fitted on,
# so `accuracy` is a training-set score, not an estimate of performance on
# unseen data.
accuracy = accuracy_score(y_true=y_train, y_pred=pred_rf)
accuracy

0.9963220806515172

In [17]:
# Load test.csv and keep its ids aside for the submission files.
test = pd.read_csv('test.csv')
id_test = test['id']

# Fill missing values with '' and keep only the `text` column.
test = test.fillna('').drop(columns=['id', 'keyword', 'location'])

# Same steps, in the same order, as applied to the training text:
# lower-case -> strip punctuation -> tokenize -> drop stop words -> stem.
test['text'] = (
    test['text']
    .apply(lambda raw_text: tokenize(remove_punctuation(str(raw_text.lower()))))
    .apply(remove_stopwords)
    .apply(do_stemming)
)
test['text_str'] = test['text'].apply(lambda tokens: " ".join(map(str, tokens)))

test.head()

Unnamed: 0,text,text_str
0,"[happen, terribl, car, crash]",happen terribl car crash
1,"[heard, earthquak, differ, citi, stay, safe, e...",heard earthquak differ citi stay safe everyon
2,"[forest, fire, spot, pond, gees, flee, across,...",forest fire spot pond gees flee across street ...
3,"[apocalyps, light, spokan, wildfir]",apocalyps light spokan wildfir
4,"[typhoon, soudelor, kill, 28, china, taiwan]",typhoon soudelor kill 28 china taiwan


In [23]:
# Encode the test strings with the vocabulary already fitted on the training
# text (transform only -- no re-fitting). toarray() already returns a fresh
# dense ndarray, so the extra np.array(...) copy has been dropped.
X_test = vectorizer.transform(test['text_str']).toarray()

y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [24]:
# Pair every test id with its logistic-regression prediction and write the
# result to submission_lr.csv without the DataFrame index.
result_lr = pd.DataFrame({'id': id_test, 'target': y_pred_lr})
result_lr.to_csv("submission_lr.csv", index=False)

In [25]:
# Pair every test id with its random-forest prediction and write the
# result to submission_rf.csv without the DataFrame index.
result_rf = pd.DataFrame({'id': id_test, 'target': y_pred_rf})
result_rf.to_csv("submission_rf.csv", index=False)