In [1]:
import numpy as np
from wordcloud import WordCloud
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [2]:
path = "/kaggle/input/nlp-getting-started/"
train_data = pd.read_csv(path+"train.csv", index_col=0)

In [3]:
train_data.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


#### 2- preprocessing the data:

In [4]:
y_train = train_data["target"]
x_train = train_data.drop(["keyword","location","target"],axis = 1)


In [5]:
x_train.shape

(7613, 1)

In [6]:
def preprocess(text: str) -> str :
    return ' '.join([token.lower() for token in TweetTokenizer().tokenize(text)])

preprocess(x_train.loc[1].text)

'our deeds are the reason of this #earthquake may allah forgive us all'

In [7]:
x_train['processed_text'] = x_train.text.apply(lambda txt : preprocess(txt))

In [8]:
x_train.head()

Unnamed: 0_level_0,text,processed_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this #earthquake m...
4,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask . canada
5,All residents asked to 'shelter in place' are ...,all residents asked to ' shelter in place ' ar...
6,"13,000 people receive #wildfires evacuation or...","13,000 people receive #wildfires evacuation or..."
7,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby #alaska as ...


In [9]:
cv = CountVectorizer()

X = cv.fit_transform(x_train.processed_text)

In [10]:
y = y_train
# 1. Declare the model
clf = MultinomialNB()

# 2. Train the model
clf.fit(X, y)

# 3. Make predictions 
yhat = clf.predict(X)

# 4. score
print("F1 score: ",f1_score(y, yhat))

F1 score:  0.8822006472491908


In [11]:
x_test = pd.read_csv(path+"test.csv", index_col=0)
x_test['processed_text'] = x_test.text.apply(lambda txt : preprocess(txt))
x_test.head()

Unnamed: 0_level_0,keyword,location,text,processed_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,Just happened a terrible car crash,just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s...","heard about #earthquake is different cities , ..."
3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond , geese ar..."
9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting . #spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 in china and taiwan


In [12]:
x_test_ = cv.transform(x_test.processed_text)

In [13]:
y_test = clf.predict(x_test_)

In [14]:
y_test.shape

(3263,)

In [15]:
x_test_.shape

(3263, 21615)

In [16]:
submition = pd.DataFrame({'id':x_test.index,'target':y_test})

In [17]:
submition.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [18]:
submition.to_csv('submition.csv',index=False)