In [1]:
import numpy as np
from wordcloud import WordCloud
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [2]:
path = "/kaggle/input/nlp-getting-started/"
train_data = pd.read_csv(path+"train.csv", index_col=0)

In [3]:
train_data.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


*
#### 2- preprocessing the data:

In [4]:
y_train = train_data["target"]
x_train = train_data.drop(["keyword","location","target"],axis = 1)


In [5]:
x_train.shape

(7613, 1)

In [6]:
nlp = spacy.load("en_core_web_lg")
def preprocess(text: str) -> str :
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not (token.is_stop and token.is_punct)]
    return ' '.join(tokens)

In [7]:
nlp.Defaults.stop_words.add("`,")
nlp.Defaults.stop_words.add("``")

In [8]:
x_train['processed_text'] = x_train.text.apply(lambda txt : preprocess(txt))

In [9]:
x_train.head()

Unnamed: 0_level_0,text,processed_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,our deed be the reason of this # earthquake ma...
4,Forest fire near La Ronge Sask. Canada,Forest fire near La Ronge Sask . Canada
5,All residents asked to 'shelter in place' are ...,all resident ask to ' shelter in place ' be be...
6,"13,000 people receive #wildfires evacuation or...","13,000 people receive # wildfire evacuation or..."
7,Just got sent this photo from Ruby #Alaska as ...,just got send this photo from Ruby # Alaska as...


In [10]:
cv = CountVectorizer()

X = cv.fit_transform(x_train.processed_text)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

y = y_train
# 1. Declare the model
clf = MultinomialNB()

# 2. Train the model
clf.fit(X, y)

# 3. Make predictions 
yhat = clf.predict(X)

# 4. score
print("Accuracy: ",accuracy_score(y, yhat))

Accuracy:  0.899119926441613


In [12]:
x_test = pd.read_csv(path+"test.csv", index_col=0)
x_test['processed_text'] = x_test.text.apply(lambda txt : preprocess(txt))
x_test.head()

Unnamed: 0_level_0,keyword,location,text,processed_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,Just happened a terrible car crash,just happen a terrible car crash
2,,,"Heard about #earthquake is different cities, s...","hear about # earthquake be different city , st..."
3,,,"there is a forest fire at spot pond, geese are...","there be a forest fire at spot pond , goose be..."
9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting . # Spokane # wildfire
11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kill 28 in China and Taiwan


In [13]:
x_test_ = cv.transform(x_test.processed_text)

In [14]:
y_test = clf.predict(x_test_)

In [15]:
y_test.shape

(3263,)

In [16]:
x_test_.shape

(3263, 19957)

In [17]:
submition = pd.DataFrame({'id':x_test.index,'target':y_test})

In [18]:
submition.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [19]:
submition.to_csv('submition.csv',index=False)