In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import string
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [13]:
test_data = pd.read_csv("test_disaster_tweet.csv")
train = pd.read_csv("train_disaster_tweet.csv")

# Data Understanding

In [14]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [15]:
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [16]:
train.shape

(7613, 5)

In [17]:
test_data.shape

(3263, 4)

In [18]:
train.sample(5)

Unnamed: 0,id,keyword,location,text,target
658,952,blaze,,What Dems do. Blaze covered months ago.Chicago...,0
4809,6845,loud%20bang,,need to work in an office I can bang all my fa...,0
1026,1488,body%20bags,Charlotte NC,The Body Bags has a show on 08/07/2015 at 07:3...,0
679,981,blazing,,REAL ViBEZ RADIO - BLAZING THE BEST VIBEZ!!! h...,0
2529,3633,desolation,??????,The Hobbit: The Desolation of Smaug - Ed Sheer...,0


In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [20]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [25]:
train.isnull().mean()*100

id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64

In [22]:
test_data.isnull().mean()*100

id           0.000000
keyword      0.796813
location    33.864542
text         0.000000
dtype: float64

In [23]:
train.duplicated().sum()

0

In [24]:
test_data.duplicated().sum()

0

# Data Cleaning / Preprocessing

In [29]:
train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [30]:
test.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [28]:
train = train.drop(columns=["id", "keyword", "location"])
test = test_data.drop(columns=["id", "keyword", "location"])

In [32]:
train["text"][0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [33]:
train["text"][1]

'Forest fire near La Ronge Sask. Canada'

In [34]:
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [35]:
#REMOVING LINKS, TOKENISATION AND REMOVING PUNCTUATIONS

def preprocessing1(text):
    te = re.sub(r'http\S+', "", text)
    tokens = word_tokenize(te)
    
    punctuations = string.punctuation
    txt = []
    for i in tokens:
        if i not in punctuations:
            txt.append(i)
    return txt

In [36]:
train["text"] = train["text"].apply(preprocessing1)

In [37]:
train["text"][0]

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 'earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

In [38]:
#REMOVING STOPWORDS & CONVERTING TO LOWERCASE

def preprocessing2(text):
    stopword = stopwords.words("english")
    ntxt = []
    for j in text:
        if j not in stopword:
            ntxt.append(j.lower())
    return ntxt

In [39]:
train["text"] = train["text"].apply(preprocessing2)

In [40]:
train["text"][0]

['our', 'deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']

In [41]:
#STEMMING

def preprocessing3(text):
    stem = []
    Stemmer = PorterStemmer()
    for i in text:
        stem.append(Stemmer.stem(i))
    return " ".join(stem)

In [42]:
train["text"] = train["text"].apply(preprocessing3)

In [43]:
train["text"][0]

'our deed reason earthquak may allah forgiv us'

In [44]:
test["text"] = test["text"].apply(preprocessing1)
test["text"] = test["text"].apply(preprocessing2)
test["text"] = test["text"].apply(preprocessing3)

In [45]:
test["text"][0]

'just happen terribl car crash'

In [46]:
train.head(10)

Unnamed: 0,text,target
0,our deed reason earthquak may allah forgiv us,1
1,forest fire near la rong sask canada,1
2,all resid ask 'shelter place notifi offic no e...,1
3,"13,000 peopl receiv wildfir evacu order califo...",1
4,just got sent photo rubi alaska smoke wildfir ...,1
5,rockyfir updat california hwi 20 close direct ...,1
6,flood disast heavi rain caus flash flood stree...,1
7,i 'm top hill i see fire wood ...,1
8,there 's emerg evacu happen build across street,1
9,i 'm afraid tornado come area ...,1


In [47]:
test.head(10)

Unnamed: 0,text
0,just happen terribl car crash
1,heard earthquak differ citi stay safe everyon
2,forest fire spot pond gees flee across street ...
3,apocalyps light spokan wildfir
4,typhoon soudelor kill 28 china taiwan
5,we 're shake ... it 's earthquak
6,they 'd probabl still show life arsen yesterda...
7,hey how
8,what nice hat
9,fuck


In [48]:
train.isna().sum()

text      0
target    0
dtype: int64

In [49]:
test.isna().sum()

text    0
dtype: int64

In [50]:
train.shape

(7613, 2)

In [51]:
test.shape

(3263, 1)

# MODEL BUILDING

In [52]:
train.head()

Unnamed: 0,text,target
0,our deed reason earthquak may allah forgiv us,1
1,forest fire near la rong sask canada,1
2,all resid ask 'shelter place notifi offic no e...,1
3,"13,000 peopl receiv wildfir evacu order califo...",1
4,just got sent photo rubi alaska smoke wildfir ...,1


In [58]:
x = train.iloc[:,0]
y = train.iloc[:,-1]

In [59]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(x)

In [60]:
test_transformed = tfidf.transform(test["text"])

In [61]:
x.shape

(7613, 14245)

In [62]:
test_transformed.shape

(3263, 14245)

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=None)

**Logistic Regression**

In [64]:
logr = LogisticRegression()
model = logr.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

81.68089297439265

In [65]:
score = []
for i in range(1000):
    x = train.iloc[:,0]
    y = train.iloc[:,-1]
    x = tfidf.fit_transform(x)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    logr = LogisticRegression()
    logr.fit(x_train, y_train)
    pred = logr.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  459
83.25673013788575


In [66]:
logr = LogisticRegression()
x = train.iloc[:,0]
y = train.iloc[:,-1]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model1 = logr.fit(x_train,y_train)
pred = model1.predict(x_test)
accuracy_score(y_test, pred)*100

83.25673013788575

In [67]:
# Training Data Accuracy

pred = model1.predict(x_train)
accuracy_score(y_train, pred)*100     #Generalized

88.48932676518884

In [68]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod1 = cross_val_score(LogisticRegression(), x_train, y_train, cv=10)
print(mod1)
print(np.round(np.mean(mod1),2)*100)

[0.80952381 0.79967159 0.77996716 0.79310345 0.80788177 0.80295567
 0.81444992 0.77996716 0.77011494 0.79310345]
80.0


**Decision Tree Classifier**

In [69]:
dtc = DecisionTreeClassifier(random_state=4)
model = dtc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

75.31188443860802

In [70]:
score = []
for i in range(100):
    x = train.iloc[:,0]
    y = train.iloc[:,-1]
    x = tfidf.fit_transform(x)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    dtc = DecisionTreeClassifier(random_state=4)
    dtc.fit(x_train, y_train)
    pred = dtc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  84
76.75640183847669


In [71]:
dtc = DecisionTreeClassifier(random_state=4)
x = train.iloc[:,0]
y = train.iloc[:,-1]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model2 = dtc.fit(x_train,y_train)
pred = model2.predict(x_test)
accuracy_score(y_test, pred)*100

76.75640183847669

In [72]:
# Training Data Accuracy

pred = model2.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

98.85057471264368

In [73]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod2 = cross_val_score(DecisionTreeClassifier(random_state=4), x_train, y_train, cv=5)
print(mod2)
print(np.round(np.mean(mod2),2)*100)

[0.72660099 0.75287356 0.75779967 0.74220033 0.71428571]
74.0


**RandomForest Classifier**

In [74]:
rfc = RandomForestClassifier(random_state=4)
model = rfc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

80.4333552199606

In [None]:
score = []
for i in range(50):
    x = train.iloc[:,0]
    y = train.iloc[:,-1]
    x = tfidf.fit_transform(x)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    rfc = RandomForestClassifier(random_state=4)
    rfc.fit(x_train, y_train)
    pred = rfc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

In [None]:
rfc = RandomForestClassifier(random_state=4)
x = train.iloc[:,0]
y = train.iloc[:,-1]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model3 = rfc.fit(x_train,y_train)
pred = model3.predict(x_test)
accuracy_score(y_test, pred)*100

In [None]:
# Training Data Accuracy

pred = model3.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

In [None]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod3 = cross_val_score(RandomForestClassifier(random_state=4), x_train, y_train, cv=10)
print(mod3)
print(np.round(np.mean(mod3),2)*100)

**Support Vector Classifier**

In [None]:
svc = SVC(kernel="linear",tol=0.1)
model = svc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

In [None]:
score = []
for i in range(50):
    x = train.iloc[:,0]
    y = train.iloc[:,-1]
    x = tfidf.fit_transform(x)    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    svc = SVC(kernel="linear",tol=0.1)
    svc.fit(x_train, y_train)
    pred = svc.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

In [None]:
svc = SVC(kernel="linear",tol=0.1)
x = train.iloc[:,0]
y = train.iloc[:,-1]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model4 = svc.fit(x_train,y_train)
pred = model4.predict(x_test)
accuracy_score(y_test, pred)*100

In [None]:
# Training Data Accuracy

pred = model4.predict(x_train)
accuracy_score(y_train, pred)*100     #Generalized

In [None]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod4 = cross_val_score(SVC(kernel="linear",tol=0.1), x_train, y_train, cv=4)
print(mod4)
print(np.round(np.mean(mod4),2)*100)

In [None]:
test_pred = model1.predict(test_transformed)
test_pred

In [None]:
submission = pd.DataFrame({"id": test_data["id"],"target": test_pred})

In [None]:
submission.head()

In [None]:
submission.to_csv('disaster_tweet_submission.csv', index=False)

# MODEL & TEXT VECTORIZATION

In [176]:
x = train["text"]
y = train["target"]

In [177]:
model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    #("lsvc", LinearSVC()),
    #("rf", RandomForestClassifier()),
    #("dt", DecisionTreeClassifier()),
    ("lor", LogisticRegression()),
    
])

In [178]:
x

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask 'shelter place notifi offic evacu sh...
3       13,000 peopl receiv wildfir evacu order califo...
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608       two giant crane hold bridg collaps nearbi home
7609    aria_ahrari thetawniest control wild fire cali...
7610                   m1.94 01:04 utc 5km volcano hawaii
7611    polic investig e-bik collid car littl portug e...
7612    latest home raze northern california wildfir a...
Name: text, Length: 7613, dtype: object

In [179]:
score = []
for i in range(60):
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    score.append(accuracy_score(y_test,pred))
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

i=  57
0.8319107025607354


In [180]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=57)
model.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test,pred)*100

83.19107025607354

In [181]:
test_pred = model.predict(test["text"])

In [182]:
submission = pd.DataFrame({"id": test["id"],"target": test_pred})
submission.to_csv('disaster_tweet_submission.csv', index=False)

In [183]:
""""LSVC
i=  48
0.8056467498358503
LOR
i=  57
0.8319107025607354
RF
i=  48
0.8089297439264609
DT
i=  16
0.7570584372948129""""

SyntaxError: EOL while scanning string literal (<ipython-input-183-d97e451f986d>, line 12)