## Importing the libraries 


---



In [990]:
import pandas as pd 
import string 
import re
import nltk 
import xgboost as xgb
from xgboost import XGBClassifier
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import forest 
from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import statistics as stats
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [991]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [992]:
df.shape

(7613, 5)

In [993]:
duplicate = df.duplicated().sum()
print(duplicate)

0


In [994]:
missing_columns = (df.isna().sum()/len(df))*100
print(missing_columns)

id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64


## Let's clean the messy data

In [995]:
keyword = df['keyword']
location = df['location']
new_df = df.drop(['keyword','location','id'],axis = 1)
new_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# Removing  punctuation


---



In [996]:
def rem_punctate(txt):
  txt_nopunt="".join([ c for c in txt if c not in string.punctuation])
  return txt_nopunt

new_df['text_clean']= new_df['text'].apply(lambda x: rem_punctate(x))

new_df.head()



Unnamed: 0,text,target,text_clean
0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada
2,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


there are links, maybe useful for validation of the tweets are fake or not. But still let's drop it 

In [997]:
new_df['text_clean'].astype(str).str.contains("http","https://")

0       False
1       False
2       False
3       False
4       False
        ...  
7608     True
7609    False
7610     True
7611    False
7612     True
Name: text_clean, Length: 7613, dtype: bool

Let's drop even the emojis


In [998]:

new_df["text_clean"] = new_df['text_clean'].str.replace('http\S+|www.\S+', '', case=False)
new_df['text_clean'] = new_df['text_clean'].str.replace('[^\w\s#@/:%.,_-]', '', flags=re.UNICODE)
# new_df.head()

## Tokenization 


---



In [999]:
def tokenize(txt):
  tokens=re.split("\W+",txt)# "W+" is represented for non words and "+" represents one or more words
  return tokens

In [1000]:
new_df["text_clean_tokenized"]=new_df["text_clean"].apply(lambda x:tokenize(x))
new_df.head()

Unnamed: 0,text,target,text_clean,text_clean_tokenized
0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, are, the, Reason, of, this, earth..."
1,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]"
2,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place..."
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation..."
4,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, this, photo, from, Ruby, Ala..."


## Remove Stop Words.


---


what is stop words?

words like is,are,the.. let's drop all those characters words.


In [1035]:
stop_words=nltk.corpus.stopwords.words('english')

In [1002]:
def rem_stopwords(txt_tokenized):
  txt_clean=[word for word in txt_tokenized if word not in stop_words]
  return txt_clean

In [1003]:
new_df["text_stopwords_clean"]=new_df["text_clean_tokenized"].apply(lambda x: rem_stopwords(x))
new_df.head()

Unnamed: 0,text,target,text_clean,text_clean_tokenized,text_stopwords_clean
0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, are, the, Reason, of, this, earth...","[Our, Deeds, Reason, earthquake, May, ALLAH, F..."
1,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]"
2,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[All, residents, asked, shelter, place, notifi..."
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation..."
4,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, this, photo, from, Ruby, Ala...","[Just, got, sent, photo, Ruby, Alaska, smoke, ..."


## Stemming 


---



In [1004]:
ps=PorterStemmer()

In [1005]:
def stemming(tokenized_text):
  text=[ps.stem(word) for word in tokenized_text]
  return text

In [1006]:
new_df["text_stemmed_data"]=new_df["text_stopwords_clean"].apply(lambda x: stemming(x))
new_df.head()

Unnamed: 0,text,target,text_clean,text_clean_tokenized,text_stopwords_clean,text_stemmed_data
0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, are, the, Reason, of, this, earth...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...","[our, deed, reason, earthquak, may, allah, for..."
1,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, La, rong, sask, canada]"
2,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[All, residents, asked, shelter, place, notifi...","[all, resid, ask, shelter, place, notifi, offi..."
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation...","[13000, peopl, receiv, wildfir, evacu, order, ..."
4,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, this, photo, from, Ruby, Ala...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...","[just, got, sent, photo, rubi, alaska, smoke, ..."


## Lemmatization


---



In [1007]:
wn=nltk.WordNetLemmatizer()

In [1008]:
def lemmatization(token_txt):
  text=[wn.lemmatize(word) for word in token_txt]
  return text

In [1009]:
new_df['text_tokenized']=new_df['text_stopwords_clean'].apply(lambda x:lemmatization(x))
new_df.head()

Unnamed: 0,text,target,text_clean,text_clean_tokenized,text_stopwords_clean,text_stemmed_data,text_tokenized
0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[Our, Deeds, are, the, Reason, of, this, earth...","[Our, Deeds, Reason, earthquake, May, ALLAH, F...","[our, deed, reason, earthquak, may, allah, for...","[Our, Deeds, Reason, earthquake, May, ALLAH, F..."
1,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, La, rong, sask, canada]","[Forest, fire, near, La, Ronge, Sask, Canada]"
2,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[All, residents, asked, to, shelter, in, place...","[All, residents, asked, shelter, place, notifi...","[all, resid, ask, shelter, place, notifi, offi...","[All, resident, asked, shelter, place, notifie..."
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation...","[13000, peopl, receiv, wildfir, evacu, order, ...","[13000, people, receive, wildfire, evacuation,..."
4,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[Just, got, sent, this, photo, from, Ruby, Ala...","[Just, got, sent, photo, Ruby, Alaska, smoke, ...","[just, got, sent, photo, rubi, alaska, smoke, ...","[Just, got, sent, photo, Ruby, Alaska, smoke, ..."


## Count Vectorizer


---



In [1010]:
cv=CountVectorizer(analyzer=lemmatization)

In [1011]:
X = cv.fit_transform(new_df["text_tokenized"])
print(X.shape)

(7613, 21166)


In [1012]:
data_sample=new_df[0:100]
cv1=CountVectorizer(analyzer=lemmatization)

In [1013]:
X=cv1.fit_transform(data_sample["text_tokenized"])
print(X.shape)

(100, 630)


In [1014]:
df=pd.DataFrame(X.toarray(), columns=cv1.get_feature_names())
df.head()

Unnamed: 0,Unnamed: 1,080615,110358,115,118,13000,150,16,17TH,18,19,2,20,2013,23,2781,293,2k13,3,30,31,4,40,5,752,80,86,862015209,8m,9,A,ABLAZE,ACCIDENT,AFRICANBAZE,ALLAH,AM,ANOTHER,ARE,AT,AV,...,tornado,tracklist,traffic,training,trampling,traveling,treatment,truck,try,turned,u,upon,use,used,usual,vandalized,vehicle,video,visiting,wait,wanted,wave,wayI,wear,week,weird,wife,wildfire,wished,wmv,wonderful,wood,work,would,year,youre,youve,Û,ÛÒ,ÛÓ
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


## TFIDF-Vectorizer


---




In [1015]:
tf = TfidfVectorizer(analyzer = lemmatization,ngram_range=(2, 3))

In [1016]:
data_sample=new_df[0:10]
X=tf.fit_transform(data_sample["text_tokenized"])
print(X.shape)

(10, 77)


In [1017]:
df=pd.DataFrame(X.toarray(), columns=tf.get_feature_names())

In [1018]:
df.head()

Unnamed: 0,Unnamed: 1,13000,20,ALLAH,Alaska,All,CAfire,California,Canada,Colorado,County,Deeds,Forest,Forgive,Heavy,Hwy,I,Im,Just,La,Lake,Manitou,May,No,Our,Reason,RockyFire,Ronge,Ruby,Sask,Springs,Theres,Update,across,afraid,area,asked,building,cause,closed,coming,direction,disaster,due,earthquake,emergency,evacuation,expected,fire,flash,flood,flooding,got,happening,hill,near,notified,officer,order,people,photo,place,pours,rain,receive,resident,school,see,sent,shelter,smoke,street,top,tornado,u,wildfire,wood
0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.353553,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29053,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18435,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247873,0.247873,0.210715,0.0,0.0,0.495746,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.495746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.339379,0.399227,0.0,0.0,0.0,0.0,0.0,0.339379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339379,0.399227,0.0,0.0,0.0,0.0,0.399227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296917,0.0
4,0.265191,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.311955,0.0,0.0,0.0,0.311955,0.0,0.311955,0.0,0.311955,0.0,0.0,0.0,0.0,0.23201,0.0


## Hash Vectorizer


---



In [1019]:
hv = HashingVectorizer(analyzer = lemmatization, n_features=(3**3))

In [1020]:
X = hv.fit_transform(new_df["text_tokenized"])
print(X.shape)

(7613, 27)


In [1021]:
data_sample=new_df[0:100]
hv1=HashingVectorizer(analyzer=lemmatization)

In [1022]:
X=hv1.fit_transform(new_df["text_tokenized"])
print(X.shape)

(7613, 1048576)


In [1023]:
# df=pd.DataFrame(X.toarray(), columns=tf.get_feature_names())

In [1024]:
df.head()

Unnamed: 0,Unnamed: 1,13000,20,ALLAH,Alaska,All,CAfire,California,Canada,Colorado,County,Deeds,Forest,Forgive,Heavy,Hwy,I,Im,Just,La,Lake,Manitou,May,No,Our,Reason,RockyFire,Ronge,Ruby,Sask,Springs,Theres,Update,across,afraid,area,asked,building,cause,closed,coming,direction,disaster,due,earthquake,emergency,evacuation,expected,fire,flash,flood,flooding,got,happening,hill,near,notified,officer,order,people,photo,place,pours,rain,receive,resident,school,see,sent,shelter,smoke,street,top,tornado,u,wildfire,wood
0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.353553,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29053,0.0,0.0,0.0,0.0,0.0,0.0,0.390639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18435,0.247873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247873,0.247873,0.210715,0.0,0.0,0.495746,0.0,0.0,0.0,0.247873,0.0,0.0,0.0,0.495746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.339379,0.399227,0.0,0.0,0.0,0.0,0.0,0.339379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339379,0.399227,0.0,0.0,0.0,0.0,0.399227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296917,0.0
4,0.265191,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311955,0.0,0.311955,0.0,0.0,0.0,0.311955,0.0,0.311955,0.0,0.311955,0.0,0.0,0.0,0.0,0.23201,0.0


## Let's start Model. 


---



In [1025]:
x = new_df["text_tokenized"]
y = new_df["target"]

In [1026]:
x_train,x_test,y_train,y_test = train_test_split = train_test_split(x,y, test_size=0.2, random_state=0)
print(x_train.shape)
print(x_test.shape)

(6090,)
(1523,)


In [1027]:
x_train = tf.fit_transform(x_train)
x_test = tf.transform(x_test)

## Cross-Validation Scores


---



In [1028]:
xg = xgb.XGBClassifier()
Rf =  forest.RandomForestClassifier()
Dtree = tree.DecisionTreeClassifier()
lr = linear_model.LogisticRegression()
svm = svm.SVC()

xgb_score = cross_val_score(xg,x_train,y_train,cv=5,n_jobs=5,fit_params=None)
ran_score = cross_val_score(Rf,x_train,y_train,cv=5)
dtree_score = cross_val_score(Dtree,x_train,y_train,cv=5)
log_score = cross_val_score(lr,x_train,y_train,cv=5)
svm_score = cross_val_score(svm,x_train,y_train,cv=5)
df_score = pd.DataFrame({"model":["XGBoost","RandomForestClassifier","DecisionTreeClassifier","LogisticRegression","Support vector machine"],"score":[stats.mean(xgb_score),stats.mean(ran_score),stats.mean(dtree_score),stats.mean(log_score),stats.mean(svm_score)]})
print(df_score)

                    model     score
0                 XGBoost  0.710345
1  RandomForestClassifier  0.766174
2  DecisionTreeClassifier  0.708867
3      LogisticRegression  0.786864
4  Support vector machine  0.782102


In [1029]:
# model = XGBClassifier(eta=0.3,min_child_weight=1,max_depth=6,gamma=0,subsample=0.5,colsample_bytree=0.5,colsample_bylevel=1)
# model  = model.fit(x_train,y_train)
# score = round(model.score(x_train, y_train) * 100, 2)
# print(score)

## Logistic Regression Model


---



In [1030]:
model = LogisticRegression()
model  = model.fit(x_train,y_train)
score = round(model.score(x_train, y_train) * 100, 2)
print(score)
test_score = round(model.score(x_test,y_test)*100,2)
print(test_score)

89.93
79.65


## Prediction


---



In [1031]:
test = pd.read_csv("test.csv")

tweet_id =  test["id"]
text = test["text"]
print(text.shape)

(3263,)


In [1032]:
text = tf.transform(text)

In [1033]:
pred = model.predict(text)
new_df = {"id":tweet_id,"target":pred}
new_df = pd.DataFrame(new_df)
print(new_df)

         id  target
0         0       0
1         2       0
2         3       0
3         9       0
4        11       0
...     ...     ...
3258  10861       1
3259  10865       0
3260  10868       0
3261  10874       0
3262  10875       0

[3263 rows x 2 columns]


In [1034]:
disaster_pred = new_df.to_csv("prediction.csv",index = False)
print(disaster_pred)

None
