In [None]:


import numpy as np 
import pandas as pd 


In [None]:

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [None]:
# Fetch the NLTK resources this notebook needs (no-ops when already present).
# "punkt": tokenizer models used by word_tokenize.
nltk.download("punkt")
# Recent NLTK releases load the tokenizer data from "punkt_tab" instead of "punkt";
# fetching both keeps the notebook runnable on old and new NLTK versions.
nltk.download("punkt_tab")
# "stopwords": the stop-word corpus used to filter tokens during preprocessing.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the training and test datasets, then display the first 5 training rows.
# DATA_DIR is the single place to edit when the CSV files live somewhere else
# (the default is the directory this notebook was originally run against).
DATA_DIR = "/content/sample_data"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
train_df.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Missing-value count for every column of the raw training frame.
train_df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [None]:
# Work on a copy of the training data without the keyword/location columns
# (both contain missing values), then preview the first five rows.
train = train_df.drop(["keyword", "location"], axis=1)
train.head()



Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Inspect the dtype of each remaining training column.
train.dtypes

id         int64
text      object
target     int64
dtype: object

In [None]:
# Missing-value count for every column of the test frame.
test_df.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [None]:
# Inspect the dtype of each test column (the test frame has no `target` column).
test_df.dtypes

id           int64
keyword     object
location    object
text        object
dtype: object

In [None]:
# Class balance: number of training rows per target label (0 / 1).
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Print up to five example tweets labelled 1. The positional slice 1:6 skips the
# first matching row and takes the next five.
disaster_mask = train['target'] == 1
disaster_samples = train.loc[disaster_mask, 'text'].iloc[1:6].values
print(f"natural disaster tweets  : {disaster_samples}")

natural disaster tweets  : ['Forest fire near La Ronge Sask. Canada'
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
 '13,000 people receive #wildfires evacuation orders in California '
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '
 '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires']


In [None]:
# Print up to five example tweets labelled 0. The positional slice 1:6 skips the
# first matching row and takes the next five.
non_disaster_mask = train['target'] == 0
non_disaster_samples = train.loc[non_disaster_mask, 'text'].iloc[1:6].values
print(f" not natural disaster tweets :{non_disaster_samples}")

 not natural disaster tweets :['I love fruits' 'Summer is lovely' 'My car is so fast'
 'What a goooooooaaaaaal!!!!!!' 'this is ridiculous....']


In [None]:
#preprocessing
# Built once when this cell runs rather than on every call: re-reading the stop-word
# corpus per tweet and constructing a new stemmer per token is wasted work.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocessing(x):
    """Lower-case, tokenize, remove English stop words and Porter-stem a text.

    Parameters
    ----------
    x : str
        Raw text (e.g. a single tweet).

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.

    Notes
    -----
    Punctuation and numeric tokens produced by the tokenizer are kept as-is;
    no character filtering is applied.
    """
    tokens = word_tokenize(x.lower())
    # The text was lower-cased before tokenizing, so tokens can be compared
    # against the (lower-case) stop-word set directly.
    return ' '.join(_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS)
    
    

In [None]:
# Apply the preprocessing function to both the training and the test dataset.
# The training frame gets a new 'cleaned_text' column; its 'text' column stays raw.
train['cleaned_text'] = train['text'].apply(preprocessing)

# test_df['text'] is overwritten with the cleaned text because later cells read that
# column. The raw tweets are stashed in 'raw_text' the first time this cell runs and
# cleaning always starts from the stash, so re-running the cell does not preprocess
# already-stemmed text a second time.
if 'raw_text' not in test_df.columns:
    test_df['raw_text'] = test_df['text']
test_df['text'] = test_df['raw_text'].apply(preprocessing)

train

Unnamed: 0,id,text,target,cleaned_text
0,1,Our Deeds are the Reason of this #earthquake M...,1,deed reason # earthquak may allah forgiv us
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask . canada
2,5,All residents asked to 'shelter in place' are ...,1,resid ask 'shelter place ' notifi offic . evac...
3,6,"13,000 people receive #wildfires evacuation or...",1,"13,000 peopl receiv # wildfir evacu order cali..."
4,7,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi # alaska smoke # wildfir p...
...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1,two giant crane hold bridg collaps nearbi home...
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1,@ aria_ahrari @ thetawniest control wild fire ...
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m1.94 [ 01:04 utc ] ? 5km volcano hawaii . htt...
7611,10872,Police investigating after an e-bike collided ...,1,polic investig e-bik collid car littl portug ....


In [None]:
# Split the cleaned tweets BEFORE vectorizing, so the held-out 20% never influences
# the TF-IDF vocabulary or IDF weights. Fitting the vectorizer on all rows first
# leaks held-out information into the features and biases the validation scores.
x = train['cleaned_text'].values
y = train['target'].values
text_train, text_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=30, stratify=y
)

# NOTE: despite its name, `classifier` is the TfidfVectorizer, not a model; the name
# is kept because later cells call classifier.transform(...).
classifier = TfidfVectorizer()
# Learn vocabulary and IDF weights from the training split only, then reuse the
# fitted vectorizer to encode the held-out split.
x_train = classifier.fit_transform(text_train)
x_test = classifier.transform(text_test)

In [None]:
#Model Selection
#1st Model - Logistic Regression
# L2-penalised logistic regression (all other hyper-parameters left at their defaults),
# fitted on the TF-IDF features of the training split.
logRegression=LogisticRegression(penalty='l2')
logRegression.fit(x_train,y_train)

In [None]:
#2nd model - Support Vector Machine
# SVC is imported together with the other scikit-learn imports at the top of the
# notebook. The model uses default hyper-parameters and is fitted on the same
# training split as the logistic regression.
svc_model = SVC()
svc_model.fit(x_train, y_train)

In [None]:

# Model 1 (Logistic Regression): accuracy on the split it was trained on.
output1 = logRegression.predict(x_train)
accuracy_score(y_true=y_train, y_pred=output1)

0.8876847290640394

In [None]:

# Model 2 (Support Vector Machine): accuracy on the split it was trained on.
output2 = svc_model.predict(x_train)
accuracy_score(y_true=y_train, y_pred=output2)

0.9706075533661741

In [None]:
# Model evaluation, model 1 (Logistic Regression): accuracy on the held-out split.
pred = logRegression.predict(x_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.8030203545633617

In [None]:
# Model evaluation, model 2 (Support Vector Machine): accuracy on the held-out split.
pred1 = svc_model.predict(x_test)
# accuracy_score returns the fraction of correct predictions by default.
score = accuracy_score(y_test, pred1)
score

0.7951411687458962

In [None]:
#prediction on test dataset
# Encode the (already preprocessed) test tweets with the fitted TF-IDF vectorizer;
# `classifier` is the vectorizer, not a model. Note that `x` is rebound here.
x = classifier.transform(test_df['text'])
# NOTE(review): the SVC produces the final predictions although the recorded held-out
# accuracy of the logistic regression (0.803) is slightly higher than the SVC's (0.795)
# — confirm this choice is intended.
predict = svc_model.predict(x)

In [None]:
# Accuracy of the SVC on the held-out split (y_test vs pred1) — the same quantity as
# `score` computed earlier. It does NOT measure the test_df predictions made above:
# the test frame has no target column to compare against.
accuracy = accuracy_score(y_test,pred1)
accuracy

0.7951411687458962

In [None]:
# Build the submission file: one row per test tweet with its id and predicted label.
# The id column is written as lower-case 'id', matching the column name of the input
# data. NOTE(review): presumably the grader expects the sample-submission header
# `id,target` — confirm against the competition's sample file.
submission = pd.DataFrame({'id': test_df["id"], 'target': predict})
submission.to_csv('submission.csv', index=False)