In [228]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [229]:
train=pd.read_csv("E:\\nlp-getting-started\\train.csv")
test=pd.read_csv("E:\\nlp-getting-started\\test.csv")

In [230]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [231]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [232]:
null=((train.isnull().sum())/len(train))*100
null

id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64

In [233]:
def clean_text(text):
    # remove punctuation
    text = text.translate(text.maketrans("", "", string.punctuation))
    
    # Remove special characters
    text = re.sub(r"[^a-zA-Z]", " ", text)
    
    # Remove numbers
    text=re.sub(r'[^a-zA-z.,!?/:;\"\'\s]', " ", text)
    
    # Removing URLs:
    text = re.sub(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})', "", text)
    
    # remove http
    text = re.sub(r"\S*https?:\S*", "", text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    
    # Join the words back into one string separated by space and return the result
    return " ".join(words)

In [234]:
train=train.drop(columns=['keyword','location'])

In [235]:
train['text1']=train['text'].apply(clean_text)

In [236]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001f600-\U0001f64f"  # emoticons
        u"\U0001f300-\U0001f5ff"  # symbols & pictographs
        u"\U0001f680-\U0001f6ff"  # transport & map symbols
        u"\U0001f1e0-\U0001f1ff"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [237]:
train['text2']=train['text1'].apply(clean_text)

In [238]:
def lemmatize(text):
    ls = WordNetLemmatizer()
    words = text.split()
    lemmatized_words = [ls.lemmatize(word) for word in words]
    return [" ".join(lemmatized_words)]

In [239]:
train['text3']=train['text2'].apply(lemmatize)

In [240]:
train

Unnamed: 0,id,text,target,text1,text2,text3
0,1,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us,deeds reason earthquake may allah forgive us,[deed reason earthquake may allah forgive u]
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,forest fire near la ronge sask canada,[forest fire near la ronge sask canada]
2,5,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...,residents asked shelter place notified officer...,[resident asked shelter place notified officer...
3,6,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,people receive wildfires evacuation orders cal...,[people receive wildfire evacuation order cali...
4,7,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfires pou...,[got sent photo ruby alaska smoke wildfire pou...
...,...,...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1,two giant cranes holding bridge collapse nearb...,two giant cranes holding bridge collapse nearb...,[two giant crane holding bridge collapse nearb...
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1,ariaahrary thetawniest control wild fires cali...,ariaahrary thetawniest control wild fires cali...,[ariaahrary thetawniest control wild fire cali...
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,utc km volcano hawaii httptcozdtoyd ebj,utc km volcano hawaii httptcozdtoyd ebj,[utc km volcano hawaii httptcozdtoyd ebj]
7611,10872,Police investigating after an e-bike collided ...,1,police investigating ebike collided car little...,police investigating ebike collided car little...,[police investigating ebike collided car littl...


In [241]:
# Create sentences to get clean text as input for vectors

def return_sentences(tokens):
    return " ".join([word for word in tokens])

In [242]:
train['text4']=train['text3'].apply(return_sentences)

In [243]:
train

Unnamed: 0,id,text,target,text1,text2,text3,text4
0,1,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us,deeds reason earthquake may allah forgive us,[deed reason earthquake may allah forgive u],deed reason earthquake may allah forgive u
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,forest fire near la ronge sask canada,[forest fire near la ronge sask canada],forest fire near la ronge sask canada
2,5,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...,residents asked shelter place notified officer...,[resident asked shelter place notified officer...,resident asked shelter place notified officer ...
3,6,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...,people receive wildfires evacuation orders cal...,[people receive wildfire evacuation order cali...,people receive wildfire evacuation order calif...
4,7,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...,got sent photo ruby alaska smoke wildfires pou...,[got sent photo ruby alaska smoke wildfire pou...,got sent photo ruby alaska smoke wildfire pour...
...,...,...,...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1,two giant cranes holding bridge collapse nearb...,two giant cranes holding bridge collapse nearb...,[two giant crane holding bridge collapse nearb...,two giant crane holding bridge collapse nearby...
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1,ariaahrary thetawniest control wild fires cali...,ariaahrary thetawniest control wild fires cali...,[ariaahrary thetawniest control wild fire cali...,ariaahrary thetawniest control wild fire calif...
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,utc km volcano hawaii httptcozdtoyd ebj,utc km volcano hawaii httptcozdtoyd ebj,[utc km volcano hawaii httptcozdtoyd ebj],utc km volcano hawaii httptcozdtoyd ebj
7611,10872,Police investigating after an e-bike collided ...,1,police investigating ebike collided car little...,police investigating ebike collided car little...,[police investigating ebike collided car littl...,police investigating ebike collided car little...


In [244]:
x=train['text4']
y=train.target

In [245]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.10,random_state=44)

In [246]:
from sklearn.feature_extraction.text import CountVectorizer

In [247]:
cnt=CountVectorizer()
x_train=cnt.fit_transform(x_train)
x_test=cnt.transform(x_test)

In [248]:
x_train

<6851x20113 sparse matrix of type '<class 'numpy.int64'>'
	with 68458 stored elements in Compressed Sparse Row format>

In [249]:
from sklearn.svm import SVC
model=SVC(kernel = 'rbf')
model.fit(x_train,y_train)

SVC()

In [250]:
y_pre=model.predict(x_test)

In [251]:
from sklearn.metrics import accuracy_score ,classification_report,confusion_matrix

In [252]:
from sklearn.metrics import accuracy_score ,classification_report,confusion_matrix
score=accuracy_score(y_test,y_pre,normalize=True)
print(f'Accuracy : {round(score, 4)*100}%')

Accuracy : 80.45%


In [253]:
print("classification_report is : \n",classification_report(y_test,y_pre))

classification_report is : 
               precision    recall  f1-score   support

           0       0.78      0.92      0.84       435
           1       0.86      0.65      0.74       327

    accuracy                           0.80       762
   macro avg       0.82      0.79      0.79       762
weighted avg       0.81      0.80      0.80       762



In [254]:
from sklearn.ensemble import RandomForestClassifier

In [255]:
import xgboost as xg

In [256]:
xg_model=xg.XGBClassifier()
xg_model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [257]:
score=xg_model.score(x_train,y_train)
score

0.8352065391913589

In [258]:
pred=xg_model.predict(x_test)

In [259]:
from sklearn.metrics import accuracy_score ,classification_report,confusion_matrix
score=accuracy_score(y_test,pred,normalize=True)
print(f'Accuracy : {round(score, 4)*100}%')

Accuracy : 78.74%


In [260]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score,KFold

In [261]:
print((cross_val_score(xg_model,x_train,y_train,cv=5)).mean())

0.7832426647926016


In [262]:
# naive baiyes

In [263]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB

In [264]:
g_model=MultinomialNB()
g_model.fit(x_train,y_train)

MultinomialNB()

In [265]:
score=g_model.score(x_train,y_train)

In [266]:
score

0.9224930667055904

In [267]:
pred=g_model.predict(x_test)

In [268]:
print((cross_val_score(g_model,x_train,y_train,cv=5)).mean())

0.795942649352862


In [269]:
score=g_model.score(x_test,y_test)

In [270]:
score

0.8018372703412073

In [271]:
test=test.drop(columns=['keyword','location'],axis=1)

In [272]:
test.sample(10)

Unnamed: 0,id,text
862,2823,@ERPESTAR i aint a bitch girl popobawa revolve...
1782,6018,50k plays on our 'Delirious' remix!! Happy it'...
2264,7541,'California: Spring Oil Spill Estimate Grows '...
1450,4811,Pelling hotels: no strings concealment from st...
2932,9711,Rly tragedy in MP: Some live to recount horror...
300,975,My big buzzy John BlaZe. Jus Kame home from a ...
3190,10597,You still acting like you were the one wounded...
2377,7947,@WerdEmUp It's already been messed up thanks t...
3014,9957,Today we played mini golf in the rain I was ca...
523,1713,Australia's Ashes disaster - how the collapse ...


In [273]:
test['text']=test['text'].apply(clean_text)

In [274]:
test['text']=test['text'].apply(remove_emoji)

In [275]:
test['text']=test['text'].apply(lemmatize)

In [276]:
test['text']=test['text'].apply(return_sentences)

In [277]:
test.sample(10)

Unnamed: 0,id,text
1476,4916,fact checking machine must exploded today foll...
2090,7016,thedreaminggoat mayhem robertosaguedes mayhem ...
2582,8603,antifracking group seek review lancashire seis...
929,3065,betrayedhunter reading paused came across coup...
2513,8379,fly major problem u ruin batch need destroy ev...
2031,6831,heard really loud bang outside bedroom door im...
3152,10463,stampede fire brought home love make stay tame...
2085,7003,slayer reflects low mayhem festival attendance...
1052,3453,coolbreezet train derailed smithsonianno passe...
2709,9019,see jimmiejohnson wreck person first thought h...


In [281]:
data_test=cnt.transform(test['text'])

In [282]:
answer = pd.read_csv("E:\\nlp-getting-started\\sample_submission.csv")
answer['target'] = g_model.predict(data_test)
answer.to_csv('submission.csv', index=False)