# **Disaster Tweets**

# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

# Importing the training dataset

In [2]:
# Read the labelled training tweets from the Kaggle competition input directory.
dataset_train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/nlp-getting-started/train.csv'
)

# Importing the test dataset

In [3]:
# Read the test tweets (the rows to be predicted) from the Kaggle input directory.
dataset_test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/nlp-getting-started/test.csv'
)

# Cleaning the training dataset texts

In [4]:
# Build the cleaned training corpus: one normalized, stemmed string per tweet.
# The stop-word set and the stemmer do not change between iterations, so they
# are created once here instead of once per word / once per tweet.
stop_words_train = set(stopwords.words('english'))
ps_train = PorterStemmer()

corpus_train = []
# Iterate over the text column itself rather than a hard-coded row count, so
# the cell neither truncates nor raises if the dataset size changes.
for raw_text_train in dataset_train['text']:
    # Replace every non-letter character with a space, then lowercase and tokenize.
    text_train = re.sub('[^a-zA-Z]', ' ', raw_text_train)
    text_train = text_train.lower().split()
    # Drop English stop words and reduce the remaining words to their Porter stem.
    text_train = [ps_train.stem(word) for word in text_train if word not in stop_words_train]
    text_train = ' '.join(text_train)
    corpus_train.append(text_train)

# Cleaning the test dataset texts

In [5]:
# Build the cleaned test corpus with the same normalization as the training
# corpus: one stemmed, stop-word-free string per tweet.
# The stop-word set and the stemmer do not change between iterations, so they
# are created once here instead of once per word / once per tweet.
stop_words_test = set(stopwords.words('english'))
ps_test = PorterStemmer()

corpus_test = []
# Iterate over the text column itself rather than a hard-coded row count, so
# the cell neither truncates nor raises if the dataset size changes.
for raw_text_test in dataset_test['text']:
    # Replace every non-letter character with a space, then lowercase and tokenize.
    text_test = re.sub('[^a-zA-Z]', ' ', raw_text_test)
    text_test = text_test.lower().split()
    # Drop English stop words and reduce the remaining words to their Porter stem.
    text_test = [ps_test.stem(word) for word in text_test if word not in stop_words_test]
    text_test = ' '.join(text_test)
    corpus_test.append(text_test)

# Creating the Bag of Words model for the training and test datasets

In [6]:
# Bag-of-words features limited to the 1840 most frequent terms.
cv = CountVectorizer(max_features = 1840)
# Learn the vocabulary from the training corpus only.
X_train = cv.fit_transform(corpus_train).toarray()
# Reuse the training vocabulary for the test corpus. Calling fit_transform here
# would rebuild the vocabulary from the test tweets, so column j of X_test would
# no longer mean the same word as column j of X_train and the classifier's
# predictions would be based on misaligned features.
X_test = cv.transform(corpus_test).toarray()
# Labels are taken from the fifth column of the training frame
# (presumably the 'target' column -- TODO confirm against the CSV header).
y_train = dataset_train.iloc[:, 4].values

# Training the model on the Training set

In [7]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier; hyperparameters are listed one per line.
classifier = XGBClassifier(
    booster='gbtree',
    gamma=0.8,
    max_depth=30,
    learning_rate=0.62,
    n_estimators=150,
)
# Fit on the bag-of-words matrix. As the last expression of the cell, the
# fitted estimator's repr is displayed.
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.8, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.62, max_delta_step=0, max_depth=30,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=150, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

# Predicting and evaluating the Training set results

In [8]:
# Predict on the same data the model was fitted on; the metrics below therefore
# describe the fit to the training set, not performance on unseen tweets.
y_pred_train = classifier.predict(X_train)

# Each heading and its metric are printed on separate lines via sep='\n'.
print('Confusion Matrix :', confusion_matrix(y_train, y_pred_train), sep='\n')
print('Accuracy Score :', accuracy_score(y_train, y_pred_train))
print('Report : ', classification_report(y_train, y_pred_train), sep='\n')

Confusion Matrix :
[[4182  160]
 [ 458 2813]]
Accuracy Score : 0.9188230658084855
Report : 
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      4342
           1       0.95      0.86      0.90      3271

    accuracy                           0.92      7613
   macro avg       0.92      0.91      0.92      7613
weighted avg       0.92      0.92      0.92      7613



# Predicting the test dataset results and saving the submission

In [9]:
# Predict a label for every test tweet.
y_pred_test = classifier.predict(X_test)

# Two-column submission frame: the tweet id followed by its predicted target.
output = dataset_test[['id']].assign(target=y_pred_test)

# Write the submission without the index column and confirm on stdout.
output.to_csv('my_submission_nlp_30.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
