In [51]:
# This project applies NLP techniques to predict whether a given tweet is about a real disaster.

In [52]:
# import statements
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import seaborn

In [53]:
# Read the training and test tweets from the Data/ directory.
train = pd.read_csv(filepath_or_buffer='Data/train.csv')
test = pd.read_csv(filepath_or_buffer='Data/test.csv')

# Preview the training frame. A negative n makes head() return every row
# except the last five, so what gets displayed is pandas' truncated view of
# almost the whole frame rather than a short sample.
train.head(n=-5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7603,10862,,,Officials say a quarantine is in place at an A...,1
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1
7605,10864,,,on the flip side I'm at Walmart and there is a...,1
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1


In [54]:
# Per-column count of missing values: training frame first, then test frame.
print(train.isna().sum(), test.isna().sum(), sep='\n')


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
id             0
keyword       26
location    1105
text           0
dtype: int64


In [55]:
# Number of distinct keywords in the training data (missing values are not counted).
train['keyword'].nunique()

221

In [56]:
# Number of distinct locations in the training data (missing values are not counted).
train['location'].nunique()

3341

In [57]:
# Placeholder written into each column wherever the value is missing.
fill_values = {'location': 'Unknown', 'keyword': 'None'}

# Apply the same placeholders to the training and test frames, column by column.
for frame in (train, test):
    for column, placeholder in fill_values.items():
        frame[column] = frame[column].fillna(placeholder)

In [58]:

# Build one text field per tweet: keyword, location and tweet body joined by single spaces.
train['text_combined'] = train['keyword'].str.cat([train['location'], train['text']], sep=' ')
test['text_combined'] = test['keyword'].str.cat([test['location'], test['text']], sep=' ')

# TF-IDF is used here in place of plain bag-of-words counts (CountVectorizer).
# The vocabulary is capped at 10,000 terms; it and the IDF weights are learned
# from the training text and then reused unchanged for the test text.
# NOTE(review): the fit uses every training row, including the rows that are
# later held out for validation, so validation scores may be slightly
# optimistic -- consider fitting the vectorizer inside the model pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train['text_combined'])
X_test = vectorizer.transform(test['text_combined'])

In [59]:
# Labels to predict: the `target` column of the training frame.
y_train = train.loc[:, 'target']

In [60]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

In [61]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

# One seed for every candidate that has internal randomness, so repeated runs
# of the search score and rank the candidates identically. Same value as the
# seed used for the train/validation split.
RANDOM_STATE = 42

# X_train_split, X_val, y_train_split and y_val come from the train/validation
# split performed earlier in the notebook.

# Single-step pipeline: the 'clf' step is only a placeholder that the grid
# below swaps out for each candidate classifier.
pipeline = Pipeline([
    ('clf', LogisticRegression())
])

# One sub-grid per model family. 'clf' selects the estimator and every
# 'clf__<name>' key sets a hyper-parameter on that estimator.
param_grid = [
    {
        'clf': [LogisticRegression()],
        'clf__C': [0.1, 1, 10]
    },
    {
        # Seeded: bootstrap sampling and feature selection are random.
        'clf': [RandomForestClassifier(random_state=RANDOM_STATE)],
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [None, 10, 20]
    },
    {
        'clf': [MultinomialNB()],
        'clf__alpha': [0.01, 0.1, 1]
    },
    {
        # Seeded explicitly so the result does not depend on library defaults.
        'clf': [XGBClassifier(random_state=RANDOM_STATE)],
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [3, 6, 9],
        'clf__learning_rate': [0.01, 0.1, 0.2]
    },
    {
        'clf': [SVC()],
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf']
    }
]

# 5-fold cross-validated search over all sub-grids, selecting on F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_split, y_train_split)

# Report the winning configuration and its mean cross-validated F1.
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Check the winner on the held-out validation rows, which the search never saw.
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(X_val)
val_f1_score = f1_score(y_val, val_predictions)
print("Validation F1 Score:", val_f1_score)

Best Model: Pipeline(steps=[('clf', SVC(C=1, kernel='linear'))])
Best Parameters: {'clf': SVC(), 'clf__C': 1, 'clf__kernel': 'linear'}
Best F1 Score: 0.7541665322785917
Validation F1 Score: 0.7627677100494233


In [65]:
from sklearn.metrics import classification_report, accuracy_score

# Score the selected model on the held-out validation rows.
y_val_pred = best_model.predict(X_val)

# Per-class precision/recall/F1 table first, then the two headline metrics.
print(classification_report(y_val, y_val_pred))
for label, metric in (("Accuracy:", accuracy_score), ("F1 Score:", f1_score)):
    print(label, metric(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       874
           1       0.82      0.71      0.76       649

    accuracy                           0.81      1523
   macro avg       0.81      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523

Accuracy: 0.8108995403808273
F1 Score: 0.7627677100494233


In [68]:
# Label the test tweets with the model selected by the grid search.
# NOTE(review): best_model was fitted on the training split only, so the
# held-out validation rows never contribute to the submitted model --
# consider refitting on all labelled rows before predicting.
y_test_pred = best_model.predict(X=X_test)

In [69]:
# Pair each test tweet id with its predicted label and write the two-column CSV.
submission = test.loc[:, ['id']].copy()
submission['target'] = y_test_pred
submission.to_csv('Data/submission.csv', index=False)