In [1]:
# this project is about applying nlp techniques to understand if a given tweet is about a real disaster or not.

In [2]:
# import statements
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import seaborn

In [3]:
# load the data
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

# check the data
train.head(-5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7603,10862,,,Officials say a quarantine is in place at an A...,1
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1
7605,10864,,,on the flip side I'm at Walmart and there is a...,1
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1


In [4]:
# check for missing values
print(train.isnull().sum())

print(test.isnull().sum())


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
id             0
keyword       26
location    1105
text           0
dtype: int64


In [5]:
# unique values in keyword and location columns 
train.keyword.nunique()

221

In [6]:
train.location.nunique()

3341

In [7]:
# Fill missing values in training data
train['location'] = train['location'].fillna('Unknown')
train['keyword'] = train['keyword'].fillna('None')

# Fill missing values in test data
test['location'] = test['location'].fillna('Unknown')
test['keyword'] = test['keyword'].fillna('None')

In [8]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon
# nltk.download('vader_lexicon')

def sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = analyzer.polarity_scores(text)
    compound_score = sentiment_scores['compound']
    
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

train['sentiment'] = train['text'].apply(sentiment)
test['sentiment'] = test['text'].apply(sentiment)

In [9]:
train.head()

Unnamed: 0,id,keyword,location,text,target,sentiment
0,1,,Unknown,Our Deeds are the Reason of this #earthquake M...,1,positive
1,4,,Unknown,Forest fire near La Ronge Sask. Canada,1,negative
2,5,,Unknown,All residents asked to 'shelter in place' are ...,1,negative
3,6,,Unknown,"13,000 people receive #wildfires evacuation or...",1,neutral
4,7,,Unknown,Just got sent this photo from Ruby #Alaska as ...,1,neutral


In [10]:
# check the distribution of sentiment
print(train['sentiment'].value_counts())

# check for nan values in sentiment
print(train['sentiment'].isnull().sum())

sentiment
negative    3707
neutral     2013
positive    1893
Name: count, dtype: int64
0


In [11]:

# # Combine text columns for vectorization
# train['text_combined'] = train['keyword'] + ' ' + train['location'] + ' ' + train['text']
# test['text_combined'] = test['keyword'] + ' ' + test['location'] + ' ' + test['text']

# # Vectorize text data using Tf-idf
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(max_features=10000)

# X_train = vectorizer.fit_transform(train['text_combined'])
# X_test = vectorizer.transform(test['text_combined'])

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack

# Assuming train and test DataFrames are already loaded
# Vectorize each column individually
tfidf_vectorizer = TfidfVectorizer()
keyword_vectorizer = CountVectorizer()
location_vectorizer = CountVectorizer()
sentiment_vectorizer = CountVectorizer()

# Use tf-idf vectorizer for text column and count vectorizer for keyword and location columns
text_vectorized = tfidf_vectorizer.fit_transform(train['text'])
keyword_vectorized = keyword_vectorizer.fit_transform(train['keyword'].fillna(''))
location_vectorized = location_vectorizer.fit_transform(train['location'].fillna(''))
sentiment_vectorized = sentiment_vectorizer.fit_transform(train['sentiment'].fillna(''))

# Combine the vectorized columns and sentiment column using hstack
X_train = hstack([text_vectorized, keyword_vectorized, location_vectorized, sentiment_vectorized])

# Print the shape of the combined sparse matrix
print(f"X_train shape: {X_train.shape}")

# Vectorize test data
text_vectorized_test = tfidf_vectorizer.transform(test['text'])
keyword_vectorized_test = keyword_vectorizer.transform(test['keyword'].fillna(''))
location_vectorized_test = location_vectorizer.transform(test['location'].fillna(''))
sentiment_vectorized_test = sentiment_vectorizer.transform(test['sentiment'].fillna(''))

# Combine the vectorized columns and sentiment column using hstack
X_test = hstack([text_vectorized_test, keyword_vectorized_test, location_vectorized_test, sentiment_vectorized_test])

# Print the shape of the combined sparse matrix
print(f"X_test shape: {X_test.shape}")

X_train shape: (7613, 25141)
X_test shape: (3263, 25141)


In [13]:
y_train = train['target']

In [14]:
# # fit logistic regression model to the training data
# model = LogisticRegression()
# model.fit(X_train, y_train)

# # make predictions on the test data
# y_pred = model.predict(X_test)

# # create a submission file
# submission = pd.DataFrame({'id': test['id'], 'target': y_pred})
# submission.to_csv('submission.csv', index=False)

In [15]:
from sklearn.model_selection import train_test_split

X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [18]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


# Convert X_train_split to dataframe
X_train_split_df = pd.DataFrame.sparse.from_spmatrix(X_train_split)


# Define the pipeline
pipeline = Pipeline([
	('clf', LogisticRegression(max_iter=200))  
])

# Define the parameter grid
param_grid = [
	{
		'clf': [LogisticRegression(max_iter=200)],  
		'clf__C': [0.1, 1, 10]
	},
	{
		'clf': [RandomForestClassifier()],
		'clf__n_estimators': [50, 100, 200],
		'clf__max_depth': [None, 10, 20]
	},
	{
		'clf': [MultinomialNB()],
		'clf__alpha': [0.01, 0.1, 1]
	},
	{
		'clf': [XGBClassifier()],
		'clf__n_estimators': [50, 100, 200],
		'clf__max_depth': [3, 6, 9],
		'clf__learning_rate': [0.01, 0.1, 0.2]
	},
	{
		'clf': [SVC()],
		'clf__C': [0.1, 1, 10],
		'clf__kernel': ['linear', 'rbf']
	}
]

# Perform Grid Search CV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_split_df, y_train_split)

# Best model and parameters
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Evaluate on the validation set
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(X_val)
from sklearn.metrics import f1_score
val_f1_score = f1_score(y_val, val_predictions)
print("Validation F1 Score:", val_f1_score)

30 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/naveenmalla/Documents/Projects/Kaggle-Projects/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/naveenmalla/Documents/Projects/Kaggle-Projects/venv/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/naveenmalla/Documents/Projects/Kaggle-Projects/venv/lib/python3.9/site-packages/sklearn/pipeline.py", line 476, in fit
    self._final_estimator.fit(Xt, y, **last_step_par

Best Model: Pipeline(steps=[('clf', MultinomialNB(alpha=0.1))])
Best Parameters: {'clf': MultinomialNB(), 'clf__alpha': 0.1}
Best F1 Score: 0.74234368372303
Validation F1 Score: 0.7278241091736164


In [19]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on validation set
y_val_pred = best_model.predict(X_val)

# Evaluate the model
print(classification_report(y_val, y_val_pred))
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("F1 Score:", f1_score(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79       874
           1       0.72      0.74      0.73       649

    accuracy                           0.76      1523
   macro avg       0.76      0.76      0.76      1523
weighted avg       0.77      0.76      0.76      1523

Accuracy: 0.7642810242941562
F1 Score: 0.7278241091736164


In [20]:
# Predict on test set
y_test_pred = best_model.predict(X_test)

In [21]:
# Create a submission file
submission = pd.DataFrame({'id': test['id'], 'target': y_test_pred})
submission.to_csv('Data/submission.csv', index=False)