In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import GradientBoostingClassifier
import sys
import xgboost as xgb
import lightgbm as lgb
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/utk.tennessee.edu/nnaraya2/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
sys.path.append('/home/utk.edu/nnaraya2/phase2/CS_522/Final_Project/')

In [3]:
%load_ext autoreload
%autoreload 1
%aimport twitter_disaster_classification

In [4]:
training_data = pd.read_csv('/home/utk.edu/nnaraya2/phase2/CS_522/Final_Project/train.csv', index_col='id')
print('The shape of the training data is ', training_data.shape)
training_data.head(3)

The shape of the training data is  (7613, 4)


Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1


### CountVectorizer

In [5]:

# Create CountVectorizer vectors
count_vecs = twitter_disaster_classification.count_vectorizer(training_data['text'])


# Convert embeddings to DataFrame
embedding_data = pd.DataFrame(count_vecs, index=training_data.index)

# # Concatenate the original data with the embeddings and CountVectorizer vectors
training_data = pd.concat([training_data, embedding_data], axis=1)

# # Drop the 'text', 'processed_text', and 'embeddings' columns
training_data.drop(['keyword','location','text'], axis=1, inplace=True)

# # Display the DataFrame
training_data


Unnamed: 0_level_0,target,col_about,col_after,col_all,col_amp,col_an,col_and,col_are,col_as,col_at,...,col_what,col_when,col_who,col_why,col_will,col_with,col_would,col_you,col_your,col_û_
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10869,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10870,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10871,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10872,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
# Assuming 'training_data' is your DataFrame and 'target' is the column with labels
X = training_data.drop('target', axis=1)  # Features
y = training_data['target']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Logistic Regression**

In [7]:


# Example usage with Logistic Regression
log_reg = LogisticRegression(max_iter=5000)
log_reg_metrics = twitter_disaster_classification.evaluate_model(log_reg, X_train, y_train, X_test, y_test)





Cross-validated scores: [0.69704433 0.72824302 0.72495895 0.68555008 0.69458128]
Mean CV Accuracy: 0.7060755336617406
Recall: 0.5855161787365177
Precision: 0.7063197026022305
Accuracy: 0.7196323046618516
TPR: 0.5855161787365177
TNR: 0.8192219679633868


In [8]:

# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear', 'sag', 'saga']
}

# Create a Logistic Regression Classifier
log_reg = LogisticRegression(max_iter=5000)

# Instantiate the Grid Search model
grid_search_lr = GridSearchCV(estimator=log_reg, param_grid=param_grid_lr, cv=5, n_jobs=-1, verbose=1)
grid_search_lr.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters for Logistic Regression:", grid_search_lr.best_params_)

# Evaluate the best model on the test set
best_lr = grid_search_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)

# Calculate metrics
lr_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred_lr)
print(lr_metrics)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
{'recall': 0.5793528505392912, 'precision': 0.7001862197392924, 'accuracy': 0.7150361129349967, 'tpr': 0.5793528505392912, 'tnr': 0.8157894736842105}


**Random Forest**

In [9]:
# Example usage with Random Forest
random_forest = RandomForestClassifier()
rf_metrics = twitter_disaster_classification.evaluate_model(random_forest, X_train, y_train, X_test, y_test)


Cross-validated scores: [0.70689655 0.70279146 0.71182266 0.67816092 0.69868637]
Mean CV Accuracy: 0.6996715927750411
Recall: 0.5577812018489985
Precision: 0.6843100189035917
Accuracy: 0.7019041365725541
TPR: 0.5577812018489985
TNR: 0.8089244851258581


In [10]:

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest Classifier
rf = RandomForestClassifier()

# Instantiate the Grid Search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model on the test set
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Calculate metrics
rf_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred)
print(rf_metrics)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
{'recall': 0.5423728813559322, 'precision': 0.7537473233404711, 'accuracy': 0.7294812869336835, 'tpr': 0.5423728813559322, 'tnr': 0.868421052631579}


**SVM**

In [11]:
# Example usage with SVM
svm = SVC()
svm_metrics = twitter_disaster_classification.evaluate_model(svm, X_train, y_train, X_test, y_test)

Cross-validated scores: [0.71100164 0.73809524 0.73399015 0.69129721 0.7044335 ]
Mean CV Accuracy: 0.7157635467980296
Recall: 0.5408320493066255
Precision: 0.7420718816067653
Accuracy: 0.7242284963887065
TPR: 0.5408320493066255
TNR: 0.8604118993135011


In [12]:

# Define the parameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear']
}

# Create an SVM Classifier
svm = SVC()

# Instantiate the Grid Search model
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, n_jobs=-1, verbose=1)
grid_search_svm.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters for SVM:", grid_search_svm.best_params_)

# Evaluate the best model on the test set
best_svm = grid_search_svm.best_estimator_
y_pred_svm = best_svm.predict(X_test)

# Calculate metrics
svm_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred_svm)
print(svm_metrics)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
{'recall': 0.5408320493066255, 'precision': 0.7420718816067653, 'accuracy': 0.7242284963887065, 'tpr': 0.5408320493066255, 'tnr': 0.8604118993135011}


**Gradient Boosting Classifier**

In [13]:

# Base Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

gb_metrics = twitter_disaster_classification.evaluate_model(gb, X_train, y_train, X_test, y_test)


Cross-validated scores: [0.70279146 0.73316913 0.72413793 0.68965517 0.70607553]
Mean CV Accuracy: 0.7111658456486042
Recall: 0.5331278890600925
Precision: 0.7163561076604554
Accuracy: 0.711096520026264
TPR: 0.5331278890600925
TNR: 0.8432494279176201


In [14]:

# Parameter grid for Gradient Boosting
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Grid Search for Gradient Boosting
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5, n_jobs=-1, verbose=1)
grid_search_gb.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search_gb.best_params_)
best_gb = grid_search_gb.best_estimator_
y_pred = best_gb.predict(X_test)
gb_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred)
print(gb_metrics)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
{'recall': 0.5269645608628659, 'precision': 0.7184873949579832, 'accuracy': 0.7104399212081418, 'tpr': 0.5269645608628659, 'tnr': 0.8466819221967964}


**XGBoost**

In [15]:

# Base XGBoost Classifier
xgb_clf = xgb.XGBClassifier(random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

xgb_metrics = twitter_disaster_classification.evaluate_model(xgb_clf, X_train, y_train, X_test, y_test)

# Calculate and print metrics
xgb_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred)
print(xgb_metrics)

Cross-validated scores: [0.69458128 0.71921182 0.72495895 0.66995074 0.68801314]
Mean CV Accuracy: 0.6993431855500821
Recall: 0.5855161787365177
Precision: 0.7238095238095238
Accuracy: 0.7281680892974393
TPR: 0.5855161787365177
TNR: 0.834096109839817
{'recall': 0.5855161787365177, 'precision': 0.7238095238095238, 'accuracy': 0.7281680892974393, 'tpr': 0.5855161787365177, 'tnr': 0.834096109839817}


In [16]:
# Parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Grid Search for XGBoost
grid_search_xgb = GridSearchCV(xgb.XGBClassifier(random_state=42), param_grid_xgb, cv=5, n_jobs=-1, verbose=1)
grid_search_xgb.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search_xgb.best_params_)
best_xgb = grid_search_xgb.best_estimator_
y_pred = best_xgb.predict(X_test)

xgb_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred)
print(xgb_metrics)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1}
{'recall': 0.5454545454545454, 'precision': 0.7283950617283951, 'accuracy': 0.7196323046618516, 'tpr': 0.5454545454545454, 'tnr': 0.8489702517162472}


**LightGBM**

In [17]:

# Base LightGBM Classifier
lgb_clf = lgb.LGBMClassifier(random_state=42)
lgb_clf.fit(X_train, y_train)
y_pred = lgb_clf.predict(X_test)

lgb_metrics = twitter_disaster_classification.evaluate_model(lgb_clf, X_train, y_train, X_test, y_test)


# Calculate and print metrics
lgb_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred)
print(lgb_metrics)


Cross-validated scores: [0.69704433 0.72495895 0.73809524 0.68144499 0.69950739]
Mean CV Accuracy: 0.7082101806239736
Recall: 0.5839753466872111
Precision: 0.7219047619047619
Accuracy: 0.726854891661195
TPR: 0.5839753466872111
TNR: 0.8329519450800915
{'recall': 0.5839753466872111, 'precision': 0.7219047619047619, 'accuracy': 0.726854891661195, 'tpr': 0.5839753466872111, 'tnr': 0.8329519450800915}


In [19]:
# Parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [-1, 5, 7],
    'num_leaves': [31, 50],
    'min_child_samples': [20, 30]
}

# Grid Search for LightGBM
grid_search_lgb = GridSearchCV(lgb.LGBMClassifier(random_state=42), param_grid_lgb, cv=5, n_jobs=-1, verbose=1)
grid_search_lgb.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search_lgb.best_params_)
best_lgb = grid_search_lgb.best_estimator_
y_pred = best_lgb.predict(X_test)
lgb_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred)
print(lgb_metrics)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 31}
{'recall': 0.5500770416024653, 'precision': 0.7241379310344828, 'accuracy': 0.7189757058437295, 'tpr': 0.5500770416024653, 'tnr': 0.8443935926773455}


**Using sentiments too**

In [23]:
training_data

Unnamed: 0_level_0,target,col_about,col_after,col_all,col_amp,col_an,col_and,col_are,col_as,col_at,...,col_what,col_when,col_who,col_why,col_will,col_with,col_would,col_you,col_your,col_û_
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10869,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10870,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10871,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10872,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [25]:
pwd

'/home/utk.edu/nnaraya2/doctoral_research_utk'

In [26]:
training_data = pd.read_csv('/home/utk.edu/nnaraya2/phase2/CS_522/Final_Project/train.csv', index_col='id')
print('The shape of the training data is ', training_data.shape)

sia = SentimentIntensityAnalyzer()

# Create CountVectorizer vectors
count_vecs = twitter_disaster_classification.count_vectorizer(training_data['text'])


# Convert embeddings to DataFrame
embedding_data = pd.DataFrame(count_vecs, index=training_data.index)

# # Concatenate the original data with the embeddings and CountVectorizer vectors
training_data = pd.concat([training_data, embedding_data], axis=1)

training_data['sentiment'] =training_data['text'].map(lambda x:sia.polarity_scores(x)['neg'])

# # Drop the 'text', 'processed_text', and 'embeddings' columns
training_data.drop(['keyword','location','text'], axis=1, inplace=True)

# # Display the DataFrame
training_data


The shape of the training data is  (7613, 4)


Unnamed: 0_level_0,target,col_about,col_after,col_all,col_amp,col_an,col_and,col_are,col_as,col_at,...,col_when,col_who,col_why,col_will,col_with,col_would,col_you,col_your,col_û_,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.000
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.286
5,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.095
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.000
7,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10869,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.262
10870,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.166
10871,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.000
10872,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.345


In [27]:
# Assuming 'training_data' is your DataFrame and 'target' is the column with labels
X = training_data.drop('target', axis=1)  # Features
y = training_data['target']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Creating and training the SVM model
svm_model = SVC(kernel='linear')  # You can try different kernels like 'rbf', 'poly', etc.
svm_model.fit(X_train, y_train)

# Making predictions
y_pred = svm_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
svm_metrics = twitter_disaster_classification.calculate_metrics(y_test, y_pred)
print(svm_metrics)


Accuracy: 0.7156927117531189
{'recall': 0.5855161787365177, 'precision': 0.6985294117647058, 'accuracy': 0.7156927117531189, 'tpr': 0.5855161787365177, 'tnr': 0.8123569794050344}


In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Example sentences
corpus = [
    "Hello, how are you?",
    "Im fine, how about you?",
    "Glad to hear you're fine."
]

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the corpus
X = vectorizer.fit_transform(corpus)

# Convert the result to an array
count_array = X.toarray()

# Display the feature names (words) and the count array
feature_names = vectorizer.get_feature_names_out()
print("Features (Words):", feature_names)
print("Count Array:\n", count_array)


Features (Words): ['about' 'are' 'fine' 'glad' 'hear' 'hello' 'how' 'im' 're' 'to' 'you']
Count Array:
 [[0 1 0 0 0 1 1 0 0 0 1]
 [1 0 1 0 0 0 1 1 0 0 1]
 [0 0 1 1 1 0 0 0 1 1 1]]
