# Bag of Words

# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import randint

# Read the training, test and validation CSV files (in that order),
# dropping every row whose 'Body' column is missing.
trainData, testData, valData = (
    pd.read_csv(csv_path).dropna(subset=['Body'])
    for csv_path in (
        '/content/trainData.csv',
        '/content/testData.csv',
        '/content/validationData.csv',
    )
)

# Separate each frame into the message text (X) and its label (y)
X_train, y_train = trainData['Body'], trainData['label']
X_test, y_test = testData['Body'], testData['label']
X_val, y_val = valData['Body'], valData['label']

# Cross-validation splitter: 5 shuffled, stratified folds with a fixed seed
stratified_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Bag-of-words pipeline: raw token counts ('bow') feeding a random forest ('clf')
pipeline_rf = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Search space; keys follow the '<step>__<parameter>' convention
parameters_rf = {
    'bow__max_df': (0.75, 0.85),           # document-frequency cut-off
    'bow__ngram_range': [(1, 1), (1, 2)],  # unigrams only, or unigrams + bigrams
    'clf__n_estimators': [100, 300],       # trees in the forest
    'clf__max_depth': [10, 20, None],      # per-tree depth cap (None = unlimited)
}

# Alternative, classifier-only search space kept for reference:
# parameters_rf = {
#     'clf__n_estimators': [100, 300, 350, 400],
#     'clf__max_depth': [10, 20, None],
# }

# Exhaustive search over the grid, scored with the stratified folds above
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=parameters_rf,
    cv=stratified_kfold,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train, y_train)

# Report the best cross-validated score and the winning value of every tuned parameter.
# `best_params_` contains exactly the keys of the search grid, so there is no need to
# dump every parameter of the fitted pipeline via `get_params()`.
print("Best score (Random Forest): %0.3f" % grid_search_rf.best_score_)
print("Best parameters set (Random Forest):")
best_parameters_rf = grid_search_rf.best_params_
for param_name in sorted(parameters_rf):
    print("\t%s: %r" % (param_name, best_parameters_rf[param_name]))

# GridSearchCV refits the best candidate on the full training set by default
# (refit=True), so the final model already exists. Reuse it instead of pushing the
# fitted step objects back into the pipeline and training a second time; with the
# fixed random_state the retrained model would be the same.
pipeline_rf = grid_search_rf.best_estimator_

# Validation split: accuracy, confusion matrix and per-class report
predictions_val_rf = pipeline_rf.predict(X_val)
val_accuracy = accuracy_score(y_val, predictions_val_rf)
print(f'Validation Accuracy (Random Forest): {val_accuracy:.4f}')
print(
    "Confusion Matrix (Random Forest) - Validation:",
    confusion_matrix(y_val, predictions_val_rf),
    sep='\n',
)
print(
    "Classification Report (Random Forest) - Validation:",
    classification_report(y_val, predictions_val_rf),
    sep='\n',
)

# Test split: same three summaries
predictions_rf = pipeline_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions_rf)
print(f'Test Accuracy (Random Forest): {test_accuracy:.4f}')
print(
    "Confusion Matrix (Random Forest) - Test:",
    confusion_matrix(y_test, predictions_rf),
    sep='\n',
)
print(
    "Classification Report (Random Forest) - Test:",
    classification_report(y_test, predictions_rf),
    sep='\n',
)

# Additional test-set metrics.
# Column 1 of predict_proba is the second entry of the classifier's classes_, which is
# treated here as the positive class (assumes binary labels -- confirm against the data).
y_test_proba = pipeline_rf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_test_proba)
f1 = f1_score(y_test, predictions_rf)
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# True/False Positive Fraction of the classifier's actual predictions.
# Index 1 of the ROC arrays is only the curve point at the highest score threshold, not
# the model's operating point, so the rates are taken from the confusion matrix instead.
true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predictions_rf).ravel()
tpf = true_pos / (true_pos + false_neg)  # TPR / recall of the positive class
fpf = false_pos / (false_pos + true_neg)  # FPR: negatives wrongly flagged as positive

print(f"AUC: {auc:.4f}")
print(f"F1-Score: {f1:.4f}")

# Heatmap of the test-set confusion matrix (rows = actual, columns = predicted)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    confusion_matrix(y_test, predictions_rf),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Class 0', 'Predicted Class 1'],
    yticklabels=['Actual Class 0', 'Actual Class 1'],
)
heatmap_ax.set_title('Confusion Matrix - Test Data')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

# Single-point scatter placing this model at (AUC, F1)
plt.figure(figsize=(8, 6))
scatter_ax = plt.gca()
scatter_ax.scatter(auc, f1, color='red')
scatter_ax.set_title('Scatter Plot of AUC vs F1-Score')
scatter_ax.set_xlabel('AUC')
scatter_ax.set_ylabel('F1-Score')
scatter_ax.grid(True)
plt.show()


# TF-IDF

# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import nltk

# Fetch the NLTK 'punkt' tokenizer data
nltk.download('punkt')

# Read the training, test and validation CSV files (in that order),
# dropping every row whose 'Body' column is missing.
trainData, testData, valData = (
    pd.read_csv(csv_path).dropna(subset=['Body'])
    for csv_path in (
        '/content/trainData.csv',
        '/content/testData.csv',
        '/content/validationData.csv',
    )
)

# Separate each frame into the message text (X) and its label (y)
X_train, y_train = trainData['Body'], trainData['label']
X_test, y_test = testData['Body'], testData['label']
X_val, y_val = valData['Body'], valData['label']

# TF-IDF pipeline: weighted term frequencies ('tfidf') feeding a random forest ('clf').
# The max_df=0.7 given here is replaced by the grid values for every searched candidate.
pipeline_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_df=0.7)),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Earlier, wider search space kept for reference:
# parameters_tfidf = {
#     'tfidf__max_df': (0.75, 0.85),
#     'tfidf__ngram_range': [(1, 1), (1, 2)],
#     'clf__n_estimators': [100, 200, 300],
#     'clf__oob_score': [True],
#     'clf__max_depth': [10, 20, None],
# }

# Current search space: only max_df varies, every other entry is pinned to one value
parameters_tfidf = {
    'tfidf__max_df': (0.75, 0.85),     # document-frequency cut-off
    'tfidf__ngram_range': [(1, 1)],    # unigrams only
    'clf__n_estimators': [500],        # trees in the forest
    'clf__oob_score': [True],          # also compute the out-of-bag score
    'clf__max_depth': [None],          # no depth cap
    'clf__min_samples_leaf': [2],      # at least two samples per leaf
}

# Grid search scored on 5 shuffled, stratified folds with a fixed seed
stratified_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
grid_search_tfidf = GridSearchCV(
    estimator=pipeline_tfidf,
    param_grid=parameters_tfidf,
    cv=stratified_kfold,
    n_jobs=-1,
    verbose=2,
)
grid_search_tfidf.fit(X_train, y_train)

# Report the best cross-validated score and the winning value of every tuned parameter.
# `best_params_` contains exactly the keys of the search grid, so there is no need to
# dump every parameter of the fitted pipeline via `get_params()`.
print("Best score (TF-IDF + Random Forest): %0.3f" % grid_search_tfidf.best_score_)
print("Best parameters set (TF-IDF + Random Forest):")
best_parameters_tfidf = grid_search_tfidf.best_params_
for param_name in sorted(parameters_tfidf):
    print("\t%s: %r" % (param_name, best_parameters_tfidf[param_name]))

# GridSearchCV refits the best candidate on the full training set by default
# (refit=True), so the final model already exists. Reuse it instead of pushing the
# fitted step objects back into the pipeline and training the 500-tree forest a second
# time; with the fixed random_state the retrained model would be the same.
pipeline_tfidf = grid_search_tfidf.best_estimator_

# Validation split: accuracy, confusion matrix and per-class report
predictions_val_tfidf = pipeline_tfidf.predict(X_val)
val_accuracy_tfidf = accuracy_score(y_val, predictions_val_tfidf)
print(f'Validation Accuracy (TF-IDF + Random Forest): {val_accuracy_tfidf:.4f}')
print(
    "Confusion Matrix (TF-IDF + Random Forest) - Validation:",
    confusion_matrix(y_val, predictions_val_tfidf),
    sep='\n',
)
print(
    "Classification Report (TF-IDF + Random Forest) - Validation:",
    classification_report(y_val, predictions_val_tfidf),
    sep='\n',
)

# Test split: same three summaries
predictions_tfidf = pipeline_tfidf.predict(X_test)
test_accuracy_tfidf = accuracy_score(y_test, predictions_tfidf)
print(f'Test Accuracy (TF-IDF + Random Forest): {test_accuracy_tfidf:.4f}')
print(
    "Confusion Matrix (TF-IDF + Random Forest) - Test:",
    confusion_matrix(y_test, predictions_tfidf),
    sep='\n',
)
print(
    "Classification Report (TF-IDF + Random Forest) - Test:",
    classification_report(y_test, predictions_tfidf),
    sep='\n',
)

# Additional test-set metrics for the TF-IDF model.
# Column 1 of predict_proba is the second entry of the classifier's classes_, which is
# treated here as the positive class (assumes binary labels -- confirm against the data).
y_test_proba_tfidf = pipeline_tfidf.predict_proba(X_test)[:, 1]
auc_tfidf = roc_auc_score(y_test, y_test_proba_tfidf)
f1_tfidf = f1_score(y_test, predictions_tfidf)
fpr_tfidf, tpr_tfidf, thresholds_tfidf = roc_curve(y_test, y_test_proba_tfidf)

# True/False Positive Fraction of the classifier's actual predictions.
# Index 1 of the ROC arrays is only the curve point at the highest score threshold, not
# the model's operating point, so the rates are taken from the confusion matrix instead.
true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predictions_tfidf).ravel()
tpf_tfidf = true_pos / (true_pos + false_neg)  # TPR / recall of the positive class
fpf_tfidf = false_pos / (false_pos + true_neg)  # FPR: negatives wrongly flagged as positive

print(f"AUC (TF-IDF): {auc_tfidf:.4f}")
print(f"F1-Score (TF-IDF): {f1_tfidf:.4f}")

# Heatmap of the test-set confusion matrix (rows = actual, columns = predicted)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    confusion_matrix(y_test, predictions_tfidf),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Class 0', 'Predicted Class 1'],
    yticklabels=['Actual Class 0', 'Actual Class 1'],
)
heatmap_ax.set_title('Confusion Matrix - Test Data (TF-IDF)')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()


# Word2Vec

# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import nltk

# Tokenizer data needed by word_tokenize. Older NLTK releases load 'punkt'; recent
# releases (3.9+) look for 'punkt_tab' instead and raise LookupError without it, so
# fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Read the training, test and validation CSV files (in that order),
# dropping every row whose 'Body' column is missing.
trainData, testData, valData = (
    pd.read_csv(csv_path).dropna(subset=['Body'])
    for csv_path in (
        '/content/trainData.csv',
        '/content/testData.csv',
        '/content/validationData.csv',
    )
)

# Separate each frame into the message text (X) and its label (y)
X_train, y_train = trainData['Body'], trainData['label']
X_test, y_test = testData['Body'], testData['label']
X_val, y_val = valData['Body'], valData['label']

# Split every message of each data split into word tokens
X_train_tokenized, X_val_tokenized, X_test_tokenized = (
    messages.apply(word_tokenize) for messages in (X_train, X_val, X_test)
)

# Learn 100-dimensional word embeddings from the training tokens only;
# min_count=1 keeps every word, even those seen a single time.
w2v_model = Word2Vec(
    sentences=X_train_tokenized,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Turn one raw text into a single fixed-length Word2Vec vector
def text_to_w2v(text, model):
    """Return the mean embedding of the words in *text* that *model* knows.

    The text is tokenized with ``word_tokenize``; tokens missing from
    ``model.wv`` are ignored. If no token is left, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_tokens = []
    for token in word_tokenize(text):
        if token in model.wv:
            known_tokens.append(token)

    if not known_tokens:
        return np.zeros(model.vector_size)

    # One row per known token; average over the rows
    token_vectors = model.wv[known_tokens]
    return np.mean(token_vectors, axis=0)

# Pipeline-compatible wrapper around an already trained Word2Vec model
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Transformer that maps each text to its mean Word2Vec embedding.

    The embedding model is supplied ready-made; ``fit`` learns nothing.
    """

    def __init__(self, model):
        # Stored as-is under the constructor argument's name
        self.model = model

    def fit(self, X, y=None):
        """No-op: there is nothing to learn, so return the transformer itself."""
        return self

    def transform(self, X):
        """Return an array with one ``text_to_w2v`` vector per text in *X*."""
        document_vectors = []
        for document in X:
            document_vectors.append(text_to_w2v(document, self.model))
        return np.array(document_vectors)

# Word2Vec pipeline: mean word embeddings ('w2v') feeding a random forest ('clf')
pipeline_w2v = Pipeline(steps=[
    ('w2v', Word2VecTransformer(w2v_model)),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Search space: only the forest is tuned
parameters_w2v = {
    'clf__n_estimators': [100, 300],   # trees in the forest
    'clf__max_depth': [10, 20, None],  # per-tree depth cap (None = unlimited)
}

# Grid search scored on 5 shuffled, stratified folds with a fixed seed
stratified_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
grid_search_w2v = GridSearchCV(
    estimator=pipeline_w2v,
    param_grid=parameters_w2v,
    cv=stratified_kfold,
    n_jobs=-1,
    verbose=1,
)
grid_search_w2v.fit(X_train, y_train)

# Report the best cross-validated score and the winning value of every tuned parameter.
# `best_params_` contains exactly the keys of the search grid, so there is no need to
# dump every parameter of the fitted pipeline via `get_params()`.
print("Best score (Word2Vec + Random Forest): %0.3f" % grid_search_w2v.best_score_)
print("Best parameters set (Word2Vec + Random Forest):")
best_parameters_w2v = grid_search_w2v.best_params_
for param_name in sorted(parameters_w2v):
    print("\t%s: %r" % (param_name, best_parameters_w2v[param_name]))

# GridSearchCV refits the best candidate on the full training set by default
# (refit=True), so the final model already exists. Reuse it instead of pushing the
# fitted step objects (including the copied embedding model) back into the pipeline
# and training a second time; with the fixed random_state the retrained forest would
# be the same.
pipeline_w2v = grid_search_w2v.best_estimator_

# Validation split: accuracy, confusion matrix and per-class report
predictions_val_w2v = pipeline_w2v.predict(X_val)
val_accuracy_w2v = accuracy_score(y_val, predictions_val_w2v)
print(f'Validation Accuracy (Word2Vec + Random Forest): {val_accuracy_w2v:.4f}')
print(
    "Confusion Matrix (Word2Vec + Random Forest) - Validation:",
    confusion_matrix(y_val, predictions_val_w2v),
    sep='\n',
)
print(
    "Classification Report (Word2Vec + Random Forest) - Validation:",
    classification_report(y_val, predictions_val_w2v),
    sep='\n',
)

# Test split: same three summaries
predictions_w2v = pipeline_w2v.predict(X_test)
test_accuracy_w2v = accuracy_score(y_test, predictions_w2v)
print(f'Test Accuracy (Word2Vec + Random Forest): {test_accuracy_w2v:.4f}')
print(
    "Confusion Matrix (Word2Vec + Random Forest) - Test:",
    confusion_matrix(y_test, predictions_w2v),
    sep='\n',
)
print(
    "Classification Report (Word2Vec + Random Forest) - Test:",
    classification_report(y_test, predictions_w2v),
    sep='\n',
)

# Additional test-set metrics for the Word2Vec model.
# Column 1 of predict_proba is the second entry of the classifier's classes_, which is
# treated here as the positive class (assumes binary labels -- confirm against the data).
y_test_proba_w2v = pipeline_w2v.predict_proba(X_test)[:, 1]
auc_w2v = roc_auc_score(y_test, y_test_proba_w2v)
f1_w2v = f1_score(y_test, predictions_w2v)
fpr_w2v, tpr_w2v, thresholds_w2v = roc_curve(y_test, y_test_proba_w2v)

# True/False Positive Fraction of the classifier's actual predictions.
# Index 1 of the ROC arrays is only the curve point at the highest score threshold, not
# the model's operating point, so the rates are taken from the confusion matrix instead.
true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predictions_w2v).ravel()
tpf_w2v = true_pos / (true_pos + false_neg)  # TPR / recall of the positive class
fpf_w2v = false_pos / (false_pos + true_neg)  # FPR: negatives wrongly flagged as positive

print(f"AUC (Word2Vec): {auc_w2v:.4f}")
print(f"F1-Score (Word2Vec): {f1_w2v:.4f}")

# Heatmap of the test-set confusion matrix (rows = actual, columns = predicted)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    confusion_matrix(y_test, predictions_w2v),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Class 0', 'Predicted Class 1'],
    yticklabels=['Actual Class 0', 'Actual Class 1'],
)
heatmap_ax.set_title('Confusion Matrix - Test Data (Word2Vec)')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()


# Scatter plot of AUC vs F1-Score for TF-IDF and Word2Vec

# In [None]:
# Compare the two models: one point each at (test AUC, test F1)
plt.figure(figsize=(8, 6))
for model_auc, model_f1, point_color, legend_label in (
    (auc_tfidf, f1_tfidf, 'red', 'TF-IDF + Random Forest'),
    (auc_w2v, f1_w2v, 'blue', 'Word2Vec + Random Forest'),
):
    plt.scatter(model_auc, model_f1, color=point_color, label=legend_label)

comparison_ax = plt.gca()
comparison_ax.set_title('Scatter Plot of AUC vs F1-Score')
comparison_ax.set_xlabel('AUC')
comparison_ax.set_ylabel('F1-Score')
comparison_ax.legend()
comparison_ax.grid(True)
plt.show()