### Optimizing Rumor Detection: A Dual Feature Extraction Approach with LIME-Based Model Explanation

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import string
import re
import random
from collections import Counter

import warnings
warnings.filterwarnings("ignore")


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec

import nltk
from nltk.corpus import stopwords, wordnet
# from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold



from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import joblib

In [None]:
# dataset link = https://www.kaggle.com/c/fake-news/data

In [None]:
df = pd.read_csv("train.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.label.value_counts()

1 ==> unreliable / rumor

0 ==> reliable / non-rumor

Balanced Class Labels

In [None]:
df = df.drop(['id', 'author', 'title'], axis = 1)
df.head()

In [None]:
df.info()

### Null value

In [None]:
df.isnull().sum()

In [None]:
# Series.ffill() fills each missing value with the previous valid observation.
# The result is assigned back to the column: calling
# fillna(method='ffill', inplace=True) on a selected column is a chained
# in-place update (it may modify a temporary copy) and the `method` argument
# of fillna is deprecated in recent pandas releases.
# A missing value in the very first row has no predecessor and stays NaN.
# NOTE(review): forward-filling gives a row the previous article's text while
# keeping its own label -- consider dropping such rows instead; confirm intent.
df['text'] = df['text'].ffill()

In [None]:
df.isnull().sum()

### Duplicate value

In [None]:
df.duplicated().value_counts()

In [None]:
# dropping Duplicates

df = df.drop_duplicates(keep='first')

In [None]:
df.duplicated().value_counts()

In [None]:
df.shape

## Preprocessing

### Data Cleaning

In [None]:
# Download the NLTK resources used by the cleaning step (stopword list,
# WordNet for lemmatization, perceptron model for POS tagging)
for resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Shared cleaning constants
STOPWORDS = set(stopwords.words('english'))
PUNCT_TO_REMOVE = string.punctuation + '“”'
lemmatizer = WordNetLemmatizer()
# First letter of a Penn Treebank tag -> WordNet part-of-speech constant
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)

In [None]:
# Combine all cleaning functions
def clean_text(text, FREQWORDS=None, RAREWORDS=None):
    """Normalize one raw document into a space-separated string of lemmas.

    Pipeline: lowercase -> replace anchor tags, drop URLs, decode/drop the
    handled HTML entities and tags -> strip punctuation, digits and
    apostrophes -> drop stopwords and the optional FREQWORDS / RAREWORDS ->
    POS-tag and lemmatize.

    Args:
        text: Raw document. Non-string values (e.g. NaN) are returned as is.
        FREQWORDS: Optional collection of words to remove.
        RAREWORDS: Optional collection of words to remove.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()

    # Markup and URLs have to be handled BEFORE punctuation is stripped:
    # once '<', '>', '&', ';', ':', '/' and '.' are gone, none of these
    # patterns can match any more. Anchors go first so the URL pattern
    # (which runs to the next whitespace) cannot swallow the closing tag.
    text = re.sub(r"<[a][^>]*>(.+?)</[a]>", " link ", text)
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)

    # Entities/tags are written in lowercase because the text already is.
    for needle, replacement in (
        ("&gt;", ""),
        ("&#62;", ""),
        ("&#x27;", "'"),
        ("&quot;", '"'),
        ("&#x2f;", " "),
        ("<p>", " "),
        ("<i>", " "),
        ("</i>", ""),
    ):
        text = text.replace(needle, replacement)

    # Punctuation, digits and (typographic) apostrophes
    text = text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[’']", "", text)

    # split() treats newlines as separators, so words on adjacent lines stay
    # separate tokens instead of being glued together.
    words = [word for word in text.split() if word not in STOPWORDS]
    if FREQWORDS:
        words = [word for word in words if word not in FREQWORDS]
    if RAREWORDS:
        words = [word for word in words if word not in RAREWORDS]

    # Lemmatize using the POS tag's first letter; unknown tags default to noun
    tagged_words = nltk.pos_tag(words)
    return " ".join(
        lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
        for word, pos in tagged_words
    )

In [None]:
# Apply the cleaning function on the dataset
def preprocess_df(df):
    """Clean the ``text`` column of ``df`` and strip corpus-level noise words.

    The column is cleaned with ``clean_text``, word frequencies are counted
    over the cleaned corpus, and a second ``clean_text`` pass removes the 10
    most frequent and the 2 least frequent words.

    Args:
        df: DataFrame with a ``text`` column. The column is overwritten in
            place.

    Returns:
        The same DataFrame object, with ``text`` replaced by the cleaned text.
    """
    # First pass: per-document normalization
    df["text"] = df["text"].apply(clean_text)

    # Corpus-wide word counts. clean_text returns non-string cells (e.g. NaN)
    # unchanged, so they are skipped here instead of failing on .split().
    cnt = Counter()
    for text in df["text"]:
        if isinstance(text, str):
            cnt.update(text.split())

    FREQWORDS = {word for word, _ in cnt.most_common(10)}
    # The two least frequent words come from the same counter; no need to
    # join the whole corpus into one string and count it a second time.
    RAREWORDS = {word for word, _ in cnt.most_common()[-2:]}

    # Second pass: drop the frequent and rare words
    df["text"] = df["text"].apply(
        lambda text: clean_text(text, FREQWORDS=FREQWORDS, RAREWORDS=RAREWORDS)
    )

    return df

In [None]:
# Use the function to clean and preprocess the dataframe
df = preprocess_df(df)
df.head()

## Split Data

In [None]:
X = df['text']  # Features
y = df['label'] # Target

In [None]:
print(X[:1])

In [None]:
print(y)

### Feature Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Custom parameters for CountVectorizer
# count_vect = CountVectorizer(max_features=10000, ngram_range=(1, 2))

# Custom parameters for TfidfVectorizer:
# unigrams and bigrams, vocabulary capped at 10,000 features
tfidf_vect = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

In [None]:
# Apply CountVectorizer
# X_count = count_vect.fit_transform(X)


# Apply TfidfVectorizer
# NOTE(review): the vectorizer is fitted on the complete corpus here, and the
# cross-validation folds below are sliced out of X_tfidf afterwards, so the
# vocabulary and IDF weights have seen every test fold. Fitting inside each
# fold (e.g. via a Pipeline) would avoid that -- confirm whether it matters.
X_tfidf = tfidf_vect.fit_transform(X)

In [None]:
# Example of correct data splitting
# X_train_count, X_test_count, y_train, y_test = train_test_split(X_count, y, test_size=0.2, random_state=42)
# X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [None]:
# print(f"X_train_count shape: {X_train_count.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"X_test_count shape: {X_test_count.shape}")
# print(f"y_test shape: {y_test.shape}")

# print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"X_test_tfidf shape: {X_test_tfidf.shape}")
# print(f"y_train shape: {y_test.shape}")

## Model Training - TFIDF

### 1. Logistic Regression

In [None]:
# Initialize Logistic Regression model
log_reg_tf = LogisticRegression(max_iter=1000, solver='lbfgs', C=1.0, penalty='l2')

# Stratified 5-fold cross-validation. The shuffle is seeded so that re-running
# the cell reproduces the same folds (and therefore the same metrics).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold 2x2 confusion matrices (integer counts)
conf_matrix_sum = np.zeros((2, 2), dtype=int)

# Per-fold metric values
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform k-fold cross-validation manually
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}")

    # Row-slice the sparse matrix; use .iloc on y because the indices are positional
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the logistic regression model on this fold's training split
    log_reg_tf.fit(X_train, y_train)

    # Predict the held-out split
    y_pred = log_reg_tf.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = reliable, 1 = unreliable)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_sum += conf_matrix

    # Calculate and store metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))

# Print the final averaged metrics
print("\n=== Final Averaged Metrics Across All Folds ===")
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')
print(f'Mean Precision: {np.mean(precision_scores):.4f}')
print(f'Mean Recall: {np.mean(recall_scores):.4f}')

# Print the final summed confusion matrix
print("\n=== Final Summed Confusion Matrix Across All Folds ===")
print(conf_matrix_sum.astype(int))

In [None]:
# Heatmap of the confusion matrix summed over all folds
conf_matrix = conf_matrix_sum.astype(int)
plt.figure(figsize=(6,4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Reliable', 'Unreliable'],
    yticklabels=['Reliable', 'Unreliable'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Logistic Regression with TF-IDF')
plt.show()

In [None]:
# Save the trained model to a file

# joblib.dump(log_reg_tf, 'logistic_regression_model_tfidf.pkl')

### 2. Naive Bayes

In [None]:
# Initialize the Naive Bayes model
naive_bayes_tf = MultinomialNB(alpha=1.0, fit_prior=True)

# Stratified 5-fold cross-validation. The shuffle is seeded so that re-running
# the cell reproduces the same folds (and therefore the same metrics).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold 2x2 confusion matrices (integer counts)
conf_matrix_sum = np.zeros((2, 2), dtype=int)

# Per-fold metric values
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform k-fold cross-validation manually
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}")

    # Row-slice the sparse matrix; use .iloc on y because the indices are positional
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the Naive Bayes model on this fold's training split
    naive_bayes_tf.fit(X_train, y_train)

    # Predict the held-out split
    y_pred = naive_bayes_tf.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = reliable, 1 = unreliable)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_sum += conf_matrix

    # Calculate and store metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))

# Print the final averaged metrics
print("\n=== Final Averaged Metrics Across All Folds ===")
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')
print(f'Mean Precision: {np.mean(precision_scores):.4f}')
print(f'Mean Recall: {np.mean(recall_scores):.4f}')

# Print the final summed confusion matrix
print("\n=== Final Summed Confusion Matrix Across All Folds ===")
print(conf_matrix_sum.astype(int))

In [None]:
# Heatmap of the confusion matrix summed over all folds
conf_matrix = conf_matrix_sum.astype(int)
plt.figure(figsize=(6,4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Reliable', 'Unreliable'],
    yticklabels=['Reliable', 'Unreliable'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Naive Bayes with TF-IDF')
plt.show()

In [None]:
# Save the trained model to a file

# joblib.dump(naive_bayes_tf, 'naive_bayes_model_tfidf.pkl')

### 3. Stochastic Gradient Descent (SGD)

In [None]:
# Initialize the Stochastic Gradient Descent (SGD) Classifier
sgd_tf = SGDClassifier(loss='hinge', penalty='l2', max_iter=1000, tol=1e-3, random_state=42)

# Stratified 5-fold cross-validation. The shuffle is seeded so that re-running
# the cell reproduces the same folds (and therefore the same metrics).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold 2x2 confusion matrices (integer counts)
conf_matrix_sum = np.zeros((2, 2), dtype=int)

# Per-fold metric values
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform k-fold cross-validation manually
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}")

    # Row-slice the sparse matrix; use .iloc on y because the indices are positional
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the SGD classifier on this fold's training split
    sgd_tf.fit(X_train, y_train)

    # Predict the held-out split
    y_pred = sgd_tf.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = reliable, 1 = unreliable)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_sum += conf_matrix

    # Calculate and store metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))

# Print the final averaged metrics
print("\n=== Final Averaged Metrics Across All Folds ===")
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')
print(f'Mean Precision: {np.mean(precision_scores):.4f}')
print(f'Mean Recall: {np.mean(recall_scores):.4f}')

# Print the final summed confusion matrix
print("\n=== Final Summed Confusion Matrix Across All Folds ===")
print(conf_matrix_sum.astype(int))

In [None]:
# Heatmap of the confusion matrix summed over all folds
conf_matrix = conf_matrix_sum.astype(int)
plt.figure(figsize=(6,4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Reliable', 'Unreliable'],
    yticklabels=['Reliable', 'Unreliable'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - SGD Classifier with TF-IDF')
plt.show()

In [None]:
# Save the trained model to a file.
# After the cross-validation loop the estimator only holds the fit from the
# last fold's training split, so refit on the full TF-IDF matrix first.
# NOTE(review): tfidf_vect is not persisted in the cells shown; scoring new
# text with this file needs the same fitted vectorizer -- confirm it is saved.
sgd_tf.fit(X_tfidf, y)
joblib.dump(sgd_tf, 'stochastic_gradient_descent_model_tfidf.pkl')

### 4. K-Nearest Neighbors (KNN)

In [None]:
# Initialize the KNN model
knn_tf = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)  # p=2 means Euclidean distance

# Stratified 5-fold cross-validation. The shuffle is seeded so that re-running
# the cell reproduces the same folds (and therefore the same metrics).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold 2x2 confusion matrices (integer counts)
conf_matrix_sum = np.zeros((2, 2), dtype=int)

# Per-fold metric values
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform k-fold cross-validation manually
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}")

    # Row-slice the sparse matrix; use .iloc on y because the indices are positional
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the KNN model on this fold's training split
    knn_tf.fit(X_train, y_train)

    # Predict the held-out split
    y_pred = knn_tf.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = reliable, 1 = unreliable)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_sum += conf_matrix

    # Calculate and store metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))

# Print the final averaged metrics
print("\n=== Final Averaged Metrics Across All Folds ===")
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')
print(f'Mean Precision: {np.mean(precision_scores):.4f}')
print(f'Mean Recall: {np.mean(recall_scores):.4f}')

# Print the final summed confusion matrix
print("\n=== Final Summed Confusion Matrix Across All Folds ===")
print(conf_matrix_sum.astype(int))

In [None]:
# Heatmap of the confusion matrix summed over all folds
conf_matrix = conf_matrix_sum.astype(int)
plt.figure(figsize=(6,4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Reliable', 'Unreliable'],
    yticklabels=['Reliable', 'Unreliable'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - K-Nearest Neighbors with TF-IDF')
plt.show()

In [None]:
# Save the trained model to a file

# joblib.dump(knn_tf, 'k-nearest_neighbors_model_tfidf.pkl')

### 5. Decision Tree

In [None]:
# Initialize the Decision Tree model
decision_tree_tf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, random_state=42)

# Stratified 5-fold cross-validation. The shuffle is seeded so that re-running
# the cell reproduces the same folds (and therefore the same metrics).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold 2x2 confusion matrices (integer counts)
conf_matrix_sum = np.zeros((2, 2), dtype=int)

# Per-fold metric values
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform k-fold cross-validation manually
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}")

    # Row-slice the sparse matrix; use .iloc on y because the indices are positional
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the decision tree on this fold's training split
    decision_tree_tf.fit(X_train, y_train)

    # Predict the held-out split
    y_pred = decision_tree_tf.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = reliable, 1 = unreliable)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_sum += conf_matrix

    # Calculate and store metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))

# Print the final averaged metrics
print("\n=== Final Averaged Metrics Across All Folds ===")
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')
print(f'Mean Precision: {np.mean(precision_scores):.4f}')
print(f'Mean Recall: {np.mean(recall_scores):.4f}')

# Print the final summed confusion matrix
print("\n=== Final Summed Confusion Matrix Across All Folds ===")
print(conf_matrix_sum.astype(int))

# The former hold-out evaluation that followed here was removed: it used
# X_train_tfidf / X_test_tfidf, which are never defined (the train_test_split
# that produced them is commented out), so it raised NameError. The stray
# re-creation of knn_tf, which replaced the fitted KNN model with an unfitted
# one, was removed as well.

In [None]:
# Heatmap of the confusion matrix summed over all folds
conf_matrix = conf_matrix_sum.astype(int)
plt.figure(figsize=(6,4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Reliable', 'Unreliable'],
    yticklabels=['Reliable', 'Unreliable'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Decision Trees with TF-IDF')
plt.show()

In [None]:
# Save the trained model to a file.
# After the cross-validation loop the estimator only holds the fit from the
# last fold's training split, so refit on the full TF-IDF matrix first.
# NOTE(review): tfidf_vect is not persisted in the cells shown; scoring new
# text with this file needs the same fitted vectorizer -- confirm it is saved.
decision_tree_tf.fit(X_tfidf, y)
joblib.dump(decision_tree_tf, 'decision_tree_model_tfidf.pkl')

### 6. Random Forest

In [None]:
# Initialize the Random Forest model
random_forest_tf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, random_state=42, n_jobs=-1)

# Stratified 5-fold cross-validation. The shuffle is seeded so that re-running
# the cell reproduces the same folds (and therefore the same metrics).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold 2x2 confusion matrices (integer counts)
conf_matrix_sum = np.zeros((2, 2), dtype=int)

# Per-fold metric values
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform k-fold cross-validation manually
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}")

    # Row-slice the sparse matrix; use .iloc on y because the indices are positional
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the random forest on this fold's training split
    random_forest_tf.fit(X_train, y_train)

    # Predict the held-out split
    y_pred = random_forest_tf.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = reliable, 1 = unreliable)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_sum += conf_matrix

    # Calculate and store metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))

# Print the final averaged metrics
print("\n=== Final Averaged Metrics Across All Folds ===")
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')
print(f'Mean Precision: {np.mean(precision_scores):.4f}')
print(f'Mean Recall: {np.mean(recall_scores):.4f}')

# Print the final summed confusion matrix
print("\n=== Final Summed Confusion Matrix Across All Folds ===")
print(conf_matrix_sum.astype(int))

# The former hold-out evaluation that followed here was removed: it was a
# copy of the Decision Tree code (it scored decision_tree_tf, not the forest)
# and used X_train_tfidf / X_test_tfidf, which are never defined, so it raised
# NameError. The stray re-creation of knn_tf was removed as well.

In [None]:
# Heatmap of the confusion matrix summed over all folds
conf_matrix = conf_matrix_sum.astype(int)
plt.figure(figsize=(6,4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Reliable', 'Unreliable'],
    yticklabels=['Reliable', 'Unreliable'],
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest with TF-IDF')
plt.show()

In [None]:
# Save the trained model to a file.
# After the cross-validation loop the estimator only holds the fit from the
# last fold's training split, so refit on the full TF-IDF matrix first.
# NOTE(review): tfidf_vect is not persisted in the cells shown; scoring new
# text with this file needs the same fitted vectorizer -- confirm it is saved.
random_forest_tf.fit(X_tfidf, y)
joblib.dump(random_forest_tf, 'random_forest_model_tfidf.pkl')

### 7. Support Vector Machine (SVM)

In [None]:
# ================ without cross validation =================

# # Initialize the Support Vector Machine model
# svm_tf = SVC(kernel='linear', C=1.0, random_state=42)

# # Train the model on the TF-IDF features
# svm_tf.fit(X_train_tfidf, y_train)

# # Predict on the test set
# y_pred_tfidf_svm = svm_tf.predict(X_test_tfidf)

# # Evaluation Metrics
# accuracy_tfidf_svm = accuracy_score(y_test, y_pred_tfidf_svm)
# precision_tfidf_svm = precision_score(y_test, y_pred_tfidf_svm)
# recall_tfidf_svm = recall_score(y_test, y_pred_tfidf_svm)
# f1_tfidf_svm = f1_score(y_test, y_pred_tfidf_svm)

# # Print the evaluation results
# print(f"Support Vector Machine with TfidfVectorizer:")
# print(f"Accuracy: {accuracy_tfidf_svm:.4f}")
# print(f"Precision: {precision_tfidf_svm:.4f}")
# print(f"Recall: {recall_tfidf_svm:.4f}")
# print(f"F1 Score: {f1_tfidf_svm:.4f}")

# # Print classification report
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred_tfidf_svm))

# # Plot confusion matrix
# conf_matrix_tfidf_svm = confusion_matrix(y_test, y_pred_tfidf_svm)
# plt.figure(figsize=(6,4))
# sns.heatmap(conf_matrix_tfidf_svm, annot=True, fmt='d', cmap='OrRd', xticklabels=['Non-Rumor', 'Rumor'], yticklabels=['Non-Rumor', 'Rumor'])
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.title('Confusion Matrix - Support Vector Machine with TF-IDF')
# plt.show()

In [None]:
# Initialize the Support Vector Machine model
svm_model_tf = SVC(kernel='linear', C=1.0, random_state=42)

# Stratified 5-fold cross-validation with a seeded shuffle (reproducible folds).
# The previous version of this cell used X_train_tfidf / X_test_tfidf, which
# are never defined (the train_test_split that produced them is commented
# out), so it raised NameError. It now follows the same manual CV loop as the
# other models and fills conf_matrix_sum for the plot in the next cell.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold 2x2 confusion matrices (integer counts)
conf_matrix_sum = np.zeros((2, 2), dtype=int)

# Per-fold metric values
accuracy_scores = []
f1_scores = []
precision_scores = []
recall_scores = []

# Perform k-fold cross-validation manually
for fold, (train_index, test_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}")

    # Row-slice the sparse matrix; use .iloc on y because the indices are positional
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the SVM on this fold's training split
    svm_model_tf.fit(X_train, y_train)

    # Predict the held-out split
    y_pred = svm_model_tf.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (row/column 0 = reliable, 1 = unreliable)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_sum += conf_matrix

    # Calculate and store metrics
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))

# Print the final averaged metrics
print("\n=== Final Averaged Metrics Across All Folds ===")
print(f'Mean Accuracy: {np.mean(accuracy_scores):.4f}')
print(f'Mean F1 Score: {np.mean(f1_scores):.4f}')
print(f'Mean Precision: {np.mean(precision_scores):.4f}')
print(f'Mean Recall: {np.mean(recall_scores):.4f}')

# Print the final summed confusion matrix
print("\n=== Final Summed Confusion Matrix Across All Folds ===")
print(conf_matrix_sum.astype(int))

In [None]:
# Plot confusion matrix
# NOTE(review): this draws conf_matrix_sum, i.e. whatever the most recently
# executed cross-validation loop accumulated. Confirm that loop was the SVM
# run, otherwise the title below does not match the data.
conf_matrix = conf_matrix_sum.astype(int)
plt.figure(figsize=(6,4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Reliable', 'Unreliable'], yticklabels=['Reliable', 'Unreliable'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix - Support Vector Machine with TF-IDF')
plt.show()

In [None]:
# Save the trained model to a file.
# Refit on the full TF-IDF matrix first so the persisted estimator is trained
# on all available rows rather than on whichever subset it was last fitted on.
# NOTE(review): tfidf_vect is not persisted in the cells shown; scoring new
# text with this file needs the same fitted vectorizer -- confirm it is saved.
svm_model_tf.fit(X_tfidf, y)
joblib.dump(svm_model_tf, 'svm_model_tfidf.pkl')