# Logistic Regression 

In [None]:
import pandas as pd

# Locations of the pre-computed feature tables, one CSV per split.
src, src_validation, src_test = (
    'data/training_data_features.csv',
    'data/validation_data_features.csv',
    "data/test_data_features.csv",
)

# Read the splits in training -> validation -> test order.
training_data, validation_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (src, src_validation, src_test)
)

### Logistic Regression with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd

# TF-IDF vectorization of the stemmed text column.
tfidf_vectorizer = TfidfVectorizer()

# pd.read_csv parses empty CSV fields as NaN, and TfidfVectorizer raises on NaN
# documents, so missing text is replaced by an empty string first (an empty
# document simply yields an all-zero TF-IDF row).
# The vocabulary and IDF weights are learned on the training split only; the
# validation split is projected onto that same vocabulary.
tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_data['content_stem'].fillna(''))
tfidf_validation_matrix = tfidf_vectorizer.transform(validation_data['content_stem'].fillna(''))

# Target labels
y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

# Initialize and train the Logistic Regression model (fixed seed for reproducibility)
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(tfidf_training_matrix, y_training_data)

# Predict on the validation set
predictions = logistic_model.predict(tfidf_validation_matrix)

# Report per-class precision / recall / F1 on the validation set
print("LOGISTIC REGRESSION w/ TF-IDF")
print(classification_report(y_validation_data, predictions))

##### Test on Fake News

In [None]:
from sklearn.metrics import classification_report

# Project the Fake News test text onto the TF-IDF vocabulary fitted on the
# training split. Missing text (NaN after pd.read_csv) is replaced with '' because
# TfidfVectorizer raises on NaN documents.
tfidf_test_matrix = tfidf_vectorizer.transform(test_data['content_stem'].fillna(''))

# Target labels for test data
y_test_data = test_data['reliable']

# Predict on test data
test_predictions = logistic_model.predict(tfidf_test_matrix)

# Report per-class precision / recall / F1 on the test set
print("LOGISTIC REGRESSION w/ TF-IDF on Fake News")
print(classification_report(y_test_data, test_predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Tally true labels against the TF-IDF model's predictions.
conf_matrix = confusion_matrix(y_test_data, test_predictions)

# Build and draw the display in one step (plot() hands back the display object).
# values_format='' is applied to each cell's count when rendering its text.
# NOTE(review): the tick labels assume the 'reliable' classes sort as false, true - confirm.
disp = ConfusionMatrixDisplay(
    conf_matrix,
    display_labels=['false', 'true'],
).plot(values_format='')
plt.title('LOGISTIC REGRESSION w/ TF-IDF on Fake News')
plt.show()

##### Test on LIAR

In [None]:
# Load the LIAR test features. This rebinds `test_data` (and the derived
# variables below), replacing the Fake News test split held under the same names.
src_test = "data/liar_dataset/test_features.csv"
test_data = pd.read_csv(src_test)

# Project the LIAR text onto the TF-IDF vocabulary fitted on the training split.
# Missing text (NaN after pd.read_csv) is replaced with '' because TfidfVectorizer
# raises on NaN documents.
tfidf_test_matrix = tfidf_vectorizer.transform(test_data['content_stem'].fillna(''))

# Target labels for test data
y_test_data = test_data['reliable']

# Predict on test data
test_predictions = logistic_model.predict(tfidf_test_matrix)

# Report per-class precision / recall / F1 on the LIAR test set
print("LOGISTIC REGRESSION w/ TF-IDF on LIAR")
print(classification_report(y_test_data, test_predictions))

In [None]:
# Tally true LIAR labels against the TF-IDF model's predictions.
conf_matrix = confusion_matrix(y_test_data, test_predictions)

# Build and draw the display in one step (plot() hands back the display object).
# NOTE(review): the tick labels assume the 'reliable' classes sort as false, true - confirm.
disp = ConfusionMatrixDisplay(
    conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('LOGISTIC REGRESSION w/ TF-IDF on LIAR')
plt.show()

### Logistic Regression with Sentence Transformer

In [None]:
import pandas as pd

# Load, for every split, the embeddings table followed by / alongside the
# features table that carries the labels.

# training split
training_embeddings = pd.read_csv('data/training_data_embeddings.csv')
training_data = pd.read_csv('data/training_data_features.csv')

# validation split
validation_embeddings = pd.read_csv('data/validation_data_embeddings.csv')
validation_data = pd.read_csv('data/validation_data_features.csv')

# test split (features first, then embeddings)
test_data = pd.read_csv('data/test_data_features.csv')

# `src` stays bound to the last path that was read.
src = 'data/test_data_embeddings.csv'
test_embeddings = pd.read_csv(src)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Features are the embedding tables; labels come from the 'reliable' column of
# the matching feature tables.
# NOTE(review): assumes embeddings and features rows are in the same order - confirm.
X_train, Y_train = training_embeddings, training_data['reliable']
X_val, Y_val = validation_embeddings, validation_data['reliable']

# Build the classifier with a fixed seed and fit it in one step
# (fit() returns the estimator itself).
model = LogisticRegression(random_state=42).fit(X_train, Y_train)

# Score the validation split and print the per-class report.
predictions = model.predict(X_val)
print('LOGISTIC REGRESSION w/ TRANSFORMER')
print(classification_report(Y_val, predictions))

##### Test on Fake News

In [None]:
# Test-split inputs (embeddings) and labels ('reliable' column).
X_test, Y_test = test_embeddings, test_data['reliable']

# Predict with the embedding-based model and print the per-class report.
log_predictions = model.predict(X_test)
print('LOGISTIC REGRESSION w/ TRANSFORMER on Fake News')
print(classification_report(Y_test, log_predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Calculate confusion matrix of true Fake News test labels vs. the
# embedding-based model's predictions.
conf_matrix = confusion_matrix(Y_test, log_predictions)

# Plot confusion matrix.
# values_format='' renders every cell as a plain integer count; the default
# formatting may switch to scientific notation for large counts. This matches
# the TF-IDF confusion matrix drawn for the same test set.
# NOTE(review): the tick labels assume the 'reliable' classes sort as false, true - confirm.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['false', 'true'])
disp.plot(values_format='')
plt.title('LOGISTIC REGRESSION w/ TRANSFORMER on Fake News')
plt.show()

##### Test on LIAR

In [None]:
# Swap in the LIAR test set: the features table (for labels) is read first,
# then the embeddings table. This rebinds `test_data` / `test_embeddings`.
test_data = pd.read_csv('data/liar_dataset/test_features.csv')

# `src` stays bound to the last path that was read.
src = 'data/liar_dataset/test_embeddings.csv'
test_embeddings = pd.read_csv(src)

X_test, Y_test = test_embeddings, test_data['reliable']

# Predict with the embedding-based model and print the per-class report.
log_predictions = model.predict(X_test)
print('LOGISTIC REGRESSION w/ TRANSFORMER on LIAR')
print(classification_report(Y_test, log_predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Tally true LIAR labels against the embedding-based model's predictions.
conf_matrix = confusion_matrix(Y_test, log_predictions)

# Build and draw the display in one step (plot() hands back the display object).
# NOTE(review): the tick labels assume the 'reliable' classes sort as false, true - confirm.
disp = ConfusionMatrixDisplay(
    conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('LOGISTIC REGRESSION w/ TRANSFORMER on LIAR')
plt.show()