# Simple Logistic Regression Model

In [None]:
import pandas as pd

# Read the feature tables for the training and validation splits; the first
# CSV column is used as the row index of each frame.
training_data = pd.read_csv('data/training_data_features.csv', index_col=0)
validation_data = pd.read_csv('data/validation_data_features.csv', index_col=0)

## LogReg with Baseline features

Trained on the FakeNews training split and evaluated on the FakeNews validation split.

In [None]:
# Code reference: https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error

# Input columns for the baseline model (order is preserved so that the
# fitted coefficients line up with these names).
features = [
    'date_count', 'url_count', 'exclm_count',
    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
    'stop_reduction_rate', 'stem_reduction_rate',
    'average_sentence_length',
]

# Split each frame into its feature matrix and its 'reliable' label column.
X_training, y_training = training_data[features], training_data['reliable']
X_validation, y_validation = validation_data[features], validation_data['reliable']

# Build the classifier and train it on the training split; fit() returns the
# estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_training, y_training)

# Predict on the validation split and print the per-class metrics.
predictions = model.predict(X_validation)

print('LOGISTIC REGRESSION w/ BASELINE (VALIDATION)')
print(classification_report(y_validation, predictions))

In [None]:
# Map each input column name to its fitted coefficient (first row of coef_).
{name: weight for name, weight in zip(model.feature_names_in_, model.coef_[0])}

### FakeNews Test Results

Evaluation of the trained model on the FakeNews test split.

In [None]:
# Read the test-split feature table (first CSV column is the row index).
test_data = pd.read_csv('data/test_data_features.csv', index_col=0)

# Separate the model inputs from the 'reliable' label column.
X_test, y_test = test_data[features], test_data['reliable']

# Predict with the already-fitted model and print the per-class metrics.
test_predictions = model.predict(X_test)

print('LOGISTIC REGRESSION w/ BASELINE [TEST]')
print(classification_report(y_test, test_predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Tabulate true labels against the model's predictions for the test split.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=test_predictions)

# Draw the matrix; plot() hands back the display object, so it is chained
# onto the constructor and the result is still bound to `disp`.
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=['false', 'true']).plot()
plt.title('LOGISTIC REGRESSION w/ Baseline on Fake News')
plt.show()

### LIAR Test Results

Evaluation of the trained model on the LIAR test set.

In [None]:
# Read the LIAR test feature table (first CSV column is the row index).
liar_data = pd.read_csv('data/liar_dataset/test_features.csv', index_col=0)

# Separate the model inputs from the 'reliable' label column.
X_liar, y_liar = liar_data[features], liar_data['reliable']

# Predict with the already-fitted model and print the per-class metrics.
liar_predictions = model.predict(X_liar)

print('LOGISTIC REGRESSION w/ BASELINE [LIAR TEST]')
print(classification_report(y_liar, liar_predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Tabulate true labels against the model's predictions for the LIAR data.
conf_matrix = confusion_matrix(y_true=y_liar, y_pred=liar_predictions)

# Draw the matrix; plot() hands back the display object, so it is chained
# onto the constructor and the result is still bound to `disp`.
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=['false', 'true']).plot()
plt.title('LOGISTIC REGRESSION w/ Baseline on LIAR')
plt.show()

## LogReg with Baseline and Meta-Data features



In [None]:
# Code reference: https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Input columns: the baseline set followed by one meta-data column.
features = [
    'date_count', 'url_count', 'exclm_count',
    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
    'stop_reduction_rate', 'stem_reduction_rate',
    'average_sentence_length',
    'has_author',  # meta-data
]

# Split each frame into its feature matrix and its 'reliable' label column.
X_training, y_training = training_data[features], training_data['reliable']
X_validation, y_validation = validation_data[features], validation_data['reliable']

# Build the classifier and train it on the training split; fit() returns the
# estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_training, y_training)

# Predict on the validation split and print the per-class metrics.
predictions = model.predict(X_validation)

print('LOGISTIC REGRESSION w/ BASELINE, META-DATA [VALIDATION]')
print(classification_report(y_validation, predictions))

In [None]:
# Map each input column name to its fitted coefficient (first row of coef_).
{name: weight for name, weight in zip(model.feature_names_in_, model.coef_[0])}

## LogReg with BBC articles added to training data

In [None]:
# Read the BBC article feature table (first CSV column is the row index).
bbc_data = pd.read_csv('data/articles_features.csv', index_col=0)

In [None]:
# Stack the BBC rows underneath the original training rows (row-wise concat).
training_data_extended = pd.concat(objs=[training_data, bbc_data], axis=0)

In [None]:
# Percentage distribution of the 'type' column in the extended training set:
# relative frequencies scaled to percent.
type_dist = training_data_extended['type'].value_counts(normalize=True).mul(100)
print(type_dist)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Input columns for this run: the baseline set only (no meta-data column).
features = [
    'date_count', 'url_count', 'exclm_count',
    'content_word_freq', 'stop_word_freq', 'stem_word_freq',
    'stop_reduction_rate', 'stem_reduction_rate',
    'average_sentence_length',
]

# Training inputs/labels come from the extended frame (original training rows
# plus BBC rows); validation inputs/labels come from the validation frame.
X_training, y_training = training_data_extended[features], training_data_extended['reliable']
X_validation, y_validation = validation_data[features], validation_data['reliable']

# Build the classifier and train it on the extended set; fit() returns the
# estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_training, y_training)

# Predict on the validation split and print the per-class metrics.
predictions = model.predict(X_validation)

print('LOGISTIC REGRESSION w/ EXTENDED TRANING SET (BBC ARTICLES) [VALIDATION]')
print(classification_report(y_validation, predictions))

In [None]:
# Map each input column name to its fitted coefficient (first row of coef_).
{name: weight for name, weight in zip(model.feature_names_in_, model.coef_[0])}