In [2]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 
import os
import sys
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import metrics

LR classifier: 
The grid-search parameters initialised for the logistic regression classifier are described and justified below. For more on the parameters discussed, [see the scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression).

- solvers = ["lbfgs", "saga", "liblinear"]

The solver determines the algorithm used for the optimization. Different solvers are available; 
'lbfgs' is robust and is the default solver of the ```scikit-learn``` LogisticRegression. Additionally, 'saga' and 'liblinear' are included in the tuning. 'liblinear' is recommended for smaller datasets, which the *Fake or Real News* dataset is, and 'saga' performs well overall. 

- penalties = ["l1", "l2"]

*Open question: should 'lbfgs' be included? It does not support the 'l1' penalty, so it can only be combined with the 'l2' penalty.*

The penalties represent different regularization techniques, which help balance model fit against model complexity. Not all solvers support all penalties; in this case, the 'l1' and 'l2' penalties have been included in the grid search. 

- C = [1.0, 0.1, 0.01]

The C hyperparameter is the inverse of the regularization strength: the smaller the value, the stronger the regularization and the simpler the model; conversely, a higher value relaxes the regularization and allows for a more complex model. The default is 1.0, and 0.1 and 0.01 have also been included as parameters in the grid search. 

In [7]:
# Unpack the five objects stored (in this order) in the feature pickle.
# Presumably these are the label vectors, the feature matrices and the
# feature names produced by an earlier step -- verify against the code that
# wrote the file.
# NOTE(review): unpickling executes arbitrary code, so only load a
# 'features.pkl' that was produced by this project.
(
    y_train,
    y_test,
    X_train,
    X_test,
    feature_names,
) = pd.read_pickle('out/features.pkl')

In [15]:
# Sanity check: display the shape of X_test (the saved output below shows (1267, 500)).
X_test.shape


(1267, 500)

In [10]:
# Initialise the default model inside a pipeline; the step name 'classifier'
# is what the 'classifier__<param>' keys in the grid below refer to.
# A fixed random_state is passed because the 'saga' and 'liblinear' solvers
# shuffle the data, so without it the grid scores differ between runs.
RANDOM_SEED = 42
pipe = Pipeline([('classifier', LogisticRegression(random_state=RANDOM_SEED))])  # able to pass multiple steps to test

# Tunable parameters for the grid search - other parameters can be set
# (see the scikit-learn LogisticRegression documentation).
penalties = ['l1', 'l2']                  # different regularization techniques
C = [1.0, 0.1, 0.01]                      # different regularization 'strengths' (smaller = stronger)
solvers = ['liblinear', 'saga', 'lbfgs']  # different optimisation algorithms

# 'lbfgs' does not support the 'l1' penalty, so a single dict crossing every
# solver with every penalty would contain combinations that cannot be fitted.
# GridSearchCV also accepts a list of grids; one grid per compatible
# solver/penalty family keeps only valid candidates.
l1_capable_solvers = [solver for solver in solvers if solver != 'lbfgs']
parameters = [
    dict(classifier__penalty=penalties,     # notice how we use the name 'classifier'
         classifier__C=C,
         classifier__solver=l1_capable_solvers),
    dict(classifier__penalty=['l2'],        # the only penalty here that 'lbfgs' supports
         classifier__C=C,
         classifier__solver=['lbfgs']),
]

# Metrics to optimise for, one grid search per metric
scores = ['precision', 'recall', 'f1']

In [9]:
# Run one grid search per optimisation metric, then report the
# cross-validation scores of every candidate and the hold-out performance of
# the best one.
for score in scores:
    print(f"# Tuning hyper-parameters for {score}")
    print()

    # Exhaustive search over `parameters` for the pipeline `pipe`, ranked by
    # the weighted-average variant of the current metric.
    # cv=10 requests 10-fold cross-validation: every candidate is fitted ten
    # times, each time validated on a different fold. An integer cv does not
    # shuffle the data, so all candidates are compared on the same folds.
    clf = GridSearchCV(pipe,
                       parameters,
                       scoring=f"{score}_weighted",
                       cv=10)
    clf.fit(X_train, y_train)

    # Best candidate according to the mean cross-validated score
    print("Best parameters set found on training data:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on training data:")
    print()
    # Per-candidate mean and standard deviation of the validation-fold scores,
    # plus the parameter combination each candidate used
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    params = clf.cv_results_['params']

    # One line per candidate. The value printed after "SD=±" is TWO standard
    # deviations, i.e. roughly the 95% spread under the 68-95-99.7 rule
    # (assuming approximately normally distributed fold scores).
    for i, (mean, stdev, param) in enumerate(zip(means, stds, params)):
        print(f"Run {i}: {round(mean,3)} (SD=±{round(stdev*2, 3)}), using {param}")
    print()

    # Hold-out evaluation: predict() uses the best candidate, which
    # GridSearchCV refits on the whole training set by default.
    print("Detailed classification report:")
    print()
    print("The model is trained on the full training set.")
    print("The scores are computed on the full test set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Best parameters set found on training data:

{'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}

Grid scores on training data:

Run 0: 0.891 (SD=±0.024), using {'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Run 1: 0.891 (SD=±0.023), using {'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Run 2: 0.885 (SD=±0.034), using {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Run 3: 0.885 (SD=±0.034), using {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Run 4: 0.824 (SD=±0.024), using {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Run 5: 0.823 (SD=±0.024), using {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Run 6: 0.851 (SD=±0.033), using {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Run 7: 0.852 (SD=±0.033



Best parameters set found on training data:

{'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}

Grid scores on training data:

Run 0: 0.891 (SD=±0.024), using {'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Run 1: 0.891 (SD=±0.024), using {'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Run 2: 0.885 (SD=±0.034), using {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Run 3: 0.885 (SD=±0.034), using {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Run 4: 0.822 (SD=±0.025), using {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Run 5: 0.82 (SD=±0.025), using {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Run 6: 0.851 (SD=±0.033), using {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Run 7: 0.851 (SD=±0.033)



Best parameters set found on training data:

{'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}

Grid scores on training data:

Run 0: 0.891 (SD=±0.024), using {'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Run 1: 0.891 (SD=±0.023), using {'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Run 2: 0.885 (SD=±0.034), using {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Run 3: 0.885 (SD=±0.034), using {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Run 4: 0.822 (SD=±0.025), using {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Run 5: 0.821 (SD=±0.025), using {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Run 6: 0.851 (SD=±0.033), using {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Run 7: 0.851 (SD=±0.033

