In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Choose the model and the data to test against

In [None]:
# Available choices; the selection below is made by list index.
models = ['Simple', 'Advansed']
data = ['FakeNews', 'Liar']

# Active model variant and evaluation data set used by the cells that follow.
model, test = models[1], data[0]

## Load data

In [None]:
# Directory holding the CSV splits, relative to the notebook's working directory.
# Paths are built with forward slashes: they resolve on Windows, Linux and macOS,
# whereas a literal backslash is only a path separator on Windows.
DATA_DIR = 'files'

# Feature frames (X_*) and label frames (y_*) for each split.
X_train = pd.read_csv(f'{DATA_DIR}/X_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')
X_val = pd.read_csv(f'{DATA_DIR}/X_val.csv')
y_train = pd.read_csv(f'{DATA_DIR}/y_train.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test.csv')
y_val = pd.read_csv(f'{DATA_DIR}/y_val.csv')
# Second data set, selected for evaluation when `test` is 'Liar'.
X_liar = pd.read_csv(f'{DATA_DIR}/X_liar.csv')
y_liar = pd.read_csv(f'{DATA_DIR}/y_liar.csv')

## Vectorize the chosen prediction data 

In [None]:
def Vectorizer(train, test, val, liar):
    """Turn the 'content' column of every frame into a bag-of-words matrix.

    The vectorizer type depends on the module-level `model` selection:
    a capped CountVectorizer for the first entry of `models`, otherwise a
    TfidfVectorizer. All other columns are dropped.

    Args:
        train: frame used to fit the vocabulary; also transformed.
        test, val, liar: frames transformed with the vocabulary learned
            from `train` (they are never used for fitting).

    Returns:
        Tuple (train, test, val, liar) of the transformed matrices, in that order.
    """
    use_simple = model == models[0]
    if use_simple:
        text_vectorizer = CountVectorizer(stop_words='english', max_df=0.75, max_features=1000)
    else:
        text_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)

    transformer = make_column_transformer(
        (text_vectorizer, 'content'),
        remainder='drop',
    )

    # Fit on the training frame only, then reuse the fitted transformer.
    train_vec = transformer.fit_transform(train)
    test_vec, val_vec, liar_vec = (
        transformer.transform(frame) for frame in (test, val, liar)
    )

    return train_vec, test_vec, val_vec, liar_vec

# Vectorize every split with a vocabulary fitted on the training frame.
X_train_vec, X_test_vec, X_val_vec, X_liar_vec = Vectorizer(
    train=X_train, test=X_test, val=X_val, liar=X_liar
)

## Models

In [None]:
def SimpelModel():
    """Build the unfitted baseline: a scaler step followed by logistic regression.

    The StandardScaler is created with both centering (`with_mean`) and
    scaling (`with_std`) switched off.

    Returns:
        An unfitted scikit-learn pipeline.
    """
    scaler = StandardScaler(with_mean=False, with_std=False)
    classifier = LogisticRegression(random_state=0, max_iter=1000)
    return make_pipeline(scaler, classifier)

def AdvancedModel():
    """Build the unfitted advanced classifier: a linear support vector machine.

    `random_state` is pinned (to the same seed SimpelModel uses) so repeated
    runs give the same model: per the scikit-learn documentation, LinearSVC
    shuffles data with a pseudo-random generator when it solves the dual
    problem, which `dual='auto'` may select.

    Returns:
        An unfitted LinearSVC.
    """
    model = LinearSVC(max_iter=1000, dual='auto', random_state=0)
    return model

## Train model

In [None]:
def TrainModel(X, y):
    """Fit the classifier chosen by the module-level `model` selection.

    Args:
        X: vectorized training features.
        y: label frame; its `type` column is used as the target.

    Returns:
        The fitted estimator (SimpelModel for the first entry of `models`,
        AdvancedModel otherwise).
    """
    build_estimator = SimpelModel if model == models[0] else AdvancedModel
    return build_estimator().fit(X, y.type)

# Fit the selected classifier on the vectorized training split.
trained_model = TrainModel(X=X_train_vec, y=y_train)

## Predict and test score

In [None]:
def Predict(model, X_data):
    """Return the predictions `model.predict` produces for `X_data`."""
    return model.predict(X_data)

def ConfMatrix(y_data, y_pred):
    """Print the 2x2 confusion matrix for labels [1, 0] and return its counts.

    Label 1 is treated as the positive class, so the matrix is read as
    [[TP, FN], [FP, TN]].

    Args:
        y_data: true labels.
        y_pred: predicted labels.

    Returns:
        Tuple (TP, TN, FP, FN).
    """
    matrix = confusion_matrix(y_data, y_pred, labels=[1, 0])

    # First row belongs to true label 1, second row to true label 0.
    (TP, FN), (FP, TN) = matrix

    print(f'Confusion matrix:\n {matrix}')
    print('\n')
    print(f'TP: {TP}')
    print(f'TN: {TN}')
    print(f'FP: {FP}')
    print(f'FN: {FN}')
    print('\n')

    counts = (TP, TN, FP, FN)
    return counts

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


def Score(TP, TN, FP, FN):
    """Print accuracy, precision, recall and F1 for the given confusion counts.

    A metric whose denominator is zero (no positive predictions, no positive
    labels, or no samples at all) is reported as 0.0 instead of raising
    ZeroDivisionError or producing NaN.

    Args:
        TP: number of true positives.
        TN: number of true negatives.
        FP: number of false positives.
        FN: number of false negatives.

    Returns:
        None; the four metrics are only printed.
    """
    accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    # F1 is the harmonic mean of precision and recall; 0.0 when both are 0.
    f1 = 2 * _safe_ratio(precision * recall, precision + recall)
    print(f'Accuracy: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'F1: {f1}')
    
def TestModel(model, X_data, y_data):
    """Predict on `X_data` and print the confusion matrix and score metrics.

    Args:
        model: fitted estimator exposing `predict`.
        X_data: vectorized features to predict on.
        y_data: true labels. When this is a frame with a `type` column, that
            column is used as the label vector, matching how TrainModel and
            CrossValidate select their targets; anything else is passed
            through unchanged.
    """
    # Hand a single label column to the metrics instead of a whole frame, so
    # an extra column in the label CSV cannot break the confusion matrix.
    columns = getattr(y_data, 'columns', None)
    if columns is not None and 'type' in columns:
        y_true = y_data['type']
    else:
        y_true = y_data

    y_pred = Predict(model, X_data)
    TP, TN, FP, FN = ConfMatrix(y_true, y_pred)
    Score(TP, TN, FP, FN)

# Evaluate on the Liar data unless the first entry of `data` is selected,
# in which case the validation split is used.
if test != data[0]:
    TestModel(trained_model, X_liar_vec, y_liar)
else:
    TestModel(trained_model, X_val_vec, y_val)

## Cross-validate

In [None]:
def CrossValidate(model, X_path, y_path):
    """Run 5-fold cross-validation of `model` on TF-IDF features and print the scores.

    The vectorizer is part of the pipeline, so it is refitted on the training
    portion of every fold.

    Args:
        model: unfitted estimator used as the final pipeline step.
        X_path: path to a CSV file with a 'content' text column.
        y_path: path to a CSV file with a 'type' label column.

    Returns:
        None; the mean and per-fold scores are only printed.
    """
    df_X = pd.read_csv(X_path)
    df_y = pd.read_csv(y_path)

    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)

    col_vectorizer = make_column_transformer((vectorizer, 'content'),
                                             remainder='drop')

    # make_pipeline takes the steps as separate positional arguments; passing
    # them wrapped in a list creates a single unusable "list" step.
    pipeline = make_pipeline(col_vectorizer, model)

    # Seed the shuffle so the fold assignment is reproducible between runs.
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    score = cross_val_score(pipeline, df_X, df_y.type, cv=cv)

    # Scores are fractions in [0, 1]; format the mean as a real percentage.
    print(f'K Fold Accuracy: {score.mean():.2%} (per fold: {score})')

#CrossValidate(AdvancedModel(), 'files/X_full.csv', 'files/y_full.csv')