### notes assignment 2

#### set up 

In [None]:
import os
import pickle
import sys

import pandas as pd
import numpy as np
import scipy as sp
from joblib import dump, load
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

### Functions 

### vectorizer.py script 

In [None]:
# Load the fake-or-real news CSV (first column used as the index) and hold out
# 20% of the rows as a test set; the fixed seed keeps the split reproducible
filepath_data = os.path.join("..", "in", "fake_or_real_news.csv")
news = pd.read_csv(filepath_data, index_col=0)
X, y = news["text"], news["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define the TF-IDF vectorizer: lowercased unigrams and bigrams, with document
# frequency cut-offs of 0.95 (max) and 0.05 (min) and at most 500 features
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    max_df=0.95,
    min_df=0.05,
    max_features=500,
)

In [None]:
# Fit the vectorizer on the training texts only and transform them in one step
X_train_features = vectorizer.fit_transform(X_train)
# Names of the features extracted by the fitted vectorizer
feature_names = vectorizer.get_feature_names_out()
# Transform the test texts with the already-fitted vectorizer (no refitting)
X_test_features = vectorizer.transform(X_test)

In [None]:
# Persist the fitted vectorizer in the 'models' folder
vectorizer_path = "../models/tfidf_vectorizer.joblib"
dump(vectorizer, vectorizer_path)

In [None]:
# Save the train/test split, the vectorized matrices and the feature names as a
# single pickled list in the 'out' folder; the classifier sections reload it
# and unpack the items in this same order
vectorized_data = [X_train, X_test, y_train, y_test, X_train_features, X_test_features, feature_names]

# The context manager closes the file even if pickling raises
with open('../out/features.pkl', 'wb') as f:
    pickle.dump(vectorized_data, f)

### LR_classifier.py script

In [None]:
# Reload the pickled list written by the vectorizer section and unpack it in
# the order it was saved
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_train_features,
    X_test_features,
    feature_names,
) = pd.read_pickle('../out/features.pkl')

In [None]:
# Train a logistic regression classifier on the vectorized training set
classifier_LR = LogisticRegression(random_state=42)
classifier_LR.fit(X_train_features, y_train)

# Store the fitted model in the 'models' folder
dump(classifier_LR, "../models/classifier_LR.joblib")

# Predict labels for the held-out test set
y_pred_LR = classifier_LR.predict(X_test_features)

In [None]:
# Calculate evaluation metrics and save them as a txt file in the 'out' folder
classifier_LR_metrics = metrics.classification_report(y_test, y_pred_LR)

# No save_metrics helper is defined or imported in this notebook, so the report
# text is written directly. The file name mirrors the saved model's name so the
# LR and MLP reports end up in separate files.
with open("../out/classifier_LR_metrics.txt", "w", encoding="utf-8") as report_file:
    report_file.write(classifier_LR_metrics)

### MLP_classifier.py script

In [None]:
# Reload the pickled list written by the vectorizer section and unpack it in
# the order it was saved
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_train_features,
    X_test_features,
    feature_names,
) = pd.read_pickle('../out/features.pkl')

In [None]:
# Train an MLP classifier (one hidden layer of 20 units, logistic activation)
# on the vectorized training set
classifier_MLP = MLPClassifier(
    hidden_layer_sizes=(20,),
    activation="logistic",
    max_iter=1000,
    random_state=42,
)
classifier_MLP.fit(X_train_features, y_train)

# Store the fitted model in the 'models' folder
dump(classifier_MLP, "../models/classifier_MLP.joblib")

# Predict labels for the held-out test set
y_pred_MLP = classifier_MLP.predict(X_test_features)

In [None]:
# Calculate evaluation metrics and save them as a txt file in the 'out' folder
classifier_MLP_metrics = metrics.classification_report(y_test, y_pred_MLP)

# No save_metrics helper is defined or imported in this notebook, so the report
# text is written directly. The file name mirrors the saved model's name so the
# MLP and LR reports end up in separate files.
with open("../out/classifier_MLP_metrics.txt", "w", encoding="utf-8") as report_file:
    report_file.write(classifier_MLP_metrics)