pip install scikit-learn matplotlib pandas numpy seaborn 

In [1]:
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp

from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

from joblib import dump, load

In [4]:
# Data loading and handling: read the news CSV and pick the text/label columns
filepath_data = os.path.join("..", "in", "fake_or_real_news.csv")
news = pd.read_csv(filepath_data, index_col=0)
X, y = news["text"], news["label"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Vectoriser: TF-IDF over lowercased unigrams and bigrams.
# Terms found in more than 95% or fewer than 5% of the documents are dropped,
# and at most 500 features are kept.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    max_df=0.95,
    min_df=0.05,
    max_features=500,
)

In [6]:
# Fit the vectoriser on the training texts only, then reuse the fitted
# vocabulary to transform the test texts.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)
# Feature names as reported by the fitted vectoriser
feature_names = vectorizer.get_feature_names_out()

In [8]:
# Persist the fitted vectoriser with joblib
dump(vectorizer, "../models/tfidf_vectorizer.joblib")

['../models/tfidf_vectorizer.joblib']

In [3]:
# Directory that the data splits and feature matrices are written to and listed from
filepath_features = "../features"

In [8]:
# File stems and the objects saved under them; the two lists are paired by position
vectorized_data_names = [
    "X_train",
    "X_test",
    "y_train",
    "y_test",
    "X_train_features",
    "X_test_features",
    "feature_names",
]
vectorized_data = [
    X_train,
    X_test,
    y_train,
    y_test,
    X_train_features,
    X_test_features,
    feature_names,
]

In [9]:
def save_data(data, name, filepath):
    """Save one object into the directory ``filepath``, choosing the format by type.

    * SciPy sparse matrix/array -> ``<name>.npz`` via ``scipy.sparse.save_npz``
    * pandas Series             -> ``<name>.csv`` (the index is written as well)
    * NumPy ndarray             -> ``<name>.dat`` via ``ndarray.dump`` (pickle)

    Parameters
    ----------
    data : object
        The object to persist.
    name : str
        File stem (file name without extension).
    filepath : str
        Directory to write into; it is created when missing.

    Returns
    -------
    str or None
        Path of the file that was written, or ``None`` when the type of
        ``data`` is not supported (a message is printed in that case and
        nothing is written).
    """
    # isinstance/issparse instead of exact type() comparisons against a private
    # SciPy module: subclasses and other sparse formats are recognised too.
    if sp.sparse.issparse(data):
        extension = 'npz'
    elif isinstance(data, pd.Series):
        extension = 'csv'
    elif isinstance(data, np.ndarray):
        extension = 'dat'
    else:
        print(f'{name}: not a sparse matrix, Series or ndarray')
        return None

    # Make sure the target directory exists before writing into it
    if filepath:
        os.makedirs(filepath, exist_ok=True)
    target = os.path.join(filepath, f'{name}.{extension}')

    if extension == 'npz':
        sp.sparse.save_npz(target, data, compressed=True)
    elif extension == 'csv':
        data.to_csv(target)
    else:
        # ndarray.dump pickles the array; it is read back with
        # np.load(..., allow_pickle=True)
        data.dump(target)
    return target

In [10]:
# Write every object into the features directory under its paired name
for data, name in zip(vectorized_data, vectorized_data_names):
    save_data(data=data, name=name, filepath=filepath_features)

In [19]:
def load_data(name, filepath, form):
    """Load one object from ``filepath`` according to its format ``form``.

    Counterpart of ``save_data``:

    * ``'npz'`` -> SciPy sparse matrix via ``scipy.sparse.load_npz``
    * ``'csv'`` -> pandas object read with the first column as index; a
      single remaining column is returned as a Series
    * ``'dat'`` -> NumPy array via ``np.load`` (pickle)

    Parameters
    ----------
    name : str
        Label for the object; only used in the message for unknown formats.
    filepath : str
        Full path of the file to read.
    form : str
        File format, i.e. the extension without the dot.

    Returns
    -------
    object or None
        The loaded data, or ``None`` when ``form`` is not supported (a
        message is printed in that case).
    """
    if form == 'npz':
        return sp.sparse.load_npz(filepath)
    if form == 'csv':
        # save_data writes the Series index as the first column, so restore it
        # as the index and collapse the single data column back into a Series.
        return pd.read_csv(filepath, index_col=0).squeeze("columns")
    if form == 'dat':
        # NOTE(security): allow_pickle=True unpickles the file, which can run
        # arbitrary code. Only load .dat files this notebook wrote itself.
        return np.load(filepath, allow_pickle=True)

    # Return None rather than the name string, so a failed load can never be
    # mistaken for data.
    print(f'{name}: unsupported format {form!r} (expected npz, csv or dat)')
    return None

In [23]:
# Print the name and format of every file in the features directory
for file in os.listdir(filepath_features):
    filepath = os.path.join(filepath_features, file)
    if not os.path.isfile(filepath):
        # Skip sub-directories (e.g. checkpoint folders); they are not data files
        continue

    # splitext splits on the last dot only, so names with no dot or with
    # several dots no longer raise the ValueError that file.split('.') did
    name, extension = os.path.splitext(file)
    form = extension.lstrip('.')
    print(name, form)
    #name = load_data(name, filepath, form)

X_test_features npz
X_test csv
feature_names dat
y_test csv
X_train_features npz
y_train csv
X_train csv


In [22]:
# Show the entries currently present in the features directory
os.listdir(filepath_features)

['X_test_features.npz',
 'X_test.csv',
 'feature_names.dat',
 'y_test.csv',
 'X_train_features.npz',
 'y_train.csv',
 'X_train.csv']

In [7]:
def save_metrics(model_metrics, model_metrics_name="model_metrics", out_dir="../out"):
    """Write a metrics report to ``<out_dir>/<model_metrics_name>.txt``.

    The file name used to be derived with ``f"{model_metrics=}"``. That
    expression always yields the parameter name ``model_metrics``, never the
    name of the caller's variable, so every report went to the same file.
    The name is now an explicit argument whose default keeps the old file name.

    Parameters
    ----------
    model_metrics : str
        Report text, e.g. the output of ``metrics.classification_report``.
    model_metrics_name : str, optional
        File stem for the report. Defaults to ``"model_metrics"``.
    out_dir : str, optional
        Directory to write into; created when missing. Defaults to ``"../out"``.

    Returns
    -------
    str
        Path of the file that was written.
    """
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, f'{model_metrics_name}.txt')
    # The context manager closes the file even when the write raises
    with open(target, 'w') as metrics_file:
        metrics_file.write(model_metrics)
    return target


In [6]:
# The f-string "=" specifier renders the expression text followed by its value;
# everything before the first "=" is therefore the expression as written here.
x = 5
variable_name = f"{x=}".partition("=")[0]
print(variable_name)

x


In [8]:
# Train a logistic regression on the training features
classifier_LR = LogisticRegression(random_state=42)
classifier_LR.fit(X_train_features, y_train)

# Predict the held-out test set and build the text classification report
y_pred_LR = classifier_LR.predict(X_test_features)
classifier_LR_metrics = metrics.classification_report(
    y_test,
    y_pred_LR,
)

NameError: name 'X_train_features' is not defined

In [2]:
# Save the classification report as text and the fitted model with joblib.
# The context manager closes the report file even if the write raises.
with open(r'../out/classifier_LR_metrics.txt', 'w') as filepath_metrics_LR:
    filepath_metrics_LR.write(classifier_LR_metrics)

dump(classifier_LR, "../models/classifier_LR.joblib")

NameError: name 'classifier_LR_metrics' is not defined

In [37]:
# Train a multi-layer perceptron: one hidden layer of 20 units with logistic activation
classifier_MLP = MLPClassifier(
    activation="logistic",
    hidden_layer_sizes=(20,),
    max_iter=1000,
    random_state=42,
)
classifier_MLP.fit(X_train_features, y_train)

# Predict the held-out test set and build the text classification report
y_pred_MLP = classifier_MLP.predict(X_test_features)
classifier_MLP_metrics = metrics.classification_report(
    y_test,
    y_pred_MLP,
)

In [38]:
# Save the classification report as text and the fitted model with joblib.
# The context manager closes the report file even if the write raises.
with open(r'../out/classifier_MLP_metrics.txt', 'w') as filepath_metrics_MLP:
    filepath_metrics_MLP.write(classifier_MLP_metrics)

dump(classifier_MLP, "../models/classifier_MLP.joblib")

['../models/classifier_MLP.joblib']