pip install scikit-learn matplotlib pandas numpy seaborn 

In [115]:
from joblib import dump, load

import os
import sys

# data munging tools
import pandas as pd
import numpy as np

# Machine learning stuff
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

import scipy as sp


# Visualisation
import matplotlib.pyplot as plt

In [9]:
# Data loading and handling: read the news CSV and split it into train/test.
filepath = os.path.join("..", "in", "fake_or_real_news.csv")
news = pd.read_csv(filepath, index_col=0)

# The "text" column is the model input; the "label" column is the target.
X, y = news["text"], news["label"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# TF-IDF vectoriser over lowercased unigrams and bigrams.
# Terms found in more than 95% or fewer than 5% of the documents are ignored,
# and the vocabulary is capped at 500 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    max_df=0.95,
    min_df=0.05,
    max_features=500,
)

In [29]:
# Learn the vocabulary and IDF weights from the training texts only, and
# turn those same texts into TF-IDF features in one step.
X_train_features = vectorizer.fit_transform(X_train)

# Transform the test texts with the already-fitted vectoriser (no refitting),
# so both feature matrices share the same columns.
X_test_features = vectorizer.transform(X_test)

# Names of the terms behind each feature column.
feature_names = vectorizer.get_feature_names_out()

In [39]:
# Persist the fitted vectoriser to disk with joblib.
dump(value=vectorizer, filename="../models/tfidf_vectorizer.joblib")

['../models/tfidf_vectorizer.joblib']

In [48]:
# Directory that the vectorised data files are written to.
vectorized_data_fp = "../in"

In [108]:
# Objects to persist, each paired with the file-name stem it is saved under.
# The pairs are transposed into two parallel lists: all names, then all objects.
vectorized_data_names, vectorized_data = (
    list(column)
    for column in zip(
        ("X_train", X_train),
        ("X_test", X_test),
        ("y_train", y_train),
        ("y_test", y_test),
        ("X_train_features", X_train_features),
        ("X_test_features", X_test_features),
        ("feature_names", feature_names),
    )
)

In [120]:
def save_data(data, name, filepath):
    """Write *data* into the directory *filepath* in a file named after *name*.

    The on-disk format depends on the kind of object:

    * CSR sparse matrix/array -> ``<name>.npz`` (compressed, via ``save_npz``)
    * pandas ``Series``       -> ``<name>.csv``
    * NumPy ``ndarray``       -> ``<name>.csv`` (wrapped in a ``DataFrame`` first)

    Any other kind of object is not saved; a message is printed instead.

    Args:
        data: Object to persist.
        name: File-name stem; the extension is appended automatically.
        filepath: Directory in which the file is created.

    Returns:
        None.
    """
    # Local import: guarantees the submodule is loaded and lets us use the
    # public API instead of the private ``scipy.sparse._csr`` module path.
    from scipy import sparse

    # ``isinstance``/``issparse`` (rather than ``type(x) == T``) so that
    # subclasses of the supported types are saved as well.
    if sparse.issparse(data) and data.format == "csr":
        sparse.save_npz(
            os.path.join(filepath, f'{name}.npz'),
            data,
            compressed=True)
    elif isinstance(data, pd.Series):
        data.to_csv(os.path.join(filepath, f'{name}.csv'))
    elif isinstance(data, np.ndarray):
        # Arrays are written through a DataFrame so they also end up as CSV.
        pd.DataFrame(data).to_csv(os.path.join(filepath, f'{name}.csv'))
    else:
        print(f'{name}: not csr_matrix or Series')

In [122]:
# Write every collected object to the data directory under its own name.
for name, data in zip(vectorized_data_names, vectorized_data):
    save_data(data, name, vectorized_data_fp)

In [46]:
# Read the saved training feature matrix back from its .npz file.
npz_train_feats = sp.sparse.load_npz(file='../in/X_train_features.npz')


In [31]:
# Logistic regression: fit on the training features, predict the test set,
# and build a text classification report from the predictions.
classifier_LR = LogisticRegression(random_state=42)
classifier_LR.fit(X_train_features, y_train)

y_pred_LR = classifier_LR.predict(X_test_features)
classifier_LR_metrics = metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred_LR,
)

In [36]:
# Save the text metrics report and the fitted logistic regression model.
# The context manager closes the file even if the write raises.
with open(r'../out/classifier_LR_metrics.txt', 'w') as filepath_metrics_LR:
    filepath_metrics_LR.write(classifier_LR_metrics)

dump(classifier_LR, "../models/classifier_LR.joblib")

['../models/classifier_LR.joblib']

In [37]:
# Multi-layer perceptron with one hidden layer of 20 units and logistic
# activation: fit on the training features, predict the test set, and build
# a text classification report from the predictions.
classifier_MLP = MLPClassifier(
    hidden_layer_sizes=(20,),
    activation="logistic",
    max_iter=1000,
    random_state=42,
)
classifier_MLP.fit(X_train_features, y_train)

y_pred_MLP = classifier_MLP.predict(X_test_features)
classifier_MLP_metrics = metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred_MLP,
)

In [38]:
# Save the text metrics report and the fitted MLP model.
# The context manager closes the file even if the write raises.
with open(r'../out/classifier_MLP_metrics.txt', 'w') as filepath_metrics_MLP:
    filepath_metrics_MLP.write(classifier_MLP_metrics)

dump(classifier_MLP, "../models/classifier_MLP.joblib")

['../models/classifier_MLP.joblib']