In [2]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib as mpl
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [3]:
def XyData(csv_path, X_path, y_path):
    """Turn a news CSV into a bag-of-words feature file and a binary label file.

    Reads the 'content' and 'type' columns of ``csv_path``, keeps the rows that
    have a label and whose content is a string, and writes two CSV files
    (without an index column):

    * ``X_path`` -- one row per kept document and one column per distinct
      whitespace-separated token (columns in order of first appearance); each
      cell is how often the token occurs in that document.
    * ``y_path`` -- a single 'result' column, 1 where 'type' is 'fake', else 0.

    Raises:
        ValueError: if no row has both a label and string content.
    """
    raw = pd.read_csv(csv_path, usecols=['content', 'type'])

    # Drop rows without a label, then rows whose content is not text
    # (missing content is read back as a float NaN, not a str).
    labelled = raw.dropna(subset=['type'])
    is_text = np.array([isinstance(value, str) for value in labelled['content']],
                       dtype=bool)
    # Rows labelled 'unknown' are deliberately kept; they end up as "not fake".
    df = labelled.loc[is_text].reset_index(drop=True)

    if len(df) == 0:
        raise ValueError(
            "no rows with both a 'type' label and text 'content' in %r" % (csv_path,))

    y = pd.DataFrame({'result': np.where(df['type'].isin(['fake']), 1, 0)})  # [1, 0] = [fake, not fake]

    # Tokenise every document once and number the tokens in first-seen order.
    token_lists = [text.split() for text in df['content']]
    column_of = {}
    for tokens in token_lists:
        for token in tokens:
            column_of.setdefault(token, len(column_of))

    # Fill one dense count matrix instead of creating a vocabulary-wide
    # single-row DataFrame per document and concatenating them afterwards.
    counts = np.zeros((len(token_lists), len(column_of)), dtype=np.int64)
    for row, tokens in enumerate(token_lists):
        for token in tokens:
            counts[row, column_of[token]] += 1
    X = pd.DataFrame(counts, columns=list(column_of))

    X.to_csv(X_path, index=False)
    y.to_csv(y_path, index=False)

# Build the feature and label files that the later cells read back in
XyData(csv_path="../cleaned_news_sample.csv", X_path="X.csv", y_path="y.csv")

In [4]:
def Split(X_path, y_path):
    """Load the feature and label CSVs and cut them into three parts.

    Both cuts are made with shuffling switched off, so the row order of the
    files is preserved: the training part takes 80% of the rows and the
    remainder is divided in two (test_size=0.5) into the test and validation
    parts.

    Returns:
        (X_train, X_test, X_val, y_train, y_test, y_val)
    """
    features = pd.read_csv(X_path)
    labels = pd.read_csv(y_path)

    # Settings shared by both cuts: ordered and unstratified
    ordered = dict(random_state=0, shuffle=False, stratify=None)

    # First cut: training part versus everything that is held out
    X_train, X_rest, y_train, y_rest = train_test_split(
        features, labels, train_size=0.8, **ordered)

    # Second cut: held-out rows become the test and validation parts
    X_test, X_val, y_test, y_val = train_test_split(
        X_rest, y_rest, test_size=0.5, **ordered)

    return X_train, X_test, X_val, y_train, y_test, y_val

In [8]:
# Split data into Train, Test and Validation
X_train, X_test, X_val, y_train, y_test, y_val = Split('X.csv', 'y.csv')

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(4, 1), random_state=1)

fit = clf.fit(X_train, y_train['result'])

# Predictions for the test part, scored against the test labels
pred = fit.predict(X_test)

mse = mean_squared_error(y_test['result'], pred)
print("MLPClassifier MSE: ", mse)
acc = accuracy_score(y_test['result'], pred)
print("MLPClassifier accuracy: ", acc)

# The validation labels have to be scored against predictions made on X_val;
# reusing the X_test predictions compares unrelated rows (and fails outright
# when the two parts differ in length).
pred_val = fit.predict(X_val)
print(accuracy_score(y_val['result'], pred_val))
