In [10]:
import pandas as pd
import re
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sklearn.svm import LinearSVC
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [11]:
def GenerateXY(csv):
  # prepare dataframe
  df = pd.read_csv(csv)
  df.dropna(subset=['type'], inplace = True)
  df.drop(df[df['type'] == 'unknown'].index, inplace=True)
  df.reset_index(drop=True, inplace=True)
  
  # target values
  y = pd.DataFrame({'result' : np.where(df['type'] == 'fake', 1, 0)}) #[1, 0] = [fake, not fake]
  
  words = Counter()
  words.update("num".split())
  words.update("date".split())
  words.update("url".split())
  words.subtract(words)
  
  frames = []
  for i in range(len(df)):
    wordcopy = words.copy()
    for word in df.loc[i, "content"].split():
      if word == "<num>":
        wordcopy.update("num".split())
      if word == "<url>":
        wordcopy.update("url".split())
      if word == "date":
        wordcopy.update("date".split())
    frames += [pd.DataFrame(dict(wordcopy), index=[i])]
    
    
  # concat frames generated in loop
  X = pd.concat(frames)

  return X, y
  
X, y = GenerateXY("../cleaned_995.csv")

X.to_csv('X.csv', index=False)
y.to_csv('y.csv', index=False)

In [12]:
def Split(X_path, y_path):
    # usecols = ['content', 'type']
    # df = pd.read_csv(csv_path, usecols=usecols)
    
    df_X = pd.read_csv(X_path)
    df_y = pd.read_csv(y_path)
    X_train, X_test, y_train, y_test = train_test_split(df_X,
                                                        df_y,
                                                        train_size = 0.8,
                                                        random_state = 0,
                                                        shuffle = False,
                                                        stratify = None)

    X_test, X_val, y_test, y_val = train_test_split(X_test,
                                                    y_test,
                                                    test_size= 0.5,
                                                    random_state = 0,
                                                    shuffle = False,
                                                    stratify = None)

    return X_train, X_test, X_val, y_train, y_test, y_val

X_train, X_test, X_val, y_train, y_test, y_val = Split('X.csv', 'y.csv')

In [13]:
model = LinearRegression()
fit = model.fit(X_train, y_train['result'])
y_pred = fit.predict(X_test)
y_pred = [round(x,0) for x in y_pred]

mse = mean_squared_error(y_test['result'], y_pred)
print("LinearRegression MSE: ", mse)
acc = accuracy_score(y_test['result'], y_pred)
print("LinearRegression accuracy: ", acc)

LinearRegression MSE:  0.15536105032822758
LinearRegression accuracy:  0.8446389496717724


In [14]:
model = LogisticRegression(random_state=0, max_iter=1000)
fit = model.fit(X_train, y_train['result'])
y_pred = fit.predict(X_test)
y_pred = [round(x,0) for x in y_pred]

mse = mean_squared_error(y_test['result'], y_pred)
print("LinearRegression MSE: ", mse)
acc = accuracy_score(y_test['result'], y_pred)
print("LinearRegression accuracy: ", acc)

LinearRegression MSE:  0.16301969365426697
LinearRegression accuracy:  0.8369803063457331


In [15]:
model = LinearSVC(max_iter=100)
model.fit(X_train, y_train['result'])

pred = model.predict(X_test)
acc = accuracy_score(y_test['result'], pred)

print("LinearSVC accuracy: ", acc)


LinearSVC accuracy:  0.7133479212253829




In [16]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(4, 1), random_state=1)

fit = clf.fit(X_train, y_train['result'])

pred = fit.predict(X_test)

acc = accuracy_score(y_test['result'], pred)

print(acc)


0.8687089715536105


In [17]:
# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
    
mlp_clf = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=5000, activation='relu', solver='adam', random_state=42)

fit = mlp_clf.fit(X_train, y_train['result'])

pred = fit.predict(X_test)

acc = accuracy_score(y_test['result'], pred)

# mse = mean_squared_error(y_test['result'], pred)
print("MLPClassifier MSE: ", mse)
print("MLPClassifier accuracy: ", acc)


MLPClassifier MSE:  0.16301969365426697
MLPClassifier accuracy:  0.8698030634573304
