In [None]:
import pandas as pd
import numpy as np
import nltk
import string
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, svm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

In [None]:
# Fetch the NLTK resources used below: tokenizer models, the English stopword
# list and the WordNet data behind the lemmatizer.
# Newer NLTK releases load 'punkt_tab' (not the pickled 'punkt' models) inside
# word_tokenize, so both are requested; 'punkt' keeps older releases working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the fake-news articles and tag every row with class 0.
fake_data = pd.read_csv('data/Fake.csv').assign(label=0)
fake_data.info()

In [None]:
# Load the real-news articles and tag every row with class 1.
true_data = pd.read_csv('data/True.csv').assign(label=1)
true_data.info()

In [None]:
fake_data.isnull().sum()

In [None]:
true_data.isnull().sum()

In [None]:
fake_data['subject'].value_counts()

In [None]:
true_data['subject'].value_counts()

In [None]:
# Stack both classes into one frame. ignore_index=True gives the result a fresh
# 0..n-1 index; otherwise the row labels of the two source frames would repeat.
merged_data = pd.concat([fake_data, true_data], ignore_index=True)

In [None]:
# The 'date' column is discarded before any features are built.
merged_data = merged_data.drop(columns=['date'])

In [None]:
# Fold subject and title into the article body so one column carries all the
# text, then drop the two source columns.
combined_text = merged_data['subject'] + ' ' + merged_data['title'] + ' ' + merged_data['text']
merged_data = merged_data.assign(text=combined_text).drop(columns=['subject', 'title'])

In [None]:
# Shuffle the rows with a fixed seed so the 1000-row subset taken next, and
# every metric computed from it, is identical on each Restart & Run All.
# The seed matches the random_state used for the train/test split.
merged_data = merged_data.sample(frac=1, random_state=2024)

In [None]:
# Keep only the first 1000 shuffled rows.
merged_data = merged_data.iloc[:1000]

In [None]:
# Summarise the frame's rows, columns and dtypes after subsampling.
merged_data.info()

In [None]:
# Class balance within the subsample (0 = fake, 1 = real).
merged_data['label'].value_counts()

In [None]:
lemmatizer = nltk.stem.WordNetLemmatizer()
# Keep the stopwords in a set: preprocess_text tests every token for membership,
# which is O(1) against a set but a linear scan against the list NLTK returns.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Turn a raw document into a list of lemmatized, stopword-free tokens.

    Steps, in order: delete ASCII punctuation, lowercase, tokenize with NLTK,
    drop tokens found in the notebook-level ``stopwords`` collection, and
    lemmatize what remains with the notebook-level ``lemmatizer``.

    Args:
        text: Raw document text. Non-string values (for example the float NaN
            pandas stores for a missing cell) and blank strings produce an
            empty list instead of raising.

    Returns:
        list[str]: The processed tokens, in document order.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    # NOTE(review): punctuation is removed before stopword filtering, so
    # contractions lose their apostrophe ("don't" -> "dont") and may no longer
    # match the stopword list -- confirm this is intended.
    cleaned = text.translate(_PUNCTUATION_TABLE).lower()
    tokens = nltk.tokenize.word_tokenize(cleaned)
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords]
    

In [None]:
# Replace each raw document with its list of cleaned tokens.
merged_data['text'] = merged_data['text'].map(preprocess_text)

In [None]:
# Train word embeddings on the tokenized documents.
# TODO: test different values of vector_size, up to 300
w2v_text = Word2Vec(
    sentences=merged_data['text'],
    vector_size=100,
    workers=8,
)

In [None]:
def vectorize(words, model, first_n_tokens):
    """Build a fixed-length feature vector from a document's first known tokens.

    Tokens missing from the embedding vocabulary are skipped. The vectors of
    the first ``first_n_tokens`` remaining tokens are concatenated; documents
    with fewer known tokens are right-padded with zero vectors.

    Args:
        words: Iterable of tokens for one document.
        model: Trained embedding model exposing ``model.wv`` (membership test,
            lookup by token, and ``vector_size``).
        first_n_tokens: Number of token vectors to keep per document.

    Returns:
        numpy.ndarray: 1-D array of length
        ``first_n_tokens * model.wv.vector_size``.
    """
    keyed_vectors = model.wv
    words_vecs = [keyed_vectors[word] for word in words if word in keyed_vectors]
    words_vecs = words_vecs[:first_n_tokens]
    # Pad with zero vectors sized from the model itself, so changing
    # vector_size when training does not produce ragged rows here.
    padding = np.zeros(keyed_vectors.vector_size)
    words_vecs.extend(padding for _ in range(first_n_tokens - len(words_vecs)))
    return np.asarray(words_vecs).flatten()

In [None]:
# Number of leading in-vocabulary tokens whose embeddings are kept per document.
num_of_significant_tokens = 15 # TODO: test different values, check if 'text' is even worth it

After the vectorization step below, each row's `text` holds num_of_significant_tokens * vector_size elements (15 * 100 = 1500).

In [None]:
# Swap each token list for its flattened, fixed-length embedding vector.
merged_data['text'] = merged_data['text'].map(
    lambda tokens: vectorize(tokens, w2v_text, num_of_significant_tokens)
)

In [None]:
# Spot-check the first few 'text' entries after the vectorize step.
merged_data['text'].head()

In [None]:
# Preview the first rows of the whole frame.
merged_data.head()

In [None]:
# Stack the per-document vectors into one 2-D feature matrix (rows = documents).
transformed_array_X = np.asarray(list(merged_data['text']))

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(transformed_array_X, merged_data['label'], test_size=0.2, random_state=2024)

In [None]:
print(X_train.shape)

=============================================

In [None]:
model_bayes = naive_bayes.GaussianNB()

In [None]:
model_bayes.fit(X_train, Y_train)

In [None]:
y_pred = model_bayes.predict(X_test)

In [None]:
# Naive Bayes scores on the held-out split. Every metric is called as
# (y_true, y_pred); accuracy previously had the two swapped.
print('accuracy: ', accuracy_score(Y_test, y_pred))
print('precision:', precision_score(Y_test, y_pred))
print('recall:   ', recall_score(Y_test, y_pred))
print('f1:       ', f1_score(Y_test, y_pred))
print('confusion matrix (rows = true class, columns = predicted class):')
print(confusion_matrix(Y_test, y_pred))

=============================================

In [None]:
model_svm = svm.SVC(kernel='linear')

In [None]:
model_svm.fit(X_train, Y_train)

In [None]:
y_pred = model_svm.predict(X_test)

In [None]:
# SVM scores on the held-out split. Every metric is called as
# (y_true, y_pred); accuracy previously had the two swapped.
print('accuracy: ', accuracy_score(Y_test, y_pred))
print('precision:', precision_score(Y_test, y_pred))
print('recall:   ', recall_score(Y_test, y_pred))
print('f1:       ', f1_score(Y_test, y_pred))
print('confusion matrix (rows = true class, columns = predicted class):')
print(confusion_matrix(Y_test, y_pred))

=============================================

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Input

In [None]:
# Width of one flattened document vector, read from the training matrix so the
# input layer follows num_of_significant_tokens / vector_size instead of
# assuming 15 * 100 = 1500.
feature_dim = X_train.shape[1]

model = Sequential()
# Each sample is a sequence of length 1 whose single step is the whole flattened vector.
model.add(Input((1, feature_dim)))
model.add(LSTM(1500))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
# One sigmoid unit for the binary label (0 = fake, 1 = real).
model.add(Dense(1, activation='sigmoid'))

model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Insert a length-1 sequence axis: (n_samples, n_features) -> (n_samples, 1, n_features),
# the 3-D layout the LSTM input expects. Indexing replaces the per-row Python loop.
X_train2 = X_train[:, np.newaxis, :]

In [None]:
# Train for 10 epochs in batches of 100, with 20% of the training samples held
# out for validation.
model.fit(
    x=X_train2,
    y=Y_train,
    batch_size=100,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Insert a length-1 sequence axis: (n_samples, n_features) -> (n_samples, 1, n_features),
# matching the layout the network was trained on. Indexing replaces the per-row Python loop.
X_test2 = X_test[:, np.newaxis, :]

In [None]:
# Score the network on the held-out split; the result of evaluate() is kept in accr.
accr = model.evaluate(x=X_test2, y=Y_test)