In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import re

In [None]:
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegressionCV

# Load Data

In [None]:
# Read the True and Fake article CSVs from the local Dataset folder.
true_df, fake_df = map(pd.read_csv, ('./Dataset/True.csv', './Dataset/Fake.csv'))

In [None]:
# Tag every row of each source frame with its class label (the frames are
# modified in place), then stack them into one frame with a fresh 0..n-1 index.
for frame, tag in ((true_df, 'True'), (fake_df, 'Fake')):
    frame['label'] = tag

data_df = pd.concat((true_df, fake_df), ignore_index=True)

# Clean Data

In [None]:
# Clean the combined frame in a single chain. Each step returns a new frame, so
# the 'date' column is never assigned onto a filtered slice (which triggers
# pandas' SettingWithCopyWarning) and nothing is mutated with inplace=True.
#
# NOTE(review): on pandas >= 2.0, to_datetime infers one format from the first
# value and, with errors='coerce', turns strings in any other format into NaT.
# If the 'date' column mixes formats those rows are dropped below -- confirm
# the row count after this cell is what you expect.
data_df = (
    data_df
    # Remove duplicated rows (keeps the first occurrence)
    .drop_duplicates()
    # Remove rows with empty / short text (missing text also fails this test)
    .loc[lambda df: df['text'].str.len() > 10]
    # Convert date strings to datetimes; unparseable values become NaT
    .assign(date=lambda df: pd.to_datetime(df['date'], errors='coerce'))
    # Drop rows with a missing value in ANY column, which includes the NaT dates
    .dropna()
    .reset_index(drop=True)
)

# Preprocess Data

In [None]:
# Shared NLTK normalisers, created once so preprocess_txt can reuse them for every row.
porter, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [None]:
def preprocess_txt(txt, stem=None):
    """Normalise one text string before vectorisation.

    The text is lowercased, http/https URLs are removed, and every character
    that is not a lowercase letter, digit, underscore or whitespace is dropped.
    Optionally the result is tokenised and each token is stemmed or lemmatised.

    Parameters
    ----------
    txt : str
        Raw text to clean.
    stem : {'porter', 'lemma'} or None, default None
        'porter' applies the module-level ``porter`` stemmer to each token,
        'lemma' applies the module-level ``lemmatizer``. Any other value
        (including None) returns the cleaned text without token normalisation.

    Returns
    -------
    str
        The cleaned text; when stemming/lemmatising, tokens joined by single spaces.
    """
    txt = txt.lower()
    # Raw-string patterns: backslash-S inside a plain string literal is an
    # invalid escape sequence and warns on current Python versions.
    txt = re.sub(r'https?://\S+', '', txt)
    txt = re.sub(r'[^a-z0-9_\s]+', '', txt)

    # Pick the per-token normaliser; both modes share the tokenise/join logic.
    if stem == 'porter':
        normalise = porter.stem
    elif stem == 'lemma':
        normalise = lemmatizer.lemmatize
    else:
        return txt

    return ' '.join(normalise(word) for word in word_tokenize(txt))

In [None]:
# preprocess_txt only recognises stem='porter' or stem='lemma'; any other value
# silently skips token normalisation, so the Porter stemmer is requested by name.
# Stemming tokenises with NLTK's word_tokenize, which needs the tokenizer data
# referenced (commented out) in the NLTK import cell.
for field in ['title', 'text']:
    data_df[field] = data_df[field].apply(lambda txt: preprocess_txt(txt, stem='porter'))

# Cleaning can shorten a text below the threshold, so drop empty / short rows again.
data_df = data_df[data_df['text'].str.len() > 10]

In [None]:
# Class balance after cleaning: number of rows per label.
data_df['label'].value_counts()

# Prepare Data set

In [None]:
def vectorize(train_data, test_data, vectorizer='count', ngram_range=(1, 1), min_df=1):
    """Fit a text vectorizer on the training texts and transform both splits.

    Parameters
    ----------
    train_data : iterable of str
        Training documents; the vectorizer is fitted on these only.
    test_data : iterable of str
        Test documents; transformed with the already-fitted vectorizer.
    vectorizer : {'count', 'tfidf'}, default 'count'
        'count' uses CountVectorizer, 'tfidf' uses TfidfVectorizer.
    ngram_range : tuple of (int, int), default (1, 1)
        Passed through to the vectorizer.
    min_df : int or float, default 1
        Passed through to the vectorizer.

    Returns
    -------
    tuple
        ``(train_features, test_features)`` as returned by the vectorizer's
        ``fit_transform`` and ``transform``.

    Raises
    ------
    ValueError
        If ``vectorizer`` is not one of the supported names.
    """
    if vectorizer == 'count':
        vector_cls = CountVectorizer
    elif vectorizer == 'tfidf':
        vector_cls = TfidfVectorizer
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown vectorizer {vectorizer!r}; expected 'count' or 'tfidf'"
        )

    vector = vector_cls(stop_words='english', ngram_range=ngram_range, min_df=min_df)

    # Fit on the training split only, then reuse the fitted state for the test split.
    train_features = vector.fit_transform(train_data)
    test_features = vector.transform(test_data)
    return train_features, test_features

In [None]:
# Model input is subject + title + text joined with spaces; the target is the label.
# NOTE(review): 'subject' is fed to the model as a feature -- confirm it does not
# trivially reveal the label in this dataset.
X = data_df[['subject', 'title', 'text']].apply(' '.join, axis=1)
y = data_df['label']

# Fixed seed so the split, and therefore the reported score, is the same on
# every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Alternatives tried: CountVectorizer / TfidfVectorizer with min_df=4
# (available through vectorize()).
vector = HashingVectorizer(stop_words='english', ngram_range=(1, 1))

X_train = vector.fit_transform(X_train)
X_test = vector.transform(X_test)


# Logistic Regression

In [None]:
# Cross-validated logistic regression with default settings, fitted on the training features.
lr = LogisticRegressionCV()
lr.fit(X_train, y_train);

In [None]:
# Score of the fitted classifier on the held-out test split.
test_accuracy = lr.score(X_test, y_test)
test_accuracy