In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.loader import DataLoader
import datasets
import random

In [2]:
import nltk
import numpy as np
import re
import pickle
import csv
import string
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
lem = WordNetLemmatizer()


In [3]:
# Load datasets
loader = DataLoader()
train_data = loader.load_amazon(deceptive=False, all=True, test_mode=False)
test_data = loader.load_amazon(deceptive=False, all=True, test_mode=True)

In [4]:
def pre_process(txt):
    '''Basic text preprocessing, including 
    to lowercases, remove punctuations/odd characters/slash_n/meaningless words, tokenize, lemmatization'''
    # convert to lowercases
    txt = txt.lower()

    # replace punctuations with spaces
    for punc in string.punctuation:
        txt = txt.replace(punc, ' ')

    # remove odd characters (keep alphabets only)
    txt = re.sub(r'[^a-z ]', '', txt)

    # tokenize the txt
    txt = word_tokenize(txt)

    # # lemmatization
    # txt = [lem.lemmatize(word) for word in txt]

    # # stop word removel, too-short word removel
    # stop_words = stopwords.words('english')
    # txt = [w for w in txt if w not in stop_words and len(w) > 1]

    return txt


def get_inv_idx(raw_txt):
    '''calculate the inverted index and store with nested dictionaries:
    a dictionary stores all the terms
    each term contains a dictionary, stores all related doc and occurance
    '''
    print('Start calculating inverted index...')
    inverted_idx = {}

    # variables keep track of already processed passages
    count_psg = 0

    for i in range(len(raw_txt)):

        passage = pre_process(raw_txt[i])

        for w in passage:
            if w in inverted_idx:
                inverted_idx[w][i] = 1
            else:
                inverted_idx[w] = {}
                inverted_idx[w][i] = 1
        count_psg += 1
    
    print('Total processed passages:', count_psg)
    print('Finish calculating inverted index')
    return inverted_idx


def cal_tf_idf(txt, N, inv_idx):
    ''' calculate the tf-idf of a pre-processed text
     Args:
        txt:        pre-processed text
        N:          number of documents in collection
        inv_idx:    inverted index of terms    
    '''
    term_size = len(inv_idx)
    key_list = list(inv_idx)
    
    tf_idf = np.zeros(term_size)
    for t in np.unique(txt):
        # number of documents in which term t appears
        if t in inv_idx:
            n_t = len(inv_idx[t])
            idf = np.log10(N/n_t)
            tf = txt.count(t)

            term_idx = key_list.index(t)
            tf_idf[term_idx] = tf * idf

    return tf_idf

In [5]:
text = train_data['REVIEW_TEXT']
text_test = test_data['REVIEW_TEXT']

In [6]:
label = train_data['LABEL']
label_test = test_data['LABEL']

lab = []
for i in range(len(label)):
    if label[i] == 0:
        lab.append('fake')
    else:
        lab.append('true')

labt = []
for i in range(len(label)):
    if label[i] == 0:
        labt.append('fake')
    else:
        labt.append('true')



In [25]:
tx = []
for i in range(len(text)):
    tx.append((text[i]))

for i in range(len(text_test)):
    tx.append((text_test[i]))

In [8]:
# my own inverted idx
inverted_index = get_inv_idx(tx)
len(inverted_index)

Start calculating inverted index...
Total processed passages: 21000
Finish calculating inverted index


33075

In [None]:
pre_process(tx[0])

In [9]:
N = 21000
tf_idf = []
for i in range(N):
    tf_idf.append(cal_tf_idf(pre_process(tx[i]), N, inverted_index))

In [19]:
X = tf_idf[:len(text)]
y = tf_idf[len(text):]

In [26]:
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(tx)


vectorizer = TfidfVectorizer()

vec = vectorizer.fit_transform(tx)
# y = vectorizer.fit_transform(txtest)
X = vec[:len(text)]
y = vec[len(text):]

In [27]:
print(X.shape, y.shape)

(15750, 34870) (5250, 34870)


In [13]:
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [14]:
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

In [28]:
for clf, name in (
        (RidgeClassifier(), "Ridge Classifier"),
        (Perceptron(), "Perceptron \t"),
        (PassiveAggressiveClassifier(), "Passive-Aggressive"),
        (KNeighborsClassifier(), "kNN \t \t"),
        (LinearSVC(), "LinearSVC\t"),
        (SGDClassifier(), "SGDClassifier\t"),
        (MultinomialNB(), "MultinomialNB\t"),
        (BernoulliNB(), "BernoulliNB\t")):


    clf.fit(X, lab)

    pred = clf.predict(y)

    count = 0
    for i in range(len(pred)):
        if pred[i] == labt[i]:
            count += 1

    print(f'{name} \t {count/len(pred)}')

Ridge Classifier 	 0.5095238095238095
Perceptron 	 	 0.5022857142857143
Passive-Aggressive 	 0.5051428571428571
kNN 	 	 	 0.5038095238095238
LinearSVC	 	 0.5062857142857143
SGDClassifier	 	 0.5108571428571429
MultinomialNB	 	 0.5059047619047619
BernoulliNB	 	 0.496952380952381


In [20]:
clf = SGDClassifier()
print("start fit")

clf.fit(X, lab)

print("start pred")
pred = clf.predict(y)

count = 0
for i in range(len(pred)):
    if pred[i] == labt[i]:
        count += 1

print(f'sgd \t {count/len(pred)}')

start fit
start pred
sgd 	 0.5055238095238095
