In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from utils import fair_train_test_split
from sklearn.metrics import accuracy_score

In [2]:
class LogisticRegression():
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : float, default=1
        Step size applied to the gradient on every iteration.
    num_iter : int, default=10000
        Number of gradient-descent iterations performed by ``fit``.
    fit_intercept : bool, default=True
        When true, a column of ones is prepended to ``X`` so that
        ``theta[0]`` acts as the bias term.
    verbose : bool, default=False
        When true, the training loss is printed on every iteration whose
        index is a multiple of 10000 (iteration 0 included).

    Attributes
    ----------
    theta : numpy.ndarray
        Weight vector created by ``fit``; it does not exist before the first
        call to ``fit``.
    """

    def __init__(self, lr=1, num_iter=10000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def __add_intercept(self, X):
        """Return a copy of ``X`` with a leading column of ones."""
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        """Evaluate the logistic function ``1 / (1 + exp(-z))`` without overflow.

        ``exp`` is only ever applied to a non-positive argument, so large
        ``|z|`` can underflow to 0 but can never overflow to ``inf``.
        """
        e = np.exp(-np.abs(z))
        # For z >= 0 this is the textbook form; for z < 0 the algebraically
        # equal form exp(z) / (1 + exp(z)) is used instead.
        return np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    def __loss(self, h, y):
        """Mean binary cross-entropy between probabilities ``h`` and labels ``y``.

        ``h`` is clipped away from exactly 0 and 1 so that ``log`` never
        receives 0 (which would make the result ``nan``/``inf``).
        """
        eps = np.finfo(float).eps
        h = np.clip(h, eps, 1 - eps)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        """Learn ``theta`` from a dense feature matrix and 0/1 labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Dense feature matrix.
        y : array-like of length n_samples
            Numeric binary labels (the loss and gradient treat them as 0/1).

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, the training set is empty, or
            ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        # Flatten so a column-vector y cannot broadcast (h - y) to (n, n).
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(
                f'X must be a dense 2-D array, got an array with {X.ndim} dimension(s)'
            )
        if y.size == 0:
            raise ValueError('cannot fit on an empty training set')
        if X.shape[0] != y.size:
            raise ValueError(
                f'X has {X.shape[0]} rows but y has {y.size} labels'
            )

        if self.fit_intercept:
            X = self.__add_intercept(X)

        self.theta = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            # Gradient of the mean cross-entropy with respect to theta.
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient

            if self.verbose and i % 10000 == 0:
                # Report the loss *after* this iteration's update.
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'loss: {self.__loss(h, y)} \t')

    def predict_prob(self, X):
        """Return the sigmoid of ``X @ theta`` for each row of dense ``X``.

        Requires a prior call to ``fit`` (``theta`` must exist).
        """
        X = np.asarray(X)
        if self.fit_intercept:
            X = self.__add_intercept(X)

        return self.__sigmoid(np.dot(X, self.theta))

    def predict(self, X, threshold=0.5):
        """Return a boolean array: ``True`` where ``predict_prob(X) >= threshold``."""
        return self.predict_prob(X) >= threshold

In [3]:
# Read the prepared corpus and pull out the text column and its label column.
df = pd.read_csv('clean_dataset_with_stemming.csv')
X, y = df['Teks'], df['label']

# Hold out part of the data for evaluation using the project helper from utils.
# NOTE(review): test_size=0.1 presumably means a 10% test split — confirm
# against fair_train_test_split's definition.
X_train, X_test, y_train, y_test = fair_train_test_split(
    X,
    y,
    test_size=0.1,
)

In [4]:
# Experiment 1: raw token counts (no TF-IDF weighting) as features.
# The vectorizer is fitted on the training texts only; the test texts are
# only transformed with that fitted vocabulary.
count_vect = CountVectorizer()
# The LogisticRegression class in this notebook works with np.concatenate /
# np.dot on dense arrays, hence the .toarray() conversion.
df_count_vect = count_vect.fit_transform(X_train).toarray()

# perf_counter is a monotonic clock intended for measuring intervals. The
# timed span covers training, test-set vectorisation and prediction (but not
# the training-set vectorisation above).
t1 = time.perf_counter()
clf = LogisticRegression()
clf.fit(df_count_vect, y_train)
y_pred = clf.predict(count_vect.transform(X_test).toarray())
# accuracy_score's documented argument order is (y_true, y_pred).
print('Accuracy of LogisticRegression without TF-IDF : ', accuracy_score(y_test, y_pred))
print('Time elapsed :', time.perf_counter() - t1)
print()

Accuracy of LogisticRegression without TF-IDF :  0.940771349862259
Time elapsed : 1035.2719304561615



In [5]:
# Experiment 2: TF-IDF weighted features.
# The vectorizer is fitted on the training texts only; the test texts are
# only transformed with that fitted vocabulary.
tfidf = TfidfVectorizer()
# The LogisticRegression class in this notebook works with np.concatenate /
# np.dot on dense arrays, hence the .toarray() conversion.
df_tfidf = tfidf.fit_transform(X_train).toarray()

# perf_counter is a monotonic clock intended for measuring intervals. The
# timed span covers training, test-set vectorisation and prediction (but not
# the training-set vectorisation above).
t1 = time.perf_counter()
clf = LogisticRegression()
clf.fit(df_tfidf, y_train)
y_pred = clf.predict(tfidf.transform(X_test).toarray())
# accuracy_score's documented argument order is (y_true, y_pred).
print('Accuracy of LogisticRegression with TF-IDF : ', accuracy_score(y_test, y_pred))
print('Time elapsed :', time.perf_counter() - t1)
print()

Accuracy of LogisticRegression with TF-IDF :  0.9435261707988981
Time elapsed : 949.4379150867462

