In [1]:
#sigmoid Function
def sigmoid (Z):
    return 1/(1+np.exp(-Z))

In [2]:
#probability
def probability(X,Theta):
    Z= X @ Theta #same as Z = X.dot(Theta) or np.dot(X,Theta)
    return sigmoid (Z)

In [3]:
#prediction
def predict(X,Theta):
    p = np.round(probability(X,Theta))
    return p

In [4]:
#Cross-entropy loss function for BGD
def computeCostBatch(X,Y,Theta):
    m=Y.size
    Hx = probability (X,Theta)
    return (-1/m)*(Y.T @ np.log(Hx) + (1-Y).T @ np.log(1-Hx))

In [5]:
#Cross-entropy loss function for SGD
def computeCostStochastic(X,Y,Theta):
    m=Y.size
    Hx = probability (X,Theta)
    return (-1)*(Y.T @ np.log(Hx) + (1-Y).T @ np.log(1-Hx))

In [6]:
#Batch Gradient Descent
def BGD (X,Y,alpha,n_epoch):
    print("\nRunning BGD.....\n")
    m=Y.size
    features=X[0].size
    J=list()
    Theta=np.zeros((features,1))
    for i in range(n_epoch):
        error=(probability(X,Theta))-Y
        errorIntoX = X.T @ error
        Theta=Theta-(alpha * (1/m) * errorIntoX)
        J.append(computeCostBatch(X,Y,Theta))
    return Theta, J

In [7]:
#Stochastic Gradient Descent
def SGD (X, Y, alpha, n_epoch):
    print("\nRunning SGD.....\n")
    m=Y.size
    features=X[0].size
    J=list()
    Theta=np.zeros((features,1))
    for i in range (n_epoch):
        cost = 0.0
        r_indices = random.sample(range(m),m)
        for i in r_indices:
            oneX = X[i,:].reshape(1,features)
            oneY = Y[i].reshape(1,1)
            error = (probability(oneX,Theta))-oneY
            errorIntoX = oneX.T @ error
            Theta=Theta-(alpha * errorIntoX)
            cost = cost + computeCostStochastic(oneX,oneY,Theta)
        J.append(cost/m)
    return Theta, J

In [8]:
#Report evaluation
def eval_GD (Theta,Test_X,Test_Y):
    Predictions = predict (Test_X, Theta)
    m = Test_Y.size
    Tn, Tp, Fn, Fp = 0, 0, 0, 0
    for i in range (m):
        if (Predictions[i]==Test_Y[i]):
            if (Predictions[i]==1):
                Tp=Tp+1
            else:
                Tn=Tn+1
        else:
            if (Predictions[i]==1):
                Fp=Fp+1
            else:
                Fn=Fn+1
    print ("No. of total Tp (classifier:pos, label:pos)= ",Tp)
    print ("No. of total Fp (classifier:pos, label:neg)= ",Fp)
    print ("No. of total Fn (classifier:neg, label:pos)= ",Fn)
    print ("No. of total Tn (classifier:neg, label:neg)= ",Tn)
    Accuracy = (Tn+Tp)/(Tn+Fp+Fn+Tp)
    Recall = Tp/(Fn+Tp)
    Precision = Tp/(Fp+Tp)
    f1 = (2*Precision*Recall)/(Precision+Recall)
    print("Accuray: ", Accuracy)
    print("Recall: ", Recall)
    print("Precision: ", Precision)
    print("F1 Score: ", f1)
    return

In [9]:
#takes directory and gives Train_X, Train_Y, Test_X, Test_Y
def extract_data (dir):
    Train_X = []
    Train_Y = []
    Test_X = []
    Test_Y = []
    for train_test in os.listdir(dir+"//"):
        if (train_test.endswith('.txt')):
            continue;
        print("Loading:",train_test,"data...")
        p_counts = [] #List of count(positive words) ∈ reviews
        n_counts = [] #List of count(negative words) ∈ reviews
        ratings = [] #List of Star Ratings (1-10 scale)
        log_counts = [] #List of log(word count of reviews)
        no_flags = [] #List of no_flags (1 if “no” ∈ review, 0 otherwise)
        ex_flags = [] #List of ex_flags (1 if “!” ∈ review, 0 otherwise)
        ys = [] #List of y labels (1 if positive, 0 otherwise)
        count=1
        for pos_neg in os.listdir(dir+"//"+train_test+"//"):
            for i in os.listdir(dir+"//"+train_test+"//"+pos_neg+"//"):
                #getting text
                f = open (dir+"//"+train_test+"//"+pos_neg+"//"+i,"r",encoding="utf8")
                text = f.read()
                textl=text.lower()
                f.close()
                #getting p_count
                p_count=0 #count(positive words) ∈ review
                f = open (dir+"//"+"positive-words.txt")
                for line in f:
                    if (line in textl):
                        p_count = p_count +1
                f.close()
                p_counts.append(p_count)
                #getting n_count
                n_count=0 #count(negative words) ∈ review
                f = open (dir+"//"+"negative-words.txt")
                for line in f:
                    if (line in textl):
                        n_count = n_count +1
                f.close()
                n_counts.append(n_count)
                #rating
                rating= "" #Star Rating (1-10 scale)
                dashIndex=i.index('_')
                k = dashIndex+1
                while (i[k]!='.'):
                    rating= rating + i[k]
                    k=k+1
                ratings.append(int(rating))
                #log_count
                log_count = math.log(text.count(' ')+1) #log(word count of review)
                log_counts.append(log_count)
                #no_flag
                no_flag=0 #(1 if “no” ∈ review, 0 otherwise)
                possible_nos = ["no ","no,","no."]
                for e in possible_nos:
                    if (e in textl):
                        no_flag=1
                        break;
                no_flags.append(no_flag)
                #ex_flag
                ex_flag=0 #(1 if “!” ∈ review, 0 otherwise)
                if ('!' in text):
                    ex_flag=1
                ex_flags.append(ex_flag)
                #y label
                y=0 #1 if positive, 0 otherwise
                if (pos_neg=="pos"):
                    y=1
                ys.append(y)
                
                count=count+1
                if(count%1000==0):
                    print(count,"Loaded!",end=' ')
        if (train_test=="train"):
            Train_X.append(p_counts)
            Train_X.append(n_counts)
            Train_X.append(ratings)
            Train_X.append(log_counts)
            Train_X.append(no_flags)
            Train_X.append(ex_flags)
            Train_Y.append(ys)
        else:
            Test_X.append(p_counts)
            Test_X.append(n_counts)
            Test_X.append(ratings)
            Test_X.append(log_counts)
            Test_X.append(no_flags)
            Test_X.append(ex_flags)
            Test_Y.append(ys)
        print("\n")
    return Train_X, Train_Y, Test_X, Test_Y

In [10]:
import os
import math
import random
import numpy as np

#Main Function part 1

#PREPROCESSING

#Takes few minutes to load
#calling a function which takes the directory and gives Train_X, Train_Y, Test_X, Test_Y
Train_X, Train_Y, Test_X, Test_Y = extract_data("Dataset")

#convert list to numpy array and reshaping
Tr_X, Tr_Y, Te_X, Te_Y = np.array(Train_X).T, np.array(Train_Y).T, np.array(Test_X).T, np.array(Test_Y).T

#feature scaling
Tr_X = (Tr_X - np.mean(Tr_X))/np.std(Tr_X)
Te_X = (Te_X - np.mean(Te_X))/np.std(Te_X)

#No. of instances in training data and test data
Tr_m = Tr_Y.size
Te_m = Te_Y.size

#adding columnn of ones
Tr_X=np.c_[np.ones((Tr_m)),Tr_X]
Te_X=np.c_[np.ones((Te_m)),Te_X]

print("Done!")

Loading: test data...
1000 Loaded! 2000 Loaded! 3000 Loaded! 4000 Loaded! 5000 Loaded! 6000 Loaded! 7000 Loaded! 8000 Loaded! 9000 Loaded! 10000 Loaded! 11000 Loaded! 12000 Loaded! 13000 Loaded! 14000 Loaded! 15000 Loaded! 16000 Loaded! 17000 Loaded! 18000 Loaded! 19000 Loaded! 20000 Loaded! 21000 Loaded! 22000 Loaded! 23000 Loaded! 24000 Loaded! 25000 Loaded! 

Loading: train data...
1000 Loaded! 2000 Loaded! 3000 Loaded! 4000 Loaded! 5000 Loaded! 6000 Loaded! 7000 Loaded! 8000 Loaded! 9000 Loaded! 10000 Loaded! 11000 Loaded! 12000 Loaded! 13000 Loaded! 14000 Loaded! 15000 Loaded! 16000 Loaded! 17000 Loaded! 18000 Loaded! 19000 Loaded! 20000 Loaded! 21000 Loaded! 22000 Loaded! 23000 Loaded! 24000 Loaded! 25000 Loaded! 

Done!


In [14]:
#Main Function part 2

#IMPLEMENTATION AND EVALUATION

#running Batch Gradient Descent
alpha , n_epoch = 0.01 , 1000
ThetaB,J = BGD (Tr_X,Tr_Y,alpha,n_epoch)
print("\nValues of Theta:", ThetaB.T)
print("\nFew values of J (BGD):",J[0:-1:100])

#evaluating BGD
print("\n*****EVALUATING BGD*****")
eval_GD (ThetaB,Te_X,Te_Y)

#running Stochastic Gradient Descent
alpha , n_epoch = 0.01 , 2
ThetaS,J = SGD (Tr_X,Tr_Y,alpha,n_epoch)
print("\nValues of Theta:", ThetaS.T)
print("\nFew values of J (SGD):",J)

#evaluating SGD
print("\n*****EVALUATING SGD*****")
eval_GD (ThetaS,Te_X,Te_Y)


Running BGD.....


Values of Theta: [[-0.52622483  0.35138005  0.340718    1.79367318 -0.59562546  0.23398085
   0.27737841]]

Few values of J (BGD): [array([[0.68989269]]), array([[0.49579861]]), array([[0.39468187]]), array([[0.32794904]]), array([[0.28101256]]), array([[0.24648094]]), array([[0.22011331]]), array([[0.19935077]]), array([[0.18258101]]), array([[0.16874713]])]

*****EVALUATING BGD*****
No. of total Tp (classifier:pos, label:pos)=  12500
No. of total Fp (classifier:pos, label:neg)=  0
No. of total Fn (classifier:neg, label:pos)=  0
No. of total Tn (classifier:neg, label:neg)=  12500
Accuray:  1.0
Recall:  1.0
Precision:  1.0
F1 Score:  1.0

Running SGD.....


Values of Theta: [[-2.31113167  1.54309974  1.49282695  6.51441069 -2.11580145  1.01494313
   1.13107429]]

Few values of J (SGD): [array([[0.04482141]]), array([[0.01115177]])]

*****EVALUATING SGD*****
No. of total Tp (classifier:pos, label:pos)=  12500
No. of total Fp (classifier:pos, label:neg)=  0
No. of tot