In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import graphviz
from sklearn import tree
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("../Data/ACME-HappinessSurvey2020.csv")

In [4]:
y = data["Y"]

In [5]:
columns  = list(data.columns)
columns.remove("Y")

In [6]:
# X = data[["X1", "X6"]]
X = data[columns]

In [7]:
# Human-readable survey statements used as labels when plotting the tree.
# NOTE(review): the order presumably follows the feature columns X1..X6 of the
# CSV (index 0 -> X1, index 5 -> X6) — confirm against the data dictionary.
feature_names = [
    "order delivered on time",
    "contents were as expected",
    "I ordered everything I wanted to order",
    "I paid a good price",
    "I am satisfied with the courier",
    "the app is easy to order"
]

In [8]:
indices = [0, 5]

selected_features = [feature_names[i] for i in indices]

In [9]:
# Hold out 20% of the rows for testing. The seed is fixed so the split — and
# every hold-out score computed from it further down — is reproducible after a
# kernel restart instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Re-bind each piece of the split to a plain NumPy array copy of itself.
X_train, y_train, X_test, y_test = (
    np.array(part) for part in (X_train, y_train, X_test, y_test)
)

## Decision Tree

In [252]:
dt_classifier = DecisionTreeClassifier(criterion="gini")
val_scores_dt = cross_val_score(dt_classifier, X, y, scoring="accuracy", cv=6)
np.mean(val_scores_dt)

0.6587301587301587

In [131]:
dt_classifier = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=16)
dt_classifier = dt_classifier.fit(X, y)
preds = dt_classifier.predict(X)
acc_score = accuracy_score(y, preds)
print(acc_score)

0.6507936507936508


### Gradient Boosting

In [130]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.1, min_samples_leaf=12,random_state=5)
val_scores_gbc = cross_val_score(gbc, X, y, scoring="accuracy", cv=6)
np.mean(val_scores_gbc)

0.6428571428571428

In [124]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.1, min_samples_leaf=12,random_state=5)
gbc = gbc.fit(X_train, y_train)
preds = gbc.predict(X_test)
acc_score = accuracy_score(y_test, preds)
print(acc_score)

0.4230769230769231


In [132]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.1, min_samples_leaf=12,random_state=5)
gbc = gbc.fit(X, y)
train_preds = gbc.predict(X)
acc_score = accuracy_score(y, train_preds)
print(acc_score)

0.9365079365079365


In [133]:
from joblib import dump, load

dump(gbc, "gbc.joblib")

['gbc.joblib']

### Random Forest

In [30]:
rf = RandomForestClassifier(n_estimators=100, random_state=3)
val_scores_rf = cross_val_score(rf, X, y, scoring="accuracy", cv=3)
np.mean(val_scores_rf)

0.5396825396825397

In [31]:
rf = RandomForestClassifier(n_estimators=100, random_state=3)
rf = rf.fit(X, y)
preds = rf.predict(X)
acc_score = accuracy_score(y, preds)
print(acc_score)

0.9444444444444444


In [32]:
dt_classifier = DecisionTreeClassifier(criterion="entropy")
dt_classifier = dt_classifier.fit(X_train, y_train)
y_preds = dt_classifier.predict(X_test)
acc_score = accuracy_score(y_test, y_preds)
print(acc_score)

0.46153846153846156


In [None]:
# Use a label list whose length matches the number of features the tree was
# fit on. Passing the two-entry `selected_features` to a tree trained on every
# column of X makes plot_tree index past the end of the label list.
n_tree_features = dt_classifier.tree_.n_features
plot_feature_names = (
    feature_names if n_tree_features == len(feature_names) else selected_features
)

fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(
    dt_classifier,
    feature_names=plot_feature_names,
    class_names=["unhappy", "happy"],
    filled=True
)

plt.savefig("dt.png", dpi=200)

## SVM

In [1219]:
svm_model = SVC(kernel="poly")
val_scores_svm = cross_val_score(svm_model, X, y, scoring="accuracy", cv=6)
np.mean(val_scores_svm)

0.5317460317460317

In [962]:
svm_model = SVC(kernel="poly")
svm_model = svm_model.fit(X_train, y_train)

In [963]:
y_preds = svm_model.predict(X_test)

In [964]:
acc_score = accuracy_score(y_test, y_preds)

In [965]:
acc_score

0.5789473684210527

## Logistic Regression

In [915]:
lr = LogisticRegression()
val_scores_lr = cross_val_score(lr, X, y, scoring="accuracy", cv=6)
np.mean(val_scores_lr)

0.6031746031746031

In [916]:
nb = GaussianNB()
val_scores_nb = cross_val_score(nb, X, y, scoring="accuracy",cv=6)
np.mean(val_scores_nb)

0.611111111111111

In [917]:
class LogReg:
    """Binary logistic regression without an intercept term, written in NumPy.

    The model is ``p(y = 1 | x) = sigmoid(w . x)``. A label equal to 1 is the
    positive class; the vectorised gradient (``full_grad``) additionally
    assumes the negative class is encoded as 0.

    Training options:
      * ``train_gd_eff`` - vectorised full-batch gradient descent.
      * ``train_gd``     - the same update computed sample by sample.
      * ``train_sgd``    - stochastic gradient descent (sampling with replacement).
      * ``train_irls``   - Newton / iteratively re-weighted least squares
                           (requires the instance to be built with ``irls=True``).
    """

    def __init__(self, y, X, eta, epochs, momentum=False, alpha=0.0, irls=False):
        """Store the training data and hyper-parameters and draw random weights.

        Args:
            y: Labels, one per row of ``X``.
            X: 2-D feature matrix of shape (n_samples, n_features).
            eta: Learning rate for the gradient-based updates.
            epochs: Number of passes each ``train_*`` method performs.
            momentum: If True, updates use a velocity term (see ``calc_velocity``).
            alpha: Decay factor applied to the previous velocity.
            irls: If True, ``update_weights`` applies Newton steps.
        """
        # np.asarray leaves ndarrays untouched but lets lists / pandas objects
        # be indexed positionally by the training loops.
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        self.w = np.random.normal(0.0, 1.0, (self.X.shape[1],))
        self.eta = eta
        self.epochs = epochs
        self.momentum = momentum
        # Iterative re-weighted least squares
        self.irls = irls
        # Always defined so calc_velocity() works even if `momentum` is
        # switched on after construction.
        self.velocity = 0.0
        # Parameter to update velocity
        self.alpha = alpha

    def sigmoid(self, x):
        """Return 1 / (1 + exp(-x)) for a scalar or array.

        Computed as exp(-log(1 + exp(-x))) via ``np.logaddexp`` so that large
        negative inputs do not overflow ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, np.negative(x)))

    def predict(self, x):
        """Return the raw linear score ``w . x`` for one sample (no sigmoid)."""
        preds = np.dot(self.w, x)
        return preds

    def pred_prob(self, x):
        """Return the predicted probability of class 1 for one sample."""
        return self.sigmoid(self.predict(x))

    def full_prob(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self.sigmoid(X @ self.w)

    def full_cost(self):
        """Return the mean log-loss over the stored training set."""
        scores = self.X @ self.w
        # log-loss is log(1 + exp(-z)) for y == 1 and log(1 + exp(z)) otherwise.
        signed = np.where(self.y == 1, -scores, scores)
        return np.mean(np.logaddexp(0.0, signed))

    def calc_cost(self, x, y):
        """Return the log-loss of a single sample.

        Uses ``np.logaddexp`` instead of ``-log(sigmoid(.))`` so the result
        stays finite when the predicted probability saturates at 0 or 1.
        """
        score = self.predict(x)
        if y == 1:
            return np.logaddexp(0.0, -score)
        return np.logaddexp(0.0, score)

    def full_grad(self, lambd=0.0):
        """Return the summed log-loss gradient over the training set.

        Args:
            lambd: L2 regularisation strength; adds ``lambd * w`` once.
        """
        probs = self.full_prob(self.X)
        return self.X.T @ (probs - self.y) + (lambd * self.w)

    def calc_hessian(self, lambd=0.0):
        """Return the Hessian ``X^T R X + lambd * I`` of the (regularised) loss.

        ``R`` is the diagonal matrix of ``p * (1 - p)``. It is applied through
        broadcasting, so no dense (n_samples x n_samples) matrix is built.
        The penalty is added on the diagonal only, matching the ``lambd * w``
        term of ``full_grad``.
        """
        probs = self.full_prob(self.X)
        weights = probs * (1 - probs)
        hessian = (self.X.T * weights) @ self.X
        return hessian + lambd * np.eye(hessian.shape[0])

    def single_grad(self, x, y, l2=False, lamb=0.0):
        """Return the log-loss gradient for one sample.

        Args:
            x: Feature vector of the sample.
            y: Its label (1 is positive, anything else negative).
            l2: If True, add the penalty term ``lamb * w``.
            lamb: L2 regularisation strength.
        """
        target = 1 if y == 1 else 0
        grad = x * (self.sigmoid(self.predict(x)) - target)
        if l2:
            grad = grad + lamb * self.w
        return grad

    def update_weights(self, grad=None, hessian=None):
        """Apply one update to ``self.w``.

        Args:
            grad: Gradient of the loss; ignored when momentum is enabled
                because the step is then taken along ``self.velocity``.
            hessian: The *inverse* Hessian, only used when ``irls`` is set.

        Raises:
            TypeError: if a required argument for the active mode is missing.
        """
        if self.momentum:
            self.w = self.w + self.velocity
            return

        if grad is None:
            raise TypeError("update_weights() needs `grad` unless momentum is enabled")

        if self.irls:
            if hessian is None:
                raise TypeError("update_weights() needs the inverse Hessian in IRLS mode")
            # Newton step: matrix-vector product. An element-wise `*` would
            # broadcast to a (d, d) matrix and corrupt the weight vector.
            self.w = self.w - (hessian @ grad)
        else:
            self.w = self.w - (self.eta * grad)

    def _step(self, grad):
        """Take one optimiser step for ``grad``, refreshing the velocity first when momentum is on."""
        if self.momentum:
            self.calc_velocity(grad)
            self.update_weights()
        else:
            self.update_weights(grad)

    def _sample_pass(self, l2=False, lamb=0.0):
        """Loop over every sample once; return (summed loss, summed gradient)."""
        total_loss = 0
        grad_sum = np.zeros(self.w.shape)
        for x_i, y_i in zip(self.X, self.y):
            total_loss += self.calc_cost(x_i, y_i)
            grad_sum = grad_sum + self.single_grad(x_i, y_i, l2, lamb)
        return total_loss, grad_sum

    def shuffle_together(self, x, y):
        """Return ``x`` and ``y`` reordered by one shared random permutation."""
        assert len(x) == len(y)
        order = np.random.permutation(len(x))
        return x[order], y[order]

    def gen_batches(self, x, y, batch_size):
        """Split ``x`` and ``y`` into consecutive mini-batches.

        Every sample is included; the last batch is shorter when ``len(x)`` is
        not a multiple of ``batch_size``.

        Returns:
            A 1-D object ndarray whose elements are ``(x_batch, y_batch)`` tuples.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        pairs = [
            (x[start:start + batch_size], y[start:start + batch_size])
            for start in range(0, len(x), batch_size)
        ]
        # Filled element by element so NumPy stores each tuple as-is instead of
        # trying to stack ragged (x, y) pairs into one rectangular array.
        batches = np.empty(len(pairs), dtype=object)
        for pos, pair in enumerate(pairs):
            batches[pos] = pair
        return batches

    def train_gd_eff(self, lambd=0.0):
        """Run vectorised full-batch gradient descent for ``self.epochs`` steps.

        Prints the mean log-loss before and after training.
        """
        cost = self.full_cost()
        print(f"The loss before training : {cost}")
        for _ in range(self.epochs):
            self._step(self.full_grad(lambd))

        cost = self.full_cost()
        print(f"The loss after training : {cost}")

    def train_irls(self, lambd=0.0):
        """Run Newton / IRLS updates for ``self.epochs`` steps.

        The step is routed through ``update_weights``, so the instance must
        have been created with ``irls=True`` (and ``momentum=False``) for the
        inverse Hessian to be used. ``lambd`` is applied to both the gradient
        and the Hessian so they describe the same objective.
        """
        cost = self.full_cost()
        print(f"The loss before training : {cost}")
        for _ in range(self.epochs):
            grad = self.full_grad(lambd)
            # Passing in the inverted Hessian since the learning rate is not used
            inv_hessian = np.linalg.inv(self.calc_hessian(lambd))
            self.update_weights(grad, inv_hessian)

        cost = self.full_cost()
        print(f"The loss after training : {cost}")

    def train_sgd(self, l2=False, l=0):
        """Run stochastic gradient descent, printing the mean loss per epoch.

        Each epoch draws ``n`` samples uniformly *with replacement*, so some
        rows may be visited several times and others not at all.
        """
        n = len(self.X)
        for e in range(self.epochs):
            epoch_loss = 0
            for _ in range(n):
                pick = np.random.randint(0, n)
                x_i = self.X[pick]
                y_i = self.y[pick]

                epoch_loss += self.calc_cost(x_i, y_i)
                self._step(self.single_grad(x_i, y_i, l2, l))
            epoch_loss /= n
            print(f"Loss at epoch {e} : {epoch_loss}")

    def calc_velocity(self, grad):
        """Update the momentum velocity: ``alpha * velocity - eta * grad``."""
        self.velocity = (self.alpha * self.velocity) - (self.eta * grad)

    def train_gd(self, l2=False, l=0):
        """Run full-batch gradient descent using the per-sample gradient loop.

        The per-sample gradients are summed, not averaged. With ``l2=True``
        the penalty ``l * w`` is therefore added once per sample.
        Prints the mean loss and the summed gradient for every epoch.
        """
        n = len(self.X)
        for e in range(self.epochs):
            total_loss, avg_grad = self._sample_pass(l2, l)

            print(f"Loss at epoch {e} : {total_loss / n}")
            print(f"The gradient is : {avg_grad}")

            self._step(avg_grad)

    def compare(self):
        """Print the weight trajectories of the looped and vectorised trainers.

        Both runs start from zero weights (sized from the data rather than a
        hard-coded 6) and zero velocity so the printed sequences are directly
        comparable. ``self.w`` is left at the result of the vectorised run.
        """
        n_features = self.X.shape[1]

        self.w = np.zeros(n_features)
        self.velocity = 0.0
        for e in range(self.epochs):
            _, avg_grad = self._sample_pass()
            self._step(avg_grad)
            print(f"Weight after update at epoch {e} : {self.w}")
        print("\n")

        # Comparing with the efficient version
        self.w = np.zeros(n_features)
        self.velocity = 0.0
        for e in range(self.epochs):
            self._step(self.full_grad())
            print(f"Weight after update at epoch {e} : {self.w}")


### Training on a data split

In [939]:
lr= LogisticRegression(fit_intercept=False)
lr = lr.fit(X_train, y_train)
preds = lr.predict(X_test)
acc_score = accuracy_score(y_test, preds)
print(acc_score)

0.5526315789473685


In [940]:
lr.coef_

array([[ 0.42574454, -0.37940255]])

In [935]:
lr.intercept_

array([-4.42430986])

In [921]:
logreg = LogReg(y=y_train, X=X_train, eta=0.001, epochs=10000, irls=False)

In [898]:
logreg.train_gd_eff()

The loss before training : 0.92152379989836
The loss after training : 0.67381903941546


In [941]:
# Take the weights directly from the fitted sklearn model (`lr` was fit without
# an intercept) rather than pasting rounded numbers from an earlier output, so
# the vector always has one entry per feature and matches the current fit.
logreg.w = lr.coef_.ravel().copy()

In [942]:
logreg.w

array([ 0.42574454, -0.37940255])

In [943]:
preds = logreg.full_prob(X_test)

In [944]:
preds = np.round(preds)

In [945]:
preds

array([0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.,
       1., 1., 1., 1.])

In [946]:
acc_score = accuracy_score(y_test, preds)

In [947]:
acc_score

0.5526315789473685

### Training via Cross-Validation

In [710]:
y = np.array(y)
X = np.array(X)

In [711]:
len(X)

126

In [783]:
# Manual k-fold cross-validation of the hand-written LogReg model.
#
# Fold data lives in fold-local names so this cell no longer overwrites the
# hold-out split (`X_train` / `y_train`) or the feature `indices` list that
# other cells rely on; previously any later cell using `X_train` silently
# trained on the last fold, which can overlap `X_test`.
num_samples = len(X)
shuffled_ix = np.random.permutation(num_samples)
k = 3
val_scores = []

# array_split spreads any remainder over the folds, so every sample is used
# for validation exactly once even when num_samples is not a multiple of k.
fold_ix = np.array_split(shuffled_ix, k)

for fold, val_set_ix in enumerate(fold_ix):
    train_set_ix = np.concatenate(
        [ix for other, ix in enumerate(fold_ix) if other != fold]
    )

    X_val = X[val_set_ix]
    y_val = y[val_set_ix]

    X_fold_train = X[train_set_ix]
    y_fold_train = y[train_set_ix]

    logreg = LogReg(y=y_fold_train, X=X_fold_train, eta=0.0001, epochs=1000)
    logreg.train_gd_eff()

    # Threshold the predicted probabilities at 0.5 to get class labels.
    preds = logreg.full_prob(X_val)
    preds = np.round(preds)
    acc_score = accuracy_score(y_val, preds)

    val_scores.append(acc_score)

    print(f"Done with fold {fold+1}")

    






The loss before training : 1.7530780370560124
The loss after training : 0.672288310185557
Done with fold 1
The loss before training : 4.351926330287029
The loss after training : 0.6907893470225347
Done with fold 2
The loss before training : 1.5740489414933119
The loss after training : 0.6800117669321131
Done with fold 3


In [784]:
np.mean(val_scores)

0.5238095238095238

## MLP

In [100]:
import tensorflow as tf
import tensorflow.keras as keras

In [110]:
def create_model(input_dim=2):
    """Build and compile a small MLP for binary classification.

    The network has two 256-unit ReLU hidden layers and a single sigmoid
    output, and is compiled with Adam (learning rate 2e-3), binary
    cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features. Defaults to 2, the width that was
            previously hard-coded; pass ``X_train.shape[1]`` when training on a
            different number of columns.

    Returns:
        The compiled ``keras.Model``.
    """
    inputs = keras.Input(shape=(input_dim,))
    x = keras.layers.Dense(units=256, activation="relu")(inputs)
    x = keras.layers.Dense(units=256, activation="relu")(x)
    output = keras.layers.Dense(units=1, activation="sigmoid")(x)

    model = keras.Model(inputs, output)
    model.compile(tf.keras.optimizers.Adam(learning_rate=2e-3), loss="binary_crossentropy", metrics=["accuracy"])

    return model


In [111]:
nn = create_model()

In [112]:
history = nn.fit(x=X_train, y=y_train, epochs=100, batch_size=4)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [113]:
preds = nn.predict(X_test)



In [114]:
preds = np.round(preds)

In [115]:
acc_score = accuracy_score(y_test, preds)
acc_score

0.5526315789473685