In [None]:
# Time = 11:08 pm 
# Build a Logistic Regression

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
class LinearReg:
    """Linear regression fitted with batch gradient descent on the mean squared error.

    Parameters
    ----------
    lr : float
        Step size of each gradient-descent update.
    n_iter : int
        Maximum number of gradient-descent iterations.
    tol : float
        Training stops early once the absolute change in loss between two
        consecutive iterations is below this value.
    verbose : bool
        When True, shapes, per-iteration losses and final parameters are printed.

    Attributes
    ----------
    weights : ndarray of shape (1, n_features), or None before training
    bias : ndarray of shape (1, 1), or None before training
    """

    def __init__(self, lr = 0.01,n_iter = 1000, tol = 1e-5, verbose = False):
        self.lr = lr
        self.n_iter = n_iter
        self.tol = tol
        self.verbose = verbose
        self.weights = None
        self.bias = None

    def initialize(self, numvar):
        """Reset the parameters to zeros: weights (1, numvar) and bias (1, 1)."""
        self.weights = np.zeros(numvar).reshape(1,-1)
        self.bias = np.zeros(1).reshape(1,-1)
        if self.verbose: print(self.weights.shape, self.bias.shape)

    def gradient(self, X,Y, Y_pred, flag):
        """Return the MSE gradient w.r.t. the weights (flag == 'weight') or the bias (any other flag).

        X is (n, d); Y and Y_pred are (n, 1). The weight gradient has shape (1, d),
        the bias gradient is a scalar.
        """
        residual = Y_pred - Y
        if flag == 'weight':
            return (2.0/X.shape[0])* np.matmul(residual.T, X)
        else:
            return (2.0/X.shape[0])*np.sum(residual)

    def compute_loss(self,Y, Y_pred):
        """Mean squared error between targets and predictions."""
        return np.mean((Y-Y_pred)**2)

    def train(self,X,Y):
        """Fit weights and bias on X (n, d) and Y (n, 1) by gradient descent.

        Parameters are re-initialised to zero on every call.
        """
        self.initialize(X.shape[1])

        prev_loss = 0
        for iter_num in range(self.n_iter):
            Y_pred = self.predict(X)
            if self.verbose: print ("Y_pred.shape: ", Y_pred.shape)
            loss = self.compute_loss(Y,Y_pred)
            # Both gradients are evaluated with the same Y_pred, i.e. before either update.
            self.weights -= self.lr*self.gradient(X,Y, Y_pred, "weight")
            self.bias -= self.lr*self.gradient(X,Y, Y_pred, "bias")
            if abs(prev_loss - loss) < self.tol:
                break
            if self.verbose: print (iter_num, loss)
            prev_loss = loss
        if self.verbose: print(self.weights, self.bias)

    def predict(self,X):
        """Return X @ weights.T + bias, shape (n, 1)."""
        return np.matmul(X,self.weights.T) + self.bias

    def rmse(self, Y, Y_pred):
        """Root mean squared error between Y and Y_pred.

        Y_pred may be an array broadcastable against Y or a scalar (e.g. the
        mean of Y as a baseline). The sample count comes from the arguments
        themselves, not from any variable outside the method.
        """
        squared_error = np.power(np.asarray(Y) - Y_pred, 2)
        return float(np.sqrt(np.mean(squared_error)))

    def rsquare(self,Y,Y_pred):
        """Coefficient of determination, 1 - SSE/SST.

        SST is the squared error of always predicting mean(Y). When Y is
        constant (SST == 0) the result is 1.0 for a perfect fit and 0.0 otherwise.
        """
        Y = np.asarray(Y, dtype = float)
        sse = np.sum((Y - Y_pred)**2)
        sst = np.sum((Y - np.mean(Y))**2)
        if sst == 0:
            return 1.0 if sse == 0 else 0.0
        return float(1 - sse/sst)
            

In [None]:
# Synthetic 1-D regression data: Y = 3*X - 5 plus standard-normal noise scaled by 3.
#X = np.random.rand(1000,2)
X = np.arange(0,10, 0.005).reshape(-1,1)
Y = X*3 -5 + np.random.randn(X.shape[0], X.shape[1])*3

# Standardise the training feature; the statistics are kept so the test set
# can be scaled with the same (training) mean and std below.
train_mean = np.mean(X, axis = 0)
train_std = np.std(X, axis = 0)
# NOTE: X is overwritten in place, so re-running only this part of the cell
# would standardise already-standardised data.
X = (X - train_mean)/train_std
#X = np.random.rand(1000,2)
#Y = np.random.rand(1000,1)*10

# Test set: same generating rule on a finer grid (step 0.003) with fresh noise.
X_test = np.arange(0,10, 0.003).reshape(-1,1)
Y_test = X_test*3 -5 + np.random.randn(X_test.shape[0], X_test.shape[1])*3

# Scale the test feature with the training statistics.
X_test = (X_test - train_mean)/train_std

print (X.shape, Y.shape)
#print (Y)

In [None]:
# Fit the hand-written linear regression; verbose=True prints shapes and the loss on every iteration.
linreg = LinearReg(verbose = True)
linreg.train(X,Y)
Y_pred = linreg.predict(X)
Y_pred_test = linreg.predict(X_test)
#print (Y_pred)
# Each line prints: model RMSE, RMSE of the "always predict mean(Y)" baseline, and R-squared.
print ("Training data metrics: ", linreg.rmse(Y,Y_pred),linreg.rmse(Y,np.mean(Y)), linreg.rsquare(Y,Y_pred))
print ("Test data metrics: ", linreg.rmse(Y_test,Y_pred_test),linreg.rmse(Y_test,np.mean(Y_test)), linreg.rsquare(Y_test,Y_pred_test))


In [None]:
# Plot only the first 100 training rows: observed targets (default colour)
# against the model's predictions (red).
plt.scatter(X[:100],Y[:100])
plt.scatter(X[:100],Y_pred[:100], color = 'red')
#plt.scatter(X_test[:100],Y_pred_test[:100], color = 'green')
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Scatter Plot")
plt.show()

In [None]:
class Logreg:
    """Binary logistic regression fitted with batch gradient descent on the log-loss.

    Parameters
    ----------
    lr : float
        Step size of each gradient-descent update.
    n_iter : int
        Maximum number of gradient-descent iterations.
    tol : float
        Training stops early once the absolute change in loss between two
        consecutive iterations is below this value.
    verbose : bool
        When True, shapes, per-iteration losses and final parameters are printed.

    Attributes
    ----------
    weights : ndarray of shape (1, n_features), or None before training
    bias : ndarray of shape (1, 1), or None before training
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logs so a
    # saturated prediction cannot produce log(0).
    _EPS = 1e-15
    # exp() overflows float64 a little above 709; logits are clipped well inside that.
    _MAX_LOGIT = 500.0

    def __init__(self, lr = 0.001,n_iter = 1000, tol = 1e-5, verbose = False):
        self.lr = lr
        self.n_iter = n_iter
        self.tol = tol
        self.verbose = verbose
        self.weights = None
        self.bias = None

    def initialize(self, numvar):
        """Reset the parameters to zeros: weights (1, numvar) and bias (1, 1)."""
        self.weights = np.zeros(numvar).reshape(1,-1)
        self.bias = np.zeros(1).reshape(1,-1)
        if self.verbose: print("Parameter shapes: ", self.weights.shape, self.bias.shape)

    def gradient(self, X,Y, Y_pred, flag):
        """Return the update direction for the weights (flag == 'weight') or the bias (any other flag).

        X is (n, d); Y and Y_pred are (n, 1).
        NOTE: the 2.0 factor is the one used for the MSE gradient. The exact
        log-loss gradient is (1/n) * (Y_pred - Y).T @ X, so the effective step
        size during training is 2 * lr. It is kept so existing lr settings
        behave as before.
        """
        if self.verbose: print("In gradient: ", X.shape, Y.shape, Y_pred.shape)
        residual = Y_pred - Y
        if flag == 'weight':
            return (2.0/X.shape[0])* np.matmul(residual.T, X)
        else:
            return (2.0/X.shape[0])*np.sum(residual)

    def compute_loss(self,Y, Y_pred):
        """Mean binary cross-entropy between labels Y (0/1) and probabilities Y_pred.

        Probabilities are clipped away from 0 and 1 so the result is always finite.
        """
        p = np.clip(Y_pred, self._EPS, 1.0 - self._EPS)
        l = -Y*np.log(p) - (1-Y)*np.log(1-p)
        return np.mean(l)

    def train(self,X,Y):
        """Fit weights and bias on X (n, d) and Y (n, 1) by gradient descent.

        Parameters are re-initialised to zero on every call.
        """
        self.initialize(X.shape[1])

        prev_loss = 0
        for iter_num in range(self.n_iter):
            Y_pred = self.predict(X)
            if self.verbose: print ("In Train: Y_pred.shape: ", Y_pred.shape)
            loss = self.compute_loss(Y,Y_pred)
            # Both gradients are evaluated with the same Y_pred, i.e. before either update.
            self.weights -= self.lr*self.gradient(X,Y, Y_pred, "weight")
            self.bias -= self.lr*self.gradient(X,Y, Y_pred, "bias")
            if abs(prev_loss - loss) < self.tol:
                break
            if self.verbose: print (iter_num, loss)
            prev_loss = loss
        if self.verbose: print(self.weights, self.bias)

    def sigmoid(self,X):
        """Element-wise logistic function; inputs are clipped so exp() cannot overflow."""
        z = np.clip(X, -self._MAX_LOGIT, self._MAX_LOGIT)
        return 1.0/(1.0 + np.exp(-z))

    def predict(self,X):
        """Return P(y = 1 | x) for every row of X, shape (n, 1)."""
        Z = np.matmul(X,self.weights.T) + self.bias
        if self.verbose: print ("In Predict: ", Z.shape)
        return self.sigmoid(Z)

    def accuracy(self, Y, Y_prob, threshold  = 0.5):
        """Fraction of rows whose thresholded probability equals the label.

        Y and Y_prob may each be shaped (n,) or (n, 1); both are flattened
        first so mixed shapes never broadcast into an (n, n) comparison.
        Returns a plain float.

        Raises
        ------
        ValueError
            If Y and Y_prob do not contain the same number of elements.
        """
        labels = np.asarray(Y).reshape(-1)
        probs = np.asarray(Y_prob).reshape(-1)
        if labels.shape[0] != probs.shape[0]:
            raise ValueError("Y and Y_prob must contain the same number of elements")
        Y_pred = np.where(probs > threshold, 1, 0)
        return float(np.mean(labels == Y_pred))
    

In [None]:
# Earlier hand-made dataset, kept for reference:
#X = np.random.rand(1000,3) + 1
#Y = np.mean(X, axis = 1) > 1.5 
#Y = np.where(Y==True, 1,0).reshape(-1,1)


# Synthetic binary-classification data with 5 features; labels reshaped to a
# column vector (n, 1) because Logreg works with column targets.
n = 10000
X_data,Y_data = make_classification(n_samples=n, n_features=5)
Y_data = Y_data.reshape(-1,1)

# First 80% of the rows for training, remaining 20% for testing (no shuffling here).
X,Y = X_data[:int(n*0.8)], Y_data[:int(n*0.8)]
X_test,Y_test = X_data[int(n*0.8):], Y_data[int(n*0.8):]



# Standardise with training statistics only; the same mean/std are reused for the test set.
train_mean = np.mean(X, axis = 0)
train_std = np.std(X, axis = 0)
X = (X - train_mean)/train_std

#X_test = np.random.rand(1000,3) + 1
#Y_test = np.mean(X_test, axis = 1) > 1.5
#Y_test = np.where(Y_test==True, 1,0).reshape(-1,1)

X_test = (X_test - train_mean)/train_std

# Train the hand-written logistic regression and collect predicted probabilities.
logreg = Logreg(verbose = False)
logreg.train(X,Y)
Y_prob = logreg.predict(X)
#print ("Y_pred: ", Y_pred)
Y_prob_test = logreg.predict(X_test)

#print (Y_pred)
# The "metrics" printed here are accuracies at the default 0.5 threshold.
print ("Training data metrics: ", logreg.accuracy(Y,Y_prob))
print ("Test data metrics: ", logreg.accuracy(Y_test,Y_prob_test))

In [None]:
# Comparison with Logistic Regression from SKLEARN
# sklearn is fitted on the same standardised X; Y is flattened to (n,) for fit().
clf = LogisticRegression(random_state=0).fit(X,Y.reshape(-1))
# Column 1 of predict_proba is taken as the probability of class 1.
# NOTE: Y_prob / Y_prob_test from the previous cell are overwritten here.
Y_prob = clf.predict_proba(X)[:,1]
Y_prob_test = clf.predict_proba(X_test)[:,1]
print (Y.shape, Y_prob.shape)
# The hand-written Logreg.accuracy helper is reused to score sklearn's probabilities.
print ("Training data metrics: ", logreg.accuracy(Y,Y_prob))
print ("Test data metrics: ", logreg.accuracy(Y_test,Y_prob_test))

In [None]:
## Compute Percentile ##
# Given a list of numbers, return the value of say 10% percentile

def percentile(values, fraction):
    """Return the percentile of `values` at `fraction`, using linear interpolation.

    Parameters
    ----------
    values : sequence of numbers
        Need not be sorted; a sorted copy is made and the input is left untouched.
    fraction : float
        Position in [0, 1]; 0 gives the minimum, 1 the maximum, 0.25 the 25th percentile.

    Returns
    -------
    The interpolated value between the two neighbouring order statistics.

    Raises
    ------
    ValueError
        If `values` is empty or `fraction` is outside [0, 1].
    """
    if not 0.0 <= fraction <= 1.0:
        raise ValueError("fraction must be between 0 and 1")
    ordered = sorted(values)
    if not ordered:
        raise ValueError("values must not be empty")

    last = len(ordered) - 1
    # Fractional position on the 0..last index scale.
    position = last * fraction
    lower = int(position)
    remainder = position - lower

    # At the last index there is no right-hand neighbour to interpolate towards.
    if lower == last:
        return ordered[lower]
    return ordered[lower] + (ordered[lower + 1] - ordered[lower]) * remainder


a = [10,20,25,30,45,50,89]
x =0.25

res = percentile(a, x)
print (res)


In [None]:
## Coding a Decision Tree ## # Start time= 5:30 PM

class Node:
    """One node of the decision tree.

    An internal node carries the split (`feat_id`, `threshold`) and its two
    children (`left`, `right`); a leaf has `is_leaf` set and carries the
    predicted label in `y_pred`. `root` is always initialised to None.
    """

    def __init__(self, feat_id = None, threshold = None, is_leaf = False, left = None, right = None, y_pred = None):
        # Split description.
        self.feat_id, self.threshold = feat_id, threshold
        # Leaf flag and the label stored on the node.
        self.is_leaf, self.y_pred = is_leaf, y_pred
        # Child links.
        self.left, self.right = left, right
        self.root = None

class DecisionTree:
    """Binary (0/1) decision-tree classifier with median splits.

    Every internal node splits on the feature whose median split gives the
    highest information gain. Rows with a feature value above the threshold go
    to the left child, the others to the right child.

    Parameters
    ----------
    max_depth : int
        Nodes at this depth become leaves.
    verbose : bool
        When True, intermediate shapes and leaf contents are printed.
    """

    def __init__(self,max_depth = 5, verbose = False):
        self.max_depth = max_depth
        self.verbose = verbose
        # Set by train(); None means the tree has not been fitted yet.
        self.root = None

    def entropy(self, y):
        '''
        Entropy (natural log) of a binary 0/1 label vector of shape (n,).
        Returns 0 for an empty or pure vector.
        '''
        n = y.shape[0]
        if n == 0:
            # An empty child carries no information (and would divide by zero).
            return 0
        p_1 = np.count_nonzero(y == 1)/n
        p_0 = np.count_nonzero(y == 0)/n
        if p_0 <= 0 or p_1 <= 0:
            return 0
        return -p_0*np.log(p_0) - p_1*np.log(p_1)

    def gain(self, Xcol, y):
        '''
        Information gain of splitting a continuous feature at its median.

        Xcol shape = (n,) ; y shape = (n,)
        Returns (gain, threshold, indices1, indices2) where indices1 selects
        the rows with Xcol > threshold and indices2 the rows with
        Xcol <= threshold, both in the tuple form returned by np.where.
        '''
        nrow = Xcol.shape[0]
        if self.verbose:
            print ("Inside Gain: ", Xcol.shape, y.shape)

        threshold = np.median(Xcol)
        indices1 = np.where(Xcol > threshold)
        indices2 = np.where(Xcol <= threshold)
        y1 = y[indices1]
        y2 = y[indices2]
        # np.where returns a tuple of index arrays, so the number of selected
        # rows is the length of its first element, not of the tuple itself.
        w1 = len(indices1[0])/nrow
        w2 = 1 - w1

        g = self.entropy(y) - w1*self.entropy(y1) - w2*self.entropy(y2)
        return g, threshold, indices1, indices2

    def select_feature(self, X,y):
        '''
        Evaluate the median split of every column of X and return
        (best_feat_id, max_gain, best_threshold, best_indices1, best_indices2).
        On ties the lowest feature id wins.
        '''
        nrow, ncol = X.shape
        if self.verbose: print (X.shape)

        max_gain = -float("inf"); best_feat_id = None; best_threshold = None; best_indices1 = None; best_indices2 = None
        for feat_id in range(ncol):
            cur_gain, threshold, indices1, indices2 = self.gain(X[:,feat_id], y)
            if cur_gain > max_gain:
                max_gain = cur_gain
                best_feat_id = feat_id
                best_threshold = threshold
                best_indices1 = indices1
                best_indices2 = indices2

        return best_feat_id, max_gain, best_threshold, best_indices1, best_indices2

    def _build(self, X,y, cur_depth):
        '''
        Recursively grow the subtree for rows (X, y) and return its root Node.
        A leaf predicts 1 only when 1s strictly outnumber 0s.
        '''
        num_1 = np.count_nonzero(y == 1)
        num_0 = np.count_nonzero(y == 0)
        majority = int(num_1 > num_0)

        if cur_depth >= self.max_depth or num_1 == len(y) or num_0 == len(y):
            if self.verbose: print("inside build: ", y, num_1, num_0)
            return Node(is_leaf = True, y_pred = majority)

        best_feat_id, max_gain, best_threshold, best_indices1, best_indices2 = self.select_feature(X,y)
        y_left = y[best_indices1]
        y_right = y[best_indices2]
        if len(y_left) == 0 or len(y_right) == 0:
            # Degenerate split (e.g. a constant feature): one child would just
            # repeat this node, so stop here instead.
            return Node(is_leaf = True, y_pred = majority)

        root = Node(feat_id = best_feat_id, threshold = best_threshold)
        root.left = self._build(X[best_indices1], y_left, cur_depth+1)
        root.right = self._build(X[best_indices2], y_right, cur_depth+1)

        return root

    def train(self,X,y):
        '''Fit the tree on X (n, d) and binary labels y (n,).'''
        self.root = self._build(X,y, 0)

    def test(self, X):
        '''Predict a label for every row of X and return them as an array.'''
        return np.array([self.predict(x) for x in X])

    def predict(self, x):
        '''
        Predict the label of a single sample x of shape (d,).
        Raises RuntimeError when called before train().
        '''
        if self.root is None:
            raise RuntimeError("DecisionTree.predict called before train")
        curr = self.root
        while not curr.is_leaf:
            # Values above the threshold follow the left branch, as in _build.
            if x[curr.feat_id] > curr.threshold:
                curr = curr.left
            else:
                curr = curr.right
        if self.verbose: print ("Inside Predict: ", curr.is_leaf, curr.y_pred)
        return curr.y_pred

    def accuracy(self, y, y_pred):
        '''Fraction of positions where y and y_pred (both shape (n,)) agree.'''
        return float(np.mean(np.asarray(y) == np.asarray(y_pred)))
        


In [None]:
# Synthetic data: 3 uniform features; a row is labelled 1 when the mean of
# its features exceeds a threshold.
X = np.random.rand(10000,3)
#y = np.random.randint(0,2,1000)

# Draw the random threshold ONCE so train and test labels follow the same
# rule; drawing it separately for each set would make the test labels
# inconsistent with what the model was trained on.
label_threshold = 0.6 + np.random.rand()*0.3

y = np.where(np.mean(X, axis = 1) > label_threshold ,1,0)

X_test = np.random.rand(1000,3)
y_test = np.where(np.mean(X_test, axis = 1) > label_threshold ,1,0)

print (X.shape, y.shape, X_test.shape, y_test.shape)

In [None]:
# Fit the hand-written decision tree (default max_depth) on the training data.
DT = DecisionTree(verbose = False)
DT.train(X,y)

In [None]:
# Score the hand-written tree. Each printed label is paired with the
# predictions and ground truth of the matching split.
y_pred_train = DT.test(X)
y_pred = DT.test(X_test)
print ("Train Accuracy: ", DT.accuracy(y, y_pred_train))
print ("Test Accuracy: ", DT.accuracy(y_test, y_pred))

In [None]:
# Reference: scikit-learn's tree with the same depth limit and entropy criterion.
clf = DecisionTreeClassifier(random_state=0, max_depth = 5, criterion='entropy')
clf.fit(X,y)
y_pred_train = clf.predict(X)
y_pred = clf.predict(X_test)
# DT.accuracy is reused as the scoring helper; each label is paired with the
# predictions and ground truth of the matching split.
print ("Train Accuracy: ", DT.accuracy(y, y_pred_train))
print ("Test Accuracy: ", DT.accuracy(y_test, y_pred))

In [None]:

def printtree(curr):
    """Walk the tree rooted at `curr`: left subtree first, then right subtree.

    The per-node print statements are currently disabled, so the traversal
    produces no output and always returns None. A falsy `curr` (e.g. None) and
    leaf nodes end the recursion.
    """
    if not curr or curr.is_leaf:
        # Nothing to descend into. Re-enable the line below to inspect nodes:
        #print(curr.feat_id, curr.threshold, curr.is_leaf, curr.y_pred)
        return
    for child in (curr.left, curr.right):
        printtree(child)
# Traverse the fitted tree from its root (relies on DT having been trained in an earlier cell).
printtree(DT.root)

In [None]:
class RandomForest:
    """Bagged ensemble of DecisionTree classifiers with per-tree feature subsets.

    Every tree is trained on a bootstrap sample of the rows (drawn with
    replacement) restricted to a random subset of the features (drawn without
    replacement). Predictions are a majority vote over the trees.

    Parameters
    ----------
    n_trees : int
        Number of trees to train.
    verbose : bool
        When True, the data shape and feature count are printed for every tree.
    max_features : int or None
        Number of features given to each tree. None selects
        min(2, int(sqrt(n_features))). The value is always clamped to
        [1, n_features].

    Attributes
    ----------
    trees : list of fitted DecisionTree objects
    tree_features : list of index arrays; tree_features[i] holds the columns
        of the original X that trees[i] was trained on
    """

    def __init__(self, n_trees =10,verbose = True, max_features = None):
        self.verbose = verbose
        self.n_trees = n_trees
        self.max_features = max_features
        self.trees = []
        self.tree_features = []

    def _n_features_per_tree(self, nfeat):
        """Resolve max_features into a concrete count between 1 and nfeat."""
        if self.max_features is None:
            wanted = min(2, int(np.sqrt(nfeat)))
        else:
            wanted = int(self.max_features)
        return max(1, min(wanted, nfeat))

    def _sample_for_tree(self, X, y):
        """Draw one tree's training data; returns (X_sample, y_sample, feature_ids)."""
        nrow, nfeat = X.shape
        n_chosen = self._n_features_per_tree(nfeat)
        if self.verbose: print (X.shape, y.shape, n_chosen)

        # Random sampling without replacement for features (sorted so the
        # column order of the sample follows the original column order).
        chosen_feat_id = np.sort(np.random.choice(nfeat, n_chosen, replace = False))
        # Random sampling with replacement for rows (bootstrap).
        chosen_row_id = np.random.choice(nrow, nrow, replace = True)

        X_filtered = X[chosen_row_id][:, chosen_feat_id]
        y_filtered = y[chosen_row_id]
        return X_filtered, y_filtered, chosen_feat_id

    def get_data_for_tree(self, X,y):
        """Return (X_sample, y_sample): bootstrapped rows restricted to a random feature subset."""
        X_filtered, y_filtered, _ = self._sample_for_tree(X, y)
        return X_filtered, y_filtered

    def train(self, X,y):
        """Fit n_trees trees on X (n, d) and binary labels y (n,).

        Any previously trained trees are discarded, so calling train twice
        does not grow the ensemble.
        """
        self.trees = []
        self.tree_features = []

        for tree_num in range(self.n_trees):
            X_tree, y_tree, feat_ids = self._sample_for_tree(X, y)
            DT = DecisionTree()
            DT.train(X_tree, y_tree)
            self.trees.append(DT)
            self.tree_features.append(feat_ids)

    def test(self, X):
        """Predict a label for every row of X and return them as an array."""
        return np.array([self.predict(x) for x in X])

    def predict(self,x):
        """Majority vote of all trees for one sample x of shape (d,).

        Returns 1 only when trees voting 1 strictly outnumber trees voting 0.
        """
        x = np.asarray(x)
        votes = []
        for idx, tree in enumerate(self.trees):
            # Each tree only sees the columns it was trained on. A tree with
            # no recorded feature subset receives the full sample.
            feats = self.tree_features[idx] if idx < len(self.tree_features) else slice(None)
            votes.append(tree.predict(x[feats]))
        res = np.array(votes)
        num_1 = np.count_nonzero(res == 1)
        num_0 = np.count_nonzero(res == 0)

        return int(num_1 > num_0)

    def accuracy(self, y, y_pred):
        """Fraction of positions where y and y_pred (both shape (n,)) agree."""
        return float(np.mean(np.asarray(y) == np.asarray(y_pred)))
        

In [None]:
# Fit the hand-written random forest on the tree dataset from the earlier cell.
# verbose=True makes the forest print a line for every tree it samples data for.
RF = RandomForest(verbose = True)
RF.train(X,y)

In [None]:
# Score the forest. Each printed label is paired with the predictions and
# ground truth of the matching split.
y_pred_train = RF.test(X)
y_pred = RF.test(X_test)

print ("Train Accuracy: ", RF.accuracy(y, y_pred_train))
print ("Test Accuracy: ", RF.accuracy(y_test, y_pred))

In [None]:
### Build KMeans Clustering Class -- Start Time = 7:50 PM

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 

class KMeans:
    """K-means clustering with Euclidean distance.

    Parameters
    ----------
    k : int
        Number of clusters; must be at least 2.
    n_iter : int
        Maximum number of assign/update iterations.
    init_method : str
        'kmeans++' selects the farthest-point initialisation in
        param_init_smart; any other value picks k distinct random rows.
    verbose : bool
        When True, progress information is printed.
    random_state : int or False
        Seed for NumPy's global RNG, applied before initialisation. False
        (the default) or None leaves the RNG untouched. 0 is a valid seed.

    Raises
    ------
    ValueError
        If k is less than 2.

    Attributes (set by fit)
    -----------------------
    centroids : sequence of k arrays of shape (d,)
    cluster_assignment : ndarray of shape (n,) with the cluster id of each training row
    """

    def __init__(self, k, n_iter = 30 , init_method = 'random' ,verbose = False, random_state = False):
        if k <=1:
            raise ValueError("k cannot be less than 2")
        self.k = k
        self.n_iter = n_iter
        self.verbose = verbose
        self.random_state = random_state
        self.init_method = init_method

    def _seed_rng(self):
        """Seed NumPy's global RNG when a random_state was supplied."""
        # Identity checks so that a seed of 0 is not mistaken for "no seed".
        if self.random_state is not False and self.random_state is not None:
            np.random.seed(self.random_state)

    def param_init(self, X):
        '''
        Pick k distinct rows of X (shape (nrows, d)) as the initial centroids.
        Raises ValueError when X has fewer than k rows.
        '''
        nrow = X.shape[0]
        if self.k > nrow:
            raise ValueError("k cannot exceed the number of rows in X")
        self._seed_rng()
        # Without replacement: the same row twice would create two identical
        # centroids and therefore an empty cluster.
        chosen_indices = np.random.choice(nrow, self.k, replace = False)
        return X[chosen_indices]

    def get_new_centroid(self, X, centroids):
        '''
        Return the row of X whose distance to its nearest existing centroid is largest.
        '''
        nearest_dists = []
        for point in X:
            dists = [self.distance(point, centroid) for centroid in centroids]
            nearest_dists.append(np.min(dists))
        return X[np.argmax(nearest_dists)]

    def param_init_smart(self,X):
        '''
        Farthest-point initialisation: start from one random row, then
        repeatedly add the row farthest from all centroids chosen so far.
        Returns an array of shape (k, d).
        '''
        nrow = X.shape[0]
        self._seed_rng()
        index = np.random.randint(0,nrow)
        if self.verbose: print ("Inside param init smart: ", index)
        centroids = [X[index]]
        for i in range(1,self.k):
            centroids.append(self.get_new_centroid(X, centroids))
        return np.array(centroids)

    def compute_centroid(self, X, cluster_assignment, prev_centroids = None):
        '''
        Recompute the centroids (list of k arrays) for a cluster assignment.

        A cluster with no members keeps its previous centroid when
        prev_centroids is given, and falls back to the mean of all rows of X
        otherwise, so the result never contains NaN.
        '''
        centroids = []
        for cluster_id in range(self.k):
            X_cluster = X[np.where(cluster_assignment == cluster_id)]
            if len(X_cluster) > 0:
                centroids.append(np.mean(X_cluster, axis = 0))
            elif prev_centroids is not None:
                centroids.append(np.asarray(prev_centroids[cluster_id]))
            else:
                centroids.append(np.mean(X, axis = 0))
        return centroids

    def assign_cluster(self, X, centroids):
        '''
        Return, for each row of X, the index of its nearest centroid (array of shape (n,)).
        '''
        cluster_assignment = []
        for point in X:
            assigned_cluster_id = np.argmin([self.distance(point, centroid) for centroid in centroids])
            cluster_assignment.append(assigned_cluster_id)
        return np.array(cluster_assignment)

    def distance(self, a,b):
        '''
        Euclidean distance between a and b (shape (d,) each; b may also be a scalar).
        '''
        return np.sum((a - b)**2)**0.5

    def fit(self, X):
        '''
        Cluster the rows of X (shape (n, d)).

        Alternates assignment and centroid update for at most n_iter
        iterations and stops early once the assignment no longer changes.
        Stores the result in self.centroids and self.cluster_assignment.
        '''
        if self.init_method =='kmeans++':
            centroids = self.param_init_smart(X)
        else:
            centroids = self.param_init(X)

        prev_clus_assignment = None
        cluster_assignment = None
        for iterno in range(self.n_iter):
            cluster_assignment = self.assign_cluster(X, centroids)
            if self.verbose: print ("Inside Fit: ", cluster_assignment)
            if prev_clus_assignment is not None and np.array_equal(cluster_assignment, prev_clus_assignment):
                if self.verbose: print ("Exiting Iterations early", iterno)
                break
            centroids = self.compute_centroid(X, cluster_assignment, centroids)
            prev_clus_assignment = cluster_assignment

        if cluster_assignment is None:
            # n_iter == 0: still report the assignment for the initial centroids.
            cluster_assignment = self.assign_cluster(X, centroids)
        self.centroids= centroids
        self.cluster_assignment = cluster_assignment

    def predict(self, X):
        '''
        Return the nearest-centroid cluster id for each row of X (shape (n, d)).
        '''
        return self.assign_cluster(X, self.centroids)

    def norm_sse(self, X):
        '''
        Within-cluster sum of squared errors divided by the SSE of a single
        cluster centred on the per-feature mean of X (i.e. the k = 1 SSE).

        X must be the data passed to fit, because the stored
        cluster_assignment is used. Returns 0.0 when all rows of X are identical.
        '''
        SSE = 0
        for cluster_id in range(self.k):
            X_cluster = X[np.where(self.cluster_assignment == cluster_id)]
            SSE += np.sum([self.distance(point, self.centroids[cluster_id])**2 for point in X_cluster])

        # Per-feature mean: np.mean(X) without an axis would collapse all
        # features into one scalar, which is not the k = 1 centroid.
        overall_centroid = np.mean(X, axis = 0)
        SSE_1_cluster = np.sum([self.distance(point, overall_centroid)**2 for point in X])
        if SSE_1_cluster == 0:
            return 0.0
        return SSE/SSE_1_cluster


In [None]:
# Generate Some data for Clustering
# Three 2-D Gaussian blobs: 100 points around 20 (scale 2), 120 points around
# -10 (scale 3) and 100 points around 1 (scale 1).
X1 = 2*np.random.randn(100, 2) + 20
X2 = -3*np.random.randn(120,2) - 10
X3 = np.random.randn(100,2) + 1

# Stack the blobs into one (320, 2) matrix.
X = np.vstack((X1, X2, X3))

# Standardise every feature to zero mean / unit variance.
# NOTE: X is overwritten in place, so X1/X2/X3 stay on the original scale.
X_mean = np.mean(X, axis = 0)
X_std = np.std(X, axis = 0)
X = (X - X_mean)/ X_std



In [None]:
# Fit k-means 10 times without a fixed seed to see how much the result
# depends on the random starting point.
for _ in range(10):
    km = KMeans(k = 3, verbose = False, random_state = False, init_method ='kmeans++')
    km.fit(X)
    print ("SSE: ", km.norm_sse(X))
    print ("centers: ", km.centroids)
    #km.predict(X3)
# After the loop, km holds the model from the last run.

In [None]:
# Plot the point with cluster 
# One colour per cluster id; the list has three entries, matching k = 3 of
# the model fitted in the previous cell.
colors = ['red', 'green', 'blue']
c = [colors[i] for i in km.cluster_assignment]

plt.scatter(X[:,0], X[:,1], c = c)
plt.xlabel("Feature1")
plt.ylabel("Feature2")
plt.title("Kmeans Clustering Example")
plt.show()

In [None]:
# Plot the Elbow Curve
# Fit one model per k in 2..9 (default initialisation) and record its
# normalised SSE; the values are plotted in the next cell.
SSE_list = []; k_val = []
for k in range(2,10):
    km = KMeans(k = k, verbose = False)
    km.fit(X)
    SSE = km.norm_sse(X)
    SSE_list.append(SSE)
    k_val.append(k)
# NOTE: km is rebound here, so it now refers to the k = 9 model.
    

In [None]:
# Normalised SSE against k, using the lists filled in the previous cell.
plt.plot(k_val, SSE_list, 'o-')
plt.xlabel("k")
plt.ylabel("sse")
plt.title("Kmeans Elbow Curve")
plt.show()

In [None]:
# Scratch cell: two identical small arrays (rebinds `a` from the percentile cell).
a = np.array([1,0,1])
b = np.array([1,0,1])


In [3]:
import numpy as numpy
import pandas as pd
import random
from matplotlib import pyplot as plt

class kfold:
    """K-fold cross-validation splitter.

    Parameters
    ----------
    n_splits : int
        Number of folds; must be greater than 1 and at most the number of rows.
    shuffle : bool
        When True, row indices are shuffled before being dealt into folds.
    random_seed : int or False
        Seed for the shuffle. False (the default) or None gives a different
        shuffle on every call. 0 is a valid seed.
    verbose : bool
        When True, the index chunks are printed.

    Raises
    ------
    ValueError
        If n_splits is not greater than 1.
    """

    def __init__(self, n_splits = 5, shuffle = False, random_seed = False, verbose = False):
        self.n_splits = n_splits
        if n_splits <= 1:
            raise ValueError("num of splits shoud be > 1 and <= number of rows")
        self.shuffle = shuffle
        self.random_seed = random_seed
        self.verbose = verbose

    def split(self, X,y):
        '''
        Input: X ( Shape = (n,d) ); y (shape = (n,))

        Returns a list with one [X_train, y_train, X_test, y_test] entry per
        fold. The rows left over when n is not divisible by n_splits are given
        one each to the first folds, so fold sizes differ by at most one.
        Rows inside every returned array keep their original order; shuffling
        only changes which rows belong to which fold.

        Raises ValueError when X and y have different lengths or when
        n_splits exceeds the number of rows.
        '''
        nrow = X.shape[0]
        if len(y) != nrow:
            raise ValueError("X and y must have the same number of rows")
        if self.n_splits > nrow:
            raise ValueError("num of splits shoud be > 1 and <= number of rows")

        indices = np.arange(nrow)
        if self.shuffle:
            # A dedicated generator: the seed actually controls this shuffle
            # and NumPy's global RNG state is left untouched.
            seed = None if self.random_seed is False else self.random_seed
            np.random.RandomState(seed).shuffle(indices)

        num_of_rows_divisible = nrow - nrow%self.n_splits

        indices_splitted = np.split(indices[:num_of_rows_divisible], self.n_splits)
        indices_leftover = indices[num_of_rows_divisible:]
        if self.verbose: print ("indices spltited: ", indices_splitted)
        if self.verbose: print ("indices_leftover: ", indices_leftover)

        res = []
        for splitnum in range(self.n_splits):
            chosen_indices = indices_splitted[splitnum]
            # There are fewer leftovers than folds; only the first folds get one.
            if splitnum < len(indices_leftover):
                chosen_indices = np.append(chosen_indices, indices_leftover[splitnum])
            mask = np.zeros(nrow, dtype = bool)
            mask[chosen_indices] = True
            res.append([X[~mask], y[~mask], X[mask], y[mask]])

        return res

def accuracy(y_true, y_prob, threshold= 0.5):
    """Share of samples whose thresholded probability matches the true label.

    A probability strictly greater than `threshold` counts as class 1,
    anything else as class 0. The number of matches is divided by len(y_true).
    """
    predicted_labels = np.where(y_prob > threshold, 1, 0)
    n_correct = np.sum(predicted_labels == y_true)
    return n_correct/len(y_true)

In [6]:
from sklearn.linear_model import LogisticRegression

#X = np.random.rand(1002,3)
#y = np.random.randint(0,2, 1002)

# 10002 rows is deliberately not a multiple of 5, so the split has to hand
# out leftover rows (10002 % 5 == 2).
n = 10002
X,y = make_classification(n, 5)

kf = kfold(5, verbose = False, shuffle = True)

# Fit a fresh sklearn LogisticRegression on every fold and score it on the
# held-out part with the module-level accuracy() helper.
acc_list = []
for X_train, y_train, X_test, y_test in kf.split(X,y):
    print (X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    model = LogisticRegression()
    model.fit(X_train,y_train)
    y_prob = model.predict_proba(X_test)
    #print (y_pred)
    # Column 1 of predict_proba is used as the probability of class 1.
    acc = accuracy(y_test, y_prob[:,1])
    acc_list.append(acc)

# Mean accuracy over the 5 folds.
print (np.mean(acc_list))
    

(8001, 5) (8001,) (2001, 5) (2001,)
(8001, 5) (8001,) (2001, 5) (2001,)
(8002, 5) (8002,) (2000, 5) (2000,)
(8002, 5) (8002,) (2000, 5) (2000,)
(8002, 5) (8002,) (2000, 5) (2000,)
0.9214156421789106


In [None]:
# Scratch: reproduce the index arithmetic of kfold.split for 11 rows and 3 splits.
nrow = 11
indices = np.array(range(nrow))
# Largest row count divisible by 3 (11 - 11 % 3 = 9).
num_of_rows_divisible = len(indices) - len(indices)%3

In [None]:
# Three equal chunks from the divisible part, plus whatever rows remain
# (indices from num_of_rows_divisible onwards).
indices_splitted = np.split(indices[:num_of_rows_divisible], 3)
indices_leftover = np.array(indices[num_of_rows_divisible:])
print (indices_splitted, indices_leftover)

In [None]:
splitnum = 2
# With 11 rows and 3 splits there are only 2 leftover indices, so fold 2 has
# no leftover to take: indexing indices_leftover[2] directly raises
# IndexError. Use the same guard as kfold.split.
if splitnum >= len(indices_leftover):
    fold_indices = indices_splitted[splitnum]
else:
    fold_indices = np.append(indices_splitted[splitnum], indices_leftover[splitnum])
fold_indices

In [None]:
# Scratch: inspect the shape of whichever `y` was defined last in the kernel.
y.shape