### A) K-Nearest Neighbors (KNN)

In [1]:
import numpy as np
from collections import Counter
# Should use the `find_k_nearest_neighbors` function below.
def predict_label(examples, features, k, label_key="is_intrusive"):
    """Return the majority label among the ``k`` examples closest to ``features``.

    ``examples`` maps an id to a record holding a ``'feature'`` vector and a
    label stored under ``label_key``.  Closeness is the squared Euclidean
    distance over the first ``len(features)`` coordinates.  Examples at equal
    distance keep the iteration order of ``examples``; when several labels
    share the top vote count, the one met first among the neighbours wins.
    """
    n_dims = len(features)
    sq_dist = {}
    label_of = {}
    for pid, record in examples.items():
        vector = record['feature']
        sq_dist[pid] = np.sum([(vector[j] - features[j]) ** 2 for j in range(n_dims)])
        label_of[pid] = record[label_key]

    # Stable sort of the ids by distance, then keep the k closest.
    nearest = sorted(sq_dist, key=sq_dist.get)[:k]

    # Tally the neighbours' labels and rank them by descending vote count.
    votes = Counter(label_of[pid] for pid in nearest)
    ranked = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]


def find_k_nearest_neighbors(examples, features, k):
    """Return the ids of the ``k`` examples closest to ``features``.

    ``examples`` maps an id to a record with a ``'feature'`` vector.  Ids are
    ordered by ascending squared Euclidean distance (computed over the first
    ``len(features)`` coordinates); ties keep the iteration order of
    ``examples``.  Fewer than ``k`` ids are returned when there are fewer
    examples.
    """
    n_dims = len(features)

    def squared_distance(pid):
        vector = examples[pid]['feature']
        return np.sum([(vector[j] - features[j]) ** 2 for j in range(n_dims)])

    scored = [(pid, squared_distance(pid)) for pid in examples]
    scored.sort(key=lambda pair: pair[1])
    return [pid for pid, _ in scored][0:k]

In [3]:
# Toy training set: example id -> {'feature': [x, y], 'label': class}.
# The label is stored under 'label', whereas predict_label defaults to
# label_key="is_intrusive", so pass label_key='label' when classifying with it.
d = {'pid_1':{'feature':[1.2,1.3], 'label':0},'pid_4':{'feature':[2.42,1.99], 'label':1},'pid_3':{'feature':[2.4,3.3], 'label':1},'pid_2':{'feature':[1.52,1.36], 'label':0},'pid_5':{'feature':[1.02,1.73],'label':0}}

# Path: ML_Algorithm_fundamental/ML_algo.ipynb
# Query point; with k=1 the call returns a one-element list holding the id of
# the closest example.
feature = [2.4,3.5]
find_k_nearest_neighbors(d, feature,1)

['pid_3']

### B) K-Means Clustering

In [4]:
import random
import numpy as np

class Centroid:
    """A centroid location together with an initially empty ``closest_users`` set."""

    def __init__(self, location):
        # Every instance gets its own fresh set, so centroids never share members.
        self.closest_users = set()
        self.location = location


def get_k_means(user_feature_map, num_features_per_user, k, max_iterations=200):
    """Cluster users into ``k`` groups and return the final centroid locations.

    The initial centroids are ``k`` users drawn with a fixed random seed, so
    the result is deterministic for a given input.  Each pass assigns every
    user to the centroid with the smallest L1 (sum of absolute differences)
    distance and then moves each centroid to the mean of its members.

    Args:
        user_feature_map: mapping of user id -> feature vector.
        num_features_per_user: length of every feature vector.
        k: number of centroids.
        max_iterations: upper bound on assignment/update passes (default 200).

    Returns:
        A list of ``k`` centroids, each a list of ``num_features_per_user``
        floats, in the order of the initially sampled users.
    """
    # Don't change the following two lines of code.
    random.seed(42)
    # Gets the inital users, to be used as centroids.
    inital_centroid_users = random.sample(sorted(list(user_feature_map.keys())), k)
    if k == 0:
        return []

    points = np.array(list(user_feature_map.values()), dtype=float).reshape(
        -1, num_features_per_user
    )
    centroid_feat = np.array(
        [user_feature_map[c] for c in inital_centroid_users], dtype=float
    ).reshape(k, num_features_per_user)

    for _ in range(max_iterations):
        # (n_users, k) matrix of L1 distances; argmin breaks ties towards the
        # lowest centroid index.
        distances = np.abs(
            points[:, np.newaxis, :] - centroid_feat[np.newaxis, :, :]
        ).sum(axis=2)
        assign = distances.argmin(axis=1)

        updated = centroid_feat.copy()
        for idx in range(k):
            members = points[assign == idx]
            # A centroid that attracted no users keeps its previous location
            # instead of turning into NaN (which used to crash the update).
            if len(members):
                updated[idx] = members.mean(axis=0)

        # Once the centroids stop moving, further passes cannot change them.
        if np.array_equal(updated, centroid_feat):
            break
        centroid_feat = updated

    return [list(centroid_feat[i]) for i in range(k)]

# Sample user -> feature-vector map for get_k_means.
# The vector [2.42, 1.99] used to be stored under a second 'pid_2' key and was
# silently overwritten by the later 'pid_2' entry; it is keyed 'pid_4' here
# (the otherwise unused id in the sequence) so that all eleven users are kept.
d = {
    'pid_1': [1.2, 1.3],
    'pid_4': [2.42, 1.99],
    'pid_3': [2.4, 3.3],
    'pid_2': [1.52, 1.36],
    'pid_5': [1.02, 1.73],
    'pid_6': [2.52, 2.36],
    'pid_7': [1.12, 1.93],
    'pid_8': [2.42, 1.96],
    'pid_9': [1.22, 1.83],
    'pid_10': [2.52, 1.96],
    'pid_11': [1.05, 1.71],
}


In [10]:
#kmeans

from sklearn.datasets import make_blobs
# NOTE(review): `centers` is not defined anywhere in the visible notebook;
# presumably it was set in an earlier, since-removed cell -- confirm, otherwise
# this line raises NameError on a fresh kernel.
X_train, true_labels = make_blobs(n_samples=100, centers=centers, random_state=42)
# Re-key the 100 generated samples as 'pid_0' .. 'pid_99' -> feature vector.
X_train = dict(zip(['pid_'+str(i) for i in range(100)],X_train))
# Two features per sample, a single centroid requested.
get_k_means(X_train, 2, k=1)


[[-2.4476715310291493, 3.1452113041203607]]

### C) Multinomial Naive Bayes - text classification

In [75]:
class NB_multinomial:
    """Naive Bayes classifier over tokenised articles grouped by tag.

    ``data`` maps each tag to a list of articles, an article being a list of
    words.  The model is trained on construction.

    Word likelihoods are smoothed as
    ``(count + alpha) / (words_in_tag + 2 * alpha)`` and a word never seen
    during training contributes a likelihood of 0.5 for every tag.
    """

    def __init__(self, data, alpha):
        """Store the training data and smoothing strength, then train."""
        self.alpha = alpha
        self.data = data
        self.train()

    def train(self):
        """(Re)compute priors, word counts and smoothed likelihoods.

        Returns:
            The model itself, so calls can be chained.
        """
        # Imported here so the class works regardless of which notebook cells
        # have already been run.
        from collections import defaultdict

        # Prior of a tag = share of all articles carrying that tag.
        total = np.sum([len(articles) for articles in self.data.values()])
        self.priors = {
            tag: len(articles) / total for tag, articles in self.data.items()
        }

        # word -> {tag: occurrences}, and tag -> total number of words.
        self.count_word_per_tag = defaultdict(lambda: {tag: 0 for tag in self.data})
        self.total_word_count_tag = defaultdict(int)
        for tag, articles in self.data.items():
            for article in articles:
                for word in article:
                    self.count_word_per_tag[word][tag] += 1
                    self.total_word_count_tag[tag] += 1

        # word -> {tag: smoothed likelihood}.
        self.word_liklihood_tag = defaultdict(lambda: {tag: 0.5 for tag in self.data})
        for word, tag_map in self.count_word_per_tag.items():
            for tag, count in tag_map.items():
                self.word_liklihood_tag[word][tag] = (count + self.alpha) / (
                    self.total_word_count_tag[tag] + 2 * self.alpha
                )
        return self

    def predict(self, article):
        """Score ``article`` (a list of words) against every tag.

        Returns:
            A tuple ``(scores, best_tag)`` where ``scores`` maps each tag to
            its log prior plus the summed log word likelihoods, and
            ``best_tag`` is the tag with the highest score.
        """
        prediction = {}
        for tag in self.data:
            score = np.log(self.priors[tag])
            for word in article:
                # Plain .get() lookups: unseen words fall back to 0.5 without
                # being inserted into the trained model.
                likelihood = self.word_liklihood_tag.get(word, {}).get(tag, 0.5)
                score += np.log(likelihood)
            prediction[tag] = score
        return prediction, max(prediction, key=prediction.get)


# Tiny labelled corpus: tag -> list of tokenised articles.
data = {
    'sports': [
        ['the', 'team', 'played', 'a', 'great', 'game'],
        ['the', 'game', 'was', 'awesome'],
        ['i', 'love', 'baseball'],
        ['i', 'hate', 'tennis'],
    ],
    'not_sports': [
        ['i', 'love', 'my', 'dog'],
        ['my', 'dog', 'hates', 'me'],
        ['the', 'cat', 'scratched', 'me'],
        ['i', 'hate', 'cats'],
    ],
}

In [78]:
# The constructor already calls train(), so the model is usable right away.
nb = NB_multinomial(data,alpha=1)
# train() returns the model itself, so `t` refers to the same object as `nb`;
# this call simply recomputes the same statistics.
t = nb.train()
# Returns (per-tag log scores, best tag) for the tokenised article.
nb.predict(['baseball','is','tough','game'])

({'sports': -6.068425588244111, 'not_sports': -7.7458682297922685}, 'sports')

### D) Recursive Partitioning and Regression Trees (RPART)    

In [11]:
class TreeNode:
    """Node of a regression tree holding the training examples that reach it.

    Every example is a dict of numeric features plus a ``'target'`` entry.
    After ``split()`` an internal node has ``left``/``right`` children and a
    ``split_point`` dict (``feature``, ``value``, ``split_index``, ``mse``);
    a leaf keeps all three set to ``None``.
    """

    def __init__(self, examples):
        self.examples = examples
        self.left = None
        self.right = None
        self.split_point = None

    def split(self):
        """Recursively split this node on the threshold with the lowest error.

        Candidate thresholds are the midpoints between consecutive sorted
        values of every feature.  The node stays a leaf when it holds at most
        one example or when no threshold separates its examples (for instance
        when they all share the same feature values).
        """
        # Start from a clean state so calling split() again rebuilds the subtree.
        self.left = None
        self.right = None
        self.split_point = None
        if len(self.examples) <= 1:
            return

        best_split = {'feature': None, 'value': None, 'split_index': None,
                      'mse': float('inf')}
        # Every key except the target is a feature, wherever 'target' sits.
        features = [key for key in self.examples[0] if key != 'target']
        for feat in features:
            # Sort a copy: the caller's list must not be reordered.
            ordered = sorted(self.examples, key=lambda example: example[feat])
            for lower, upper in zip(ordered, ordered[1:]):
                feat_val = (lower[feat] + upper[feat]) / 2
                mse, index = self.mse_split(feat, feat_val)
                # Strict comparison keeps the first of several equally good splits.
                if best_split['mse'] > mse:
                    best_split = {'feature': feat, 'value': feat_val,
                                  'split_index': index, 'mse': mse}

        if best_split['feature'] is None:
            # No threshold puts examples on both sides: remain a leaf.
            return

        print(best_split)
        self.split_point = best_split
        # Ordered by the chosen feature, the first `split_index` examples are
        # exactly those with a value <= the threshold.
        self.examples = sorted(
            self.examples, key=lambda example: example[best_split['feature']]
        )
        cut = best_split['split_index']
        self.left = TreeNode(self.examples[:cut])
        self.left.split()
        self.right = TreeNode(self.examples[cut:])
        self.right.split()

    def mse_split(self, feat, feat_val):
        """Evaluate splitting this node's examples at ``feat <= feat_val``.

        Returns:
            ``(mse, split_id)`` where ``mse`` is the size-weighted average of
            the two sides' mean squared errors around their own target mean
            and ``split_id`` is the number of examples on the left side.
            ``mse`` is ``inf`` when either side would be empty, so such a
            split can never be selected.
        """
        left_bpds = [example['target'] for example in self.examples
                     if example[feat] <= feat_val]
        right_bpds = [example['target'] for example in self.examples
                      if example[feat] > feat_val]
        split_id = len(left_bpds)
        if not left_bpds or not right_bpds:
            return float('inf'), split_id

        left_mean, right_mean = np.mean(left_bpds), np.mean(right_bpds)
        left_mse = np.sum([(t - left_mean) ** 2 for t in left_bpds]) / len(left_bpds)
        right_mse = np.sum([(t - right_mean) ** 2 for t in right_bpds]) / len(right_bpds)
        total_mse = (len(left_bpds) * left_mse + len(right_bpds) * right_mse) / (
            len(right_bpds) + len(left_bpds)
        )
        return total_mse, split_id
class RegressionTree:
    """Regression tree rooted at a ``TreeNode``; it is trained on construction."""

    def __init__(self, examples):
        self.root = TreeNode(examples)
        self.train()

    def train(self):
        """Grow the tree by splitting from the root."""
        self.root.split()

    def predict(self, example):
        """Return the mean target of the leaf that ``example`` falls into.

        At every node that has both children, the example goes left when its
        value for the node's split feature is <= the split value, and right
        otherwise.
        """
        current = self.root
        while current.left and current.right:
            rule = current.split_point
            goes_left = example[rule['feature']] <= rule['value']
            current = current.left if goes_left else current.right
        targets = [row['target'] for row in current.examples]
        return sum(targets) / len(targets)

In [12]:
#Regression tree
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

# Load the dataset once and reuse it for both the feature matrix and the
# target column (it used to be fetched twice just to build one array).
housing = fetch_california_housing()
columns = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
    'target',
]
# One dict per row, with the target appended as the last column.
data = pd.DataFrame(
    np.concatenate([housing['data'], housing['target'].reshape(-1, 1)], axis=1),
    columns=columns,
).to_dict('records')
# Keep the first 100 rows only.
data = data[0:100]

# The constructor already trains the tree; the explicit train() rebuilds it.
r = RegressionTree(data)
r.train()
r.predict({'MedInc': 3.8462,
 'HouseAge': 52.0,
 'AveRooms': 6.281853,
 'AveBedrms': 1.081081,
 'Population': 565.0,
 'AveOccup': 2.181467,
 'Latitude': 37.85,
 'Longitude': -122.25})


MedInc
HouseAge
AveRooms
AveBedrms
Population
AveOccup
Latitude
Longitude
{'feature': 'Longitude', 'value': -122.26, 'split_index': 91, 'mse': 0.33519456665813185}
MedInc
HouseAge
AveRooms
AveBedrms
Population
AveOccup
Latitude
Longitude
{'feature': 'Longitude', 'value': -122.28, 'split_index': 41, 'mse': 0.25086610973712625}
MedInc
HouseAge
AveRooms
AveBedrms
Population
AveOccup
Latitude
Longitude
{'feature': 'Longitude', 'value': -122.29, 'split_index': 17, 'mse': 0.055946140064562416}
MedInc
HouseAge
AveRooms
AveBedrms
Population
AveOccup
Latitude
Longitude
{'feature': 'Population', 'value': 921.0, 'split_index': 15, 'mse': 0.02575013137254902}
MedInc
HouseAge
AveRooms
AveBedrms
Population
AveOccup
Latitude
Longitude
{'feature': 'AveRooms', 'value': 6.227536053247874, 'split_index': 10, 'mse': 0.02221850666666667}
MedInc
HouseAge
AveRooms
AveBedrms
Population
AveOccup
Latitude
Longitude
{'feature': 'AveRooms', 'value': 3.073690544049655, 'split_index': 2, 'mse': 0.003980637499999998

3.422

### E) ANN - Implementing a single neuron (perceptron) from scratch

In [13]:
class Neurone:
    """Single sigmoid neuron trained with mini-batch gradient descent.

    ``examples`` is a list of dicts with a ``'feats'`` vector (``n_feats``
    values) and a ``'label'``.  The weight vector ``W`` has ``n_feats + 1``
    entries; the last one is the bias.  The neuron is trained on construction.
    """

    def __init__(self, examples, n_feats=3):
        # Seeds numpy's global generator so the initial weights are reproducible.
        np.random.seed(100)
        self.examples = examples
        self.n_feats = n_feats
        self.W = np.random.normal(0, 1, self.n_feats + 1)
        self.train()

    def train(self, learning_rate=0.1, batch_size=10, epochs=5000, verbose=True):
        """Run gradient descent, continuing from the current weights.

        Args:
            learning_rate: step size of each weight update.
            batch_size: number of examples per weight update.
            epochs: number of passes over all examples.
            verbose: when true, print the mean absolute error every 100 epochs.
        """
        n_weights = self.n_feats + 1
        # Work on private copies with the bias input appended, so the caller's
        # 'feats' lists are left untouched and repeated calls behave the same.
        rows = [list(e['feats'][:self.n_feats]) + [1] for e in self.examples]
        labels = [e['label'] for e in self.examples]
        n_examples = len(rows)
        if n_examples == 0:
            return

        # Ceiling division: every example belongs to exactly one batch.
        no_batch = -(-n_examples // batch_size)

        for epoch in range(epochs):
            Error = 0
            for batch in range(no_batch):
                start = batch * batch_size
                stop = start + batch_size
                grad = np.zeros(n_weights, float)
                # Per-example accumulation is kept as-is so that training stays
                # numerically reproducible.
                for row, label in zip(rows[start:stop], labels[start:stop]):
                    raw = np.sum([row[j] * self.W[j] for j in range(n_weights)])
                    pred = 1 / (1 + np.exp(-1 * raw))
                    error = pred - label
                    Error += np.abs(error)
                    for j in range(n_weights):
                        grad[j] += error * row[j]

                # Normalised by the nominal batch size, so a short final batch
                # takes a proportionally smaller step.
                grad = grad / batch_size
                self.W = self.W - learning_rate * grad
            Error = Error / n_examples
            if verbose and epoch % 100 == 0:
                print(np.mean(Error))

    def predict(self, features):
        """Return the sigmoid activation for ``features``, rounded to 5 decimals.

        ``features`` must hold ``n_feats`` values; it is not modified.
        """
        with_bias = list(features) + [1]
        val = np.sum(np.array(with_bias).reshape(1, len(self.W)) * self.W)
        return np.round(1 / (1 + np.exp(-(val))), 5)

# Six labelled examples with three features each.
data = [{'feats':[0.1,0.11,0.3],'label':0},{'feats':[0.19,0.51,0.39],'label':1},{'feats':[0.01,0.18,0.23],'label':0},{'feats':[0.31,0.411,0.43],'label':1},{'feats':[0.51,0.611,0.35],'label':1},{'feats':[0.11,0.211,0.035],'label':0}]
# The constructor initialises the weights and already runs one full training.
N = Neurone(data)
# A second training run with the default settings, continuing from the
# weights learned above (train() does not reset them).
N.train()

0.5134133891414737
0.4724156898328847
0.43997363641644077
0.4110592570734297
0.38485707393166235
0.36112466206864835
0.3396586274471911
0.3202472460397928
0.30267930456762665
0.28675403837155194
0.2722868078050443
0.2591114069272577
0.2470802738534982
0.23606353222275936
0.2259474625470025
0.21663276508495125
0.2080328201376792
0.2000720541442754
0.1926844612088564
0.185812295713822
0.17940493304304786
0.1734178860525758
0.16781196094487874
0.1625525352615869
0.15760894138312717
0.15295394037198237
0.14856327273824943
0.14441527447762775
0.1404905483972728
0.13677168224301406
0.1332430064531595
0.12989038549284423
0.12670103768308044
0.12366337924930065
0.1207668889951889
0.1180019905779459
0.11535994983799676
0.11283278503457701
0.11041318817154451
0.108094455876098
0.10587042852604552
0.10373543651650753
0.10168425272084365
0.09971205033843178
0.0978143654380806
0.09598706360392584
0.09422631017364748
0.09252854362921599
0.09089045176017722
0.08930895027042794
0.08778116354291067
0.0

In [14]:
# Two unseen feature vectors used as a sanity check of the trained neuron.
feature = [[0.72,0.81,0.71],[0.12,0.11,0.41]]
# predict() returns the sigmoid output rounded to 5 decimals, hence the exact
# equality comparisons below.
assert N.predict(feature[0]) == 0.99999
assert N.predict(feature[1]) == 0.14156