In [189]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [190]:
# Load scikit-learn's bundled iris dataset; later cells read its .data and .target.
iris = datasets.load_iris()

In [191]:
# Wrap the iris feature matrix in a DataFrame with short column names
# (presumably sepal length/width and petal length/width -- confirm against
# iris.feature_names).
df = pd.DataFrame(iris.data, columns=["sl", "sw", 'pl', 'pw'])

In [192]:
def abc(k, *val):
    """Return 0 when ``k`` is strictly below the first extra argument, else 1.

    Shaped for ``Series.apply(abc, args=(threshold,))``: ``k`` receives each
    element and ``val[0]`` receives the threshold. Only ``val[0]`` is read;
    any further positional arguments are ignored.
    """
    return 0 if k < val[0] else 1

In [193]:
# Demo of apply() with extra args: each sl value becomes 0 if it is below 5, else 1.
df.sl.apply(abc, args=(5,))

0      1
1      0
2      0
3      0
4      1
5      1
6      0
7      1
8      0
9      0
10     1
11     0
12     0
13     0
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     0
23     1
24     0
25     1
26     1
27     1
28     1
29     0
      ..
120    1
121    1
122    1
123    1
124    1
125    1
126    1
127    1
128    1
129    1
130    1
131    1
132    1
133    1
134    1
135    1
136    1
137    1
138    1
139    1
140    1
141    1
142    1
143    1
144    1
145    1
146    1
147    1
148    1
149    1
Name: sl, dtype: int64

In [194]:
def label(val, *boundaries):
    """Map ``val`` to a bucket index in 0..3 using three cut points.

    The first three entries of ``boundaries`` are tested in order. The result
    is the index of the first cut point that ``val`` is strictly below, or 3
    when ``val`` is below none of them.
    """
    for bucket in range(3):
        if val < boundaries[bucket]:
            return bucket
    return 3

def toLabel(df, old_feature_name):
    """Bucket one column of ``df`` into the integer labels produced by ``label``.

    The three cut points are derived from the column itself:
      * first  -- halfway between the column minimum and its mean
      * second -- the column mean
      * third  -- halfway between the column maximum and its mean

    Returns the Series produced by applying ``label`` to every element;
    ``df`` itself is only read, never modified.
    """
    column = df[old_feature_name]
    second = column.mean()
    first = (column.min() + second) / 2
    third = (column.max() + second) / 2
    cut_points = (first, second, third)
    return column.apply(label, args=cut_points)

In [195]:
# Add a bucketed copy (labels from toLabel) of every raw measurement column,
# then display the frame.
for feature_name in ['sl', 'sw', 'pl', 'pw']:
    df[feature_name + '_labeled'] = toLabel(df, feature_name)
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,1,2,0,0
1,4.9,3.0,1.4,0.2,0,1,0,0
2,4.7,3.2,1.3,0.2,0,2,0,0
3,4.6,3.1,1.5,0.2,0,2,0,0
4,5.0,3.6,1.4,0.2,0,2,0,0
5,5.4,3.9,1.7,0.4,1,3,0,0
6,4.6,3.4,1.4,0.3,0,2,0,0
7,5.0,3.4,1.5,0.2,0,2,0,0
8,4.4,2.9,1.4,0.2,0,1,0,0
9,4.9,3.1,1.5,0.1,0,2,0,0


In [196]:
# Remove the raw measurement columns so only the *_labeled columns remain.
# NOTE: mutates df in place, so this cell cannot be re-run without
# rebuilding df from the cells above.
df.drop(columns=['sl', 'sw', 'pl', 'pw'], inplace=True)

In [197]:
# Sanity check: which bucket labels actually occur in sl_labeled.
set(df['sl_labeled'])

{0, 1, 2, 3}

In [198]:
# Append the iris class labels as the final column; the X/Y split below
# relies on the target being the last column.
df["output"] = iris.target

In [199]:
# Display the discretized features together with the output column.
df

Unnamed: 0,sl_labeled,sw_labeled,pl_labeled,pw_labeled,output
0,1,2,0,0,0
1,0,1,0,0,0
2,0,2,0,0,0
3,0,2,0,0,0
4,0,2,0,0,0
5,1,3,0,0,0
6,0,2,0,0,0
7,0,2,0,0,0
8,0,1,0,0,0
9,0,2,0,0,0


In [200]:
# Every column except the last is a feature; the last column is the target.
X, Y = df.values[:, :-1], df.values[:, -1]

In [201]:
from sklearn.model_selection import train_test_split
# Fixed random_state keeps the train/test partition repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 1)

In [202]:
# Rebuild a DataFrame from the training arrays with the target appended last,
# because fit() reads the final column as the class label.
train_df = pd.DataFrame(x_train)
train_df['output'] = y_train

In [203]:
def fit(data):
    """Tabulate per-class feature-value counts from a training DataFrame.

    The last column of ``data`` is taken as the class label; every other
    column is a feature, identified in the result by its position.

    Returns a nested dict:
        counts['total_count']     -> number of rows in ``data``
        counts[c]['total_count']  -> number of rows whose label is ``c``
        counts[c][i][v]           -> rows of class ``c`` whose i-th feature == ``v``

    Each feature's value set is taken from the whole of ``data``, so a value
    that never occurs within a class still gets an explicit count of 0 there.
    """
    target_column = data.columns[-1]
    feature_columns = data.columns[:-1]

    counts = {"total_count": len(data)}
    for class_value in set(data[target_column]):
        class_rows = data[data[target_column] == class_value]
        class_counts = {"total_count": len(class_rows)}
        for position, column in enumerate(feature_columns):
            class_counts[position] = {
                value: len(class_rows[class_rows[column] == value])
                for value in set(data[column])
            }
        counts[class_value] = class_counts
    return counts

In [204]:
def probability(dictionary, x, current_class):
    """Return the add-one-smoothed log score of ``current_class`` for sample ``x``.

    Parameters
    ----------
    dictionary : dict
        Count table in the layout produced by ``fit``:
        ``dictionary['total_count']``, ``dictionary[c]['total_count']`` and
        ``dictionary[c][j][value]``.
    x : sequence
        Feature values of one sample, indexed by feature position.
    current_class : hashable
        Class key to score; must be present in ``dictionary``.

    Returns
    -------
    float
        log P(class) + sum over features j of
        log((count(class, x[j]) + 1) / (count(class) + number of known values of j)).

    A feature value that was never recorded for feature ``j`` is treated as
    having count 0 (so it contributes the smoothed minimum) instead of
    raising ``KeyError``.
    """
    class_counts = dictionary[current_class]
    class_total = class_counts['total_count']
    # Every key of the per-class dict except 'total_count' is a feature index.
    n_features = len(class_counts) - 1

    output = 0
    for j in range(n_features):
        value_counts = class_counts[j]
        # .get(): values unseen during fit count as zero rather than crashing.
        smoothed_count = value_counts.get(x[j], 0) + 1
        smoothed_total = class_total + len(value_counts)
        output = output + (np.log(smoothed_count) - np.log(smoothed_total))

    # Class prior, estimated from the unsmoothed class frequency.
    class_prob = np.log(class_total) - np.log(dictionary['total_count'])
    return output + class_prob

In [205]:
def predictSinglePoint(x, dictionary):
    """Return the class key with the highest ``probability`` score for ``x``.

    Every key of ``dictionary`` other than ``'total_count'`` is treated as a
    candidate class. On ties the class encountered first wins. If there are
    no candidate classes at all, -1 is returned.
    """
    candidate_classes = (key for key in dictionary if key != 'total_count')
    return max(
        candidate_classes,
        key=lambda cls: probability(dictionary, x, cls),
        default=-1,
    )

In [206]:
def predict(dictionary, x_test):
    y_pred = []
    for x in x_test:
        x_class = predictSinglePoint(x, dictionary)
        y_pred.append(x_class)
    return y_pred

In [207]:
# Build the count table from the training split and display it for inspection.
dictionary = fit(train_df)
dictionary

{'total_count': 112,
 0: {'total_count': 37,
  0: {0: 24, 1: 13, 2: 0, 3: 0},
  1: {0: 1, 1: 7, 2: 25, 3: 4},
  2: {0: 37, 1: 0, 2: 0, 3: 0},
  3: {0: 37, 1: 0, 2: 0, 3: 0}},
 1: {'total_count': 34,
  0: {0: 3, 1: 12, 2: 17, 3: 2},
  1: {0: 10, 1: 18, 2: 6, 3: 0},
  2: {0: 0, 1: 6, 2: 28, 3: 0},
  3: {0: 0, 1: 8, 2: 26, 3: 0}},
 2: {'total_count': 41,
  0: {0: 1, 1: 4, 2: 27, 3: 9},
  1: {0: 4, 1: 24, 2: 12, 3: 1},
  2: {0: 0, 1: 0, 2: 17, 3: 24},
  3: {0: 0, 1: 0, 2: 15, 3: 26}}}

In [208]:
# Predictions of the hand-written classifier on the held-out rows.
y_pred = predict(dictionary, x_test)

In [209]:
# Convert to a plain list, matching the list that predict() returns.
y_test = list(y_test)

In [210]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [211]:
# Confusion matrix for the hand-written classifier (true labels first, predictions second).
confusion_matrix(y_test, y_pred)

array([[13,  0,  0],
       [ 0, 16,  0],
       [ 0,  0,  9]])

In [212]:
from sklearn.naive_bayes import MultinomialNB

In [213]:
# Baseline for comparison: scikit-learn's MultinomialNB on the same discretized split.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [214]:
# Rebinds y_pred (previously the hand-written model's predictions).
# NOTE(review): MultinomialNB appears to treat the bucket codes as counts rather
# than categories, which may explain the class-1 rows all landing in class 2 -- confirm.
y_pred = mnb.predict(x_test)
confusion_matrix(y_test, y_pred)

array([[13,  0,  0],
       [ 0,  0, 16],
       [ 0,  0,  9]])

In [215]:
from sklearn.naive_bayes import GaussianNB

In [216]:
# Re-split using the raw continuous measurements (same random_state as before).
# This rebinds x_train/x_test/y_train/y_test, replacing the discretized split.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 1) 

In [217]:
# Fresh MultinomialNB trained on the raw-feature split; rebinds mnb.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [218]:
# Confusion matrix for MultinomialNB on the raw-feature split.
y_pred = mnb.predict(x_test)
confusion_matrix(y_test, y_pred)

array([[13,  0,  0],
       [ 0,  0, 16],
       [ 0,  0,  9]])

In [219]:
# GaussianNB trained on the raw continuous measurements from the split above.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB(priors=None)

In [220]:
# Confusion matrix for GaussianNB on the raw-feature split.
y_pred = gnb.predict(x_test)
confusion_matrix(y_test, y_pred)

array([[13,  0,  0],
       [ 0, 15,  1],
       [ 0,  0,  9]])