# Concept Session
## Importing Libraries

In [1]:
import numpy as np
import pandas as pd 
from sklearn import datasets, neighbors
import matplotlib.pyplot as plt
from ipywidgets import interactive

%matplotlib inline

# Demo1: k-Nearest Neighbor

### Showing the impact of the number of neighbors on the prediction

In [5]:
def knn(k):
    iris = datasets.load_iris()
    
    # the data
    X = iris.data[:, 0:2]
    y = iris.target

    # learning the classifier
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(X, y)

    # Plot the decision boundary. Predict on a mesh
    h = 0.05
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                            np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    
    Z = Z.reshape(xx.shape)

    # create a contour plot
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, Z, alpha=0.2)

    # Plot also the training points
    plt.scatter(x=X[:, 0], y=X[:, 1], c=y)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i)"
                % (k))
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])

    plt.show()

In [3]:
interactive_plot = interactive(knn, k=(1, 20, 2))
output = interactive_plot.children[-1]

interactive_plot

interactive(children=(IntSlider(value=9, description='k', max=20, min=1, step=2), Output()), _dom_classes=('wi…

### If you play around with k, you should notice the following:
- for small k, there are very small regions around every point
- if you enlarge it, the region around the single yellow point on the left smoothens out fastly
- with medium k, reactivated regions pop up in the lower right, where the prediction (blue) is not intuitive anymore
- large k smoothens out this effect but handles many points as outliers

# Demo 2:
## 1. From count data to conditional distributions

### Dataset: Quinlan, J. Ross. "Induction of decision trees." Machine learning 1.1 (1986): 81-106.

### Input columns (outlook, temperature, humidity, windy) tell if a person plays golf given the Saturday morning attributes.

In [5]:
outlook = ['sunny']*2 + ['overcast'] + ['rain']*3 + ['overcast'] + ['sunny']*2 + ['rain'] + ['sunny'] + ['overcast'] * 2 + ['rain']
temperature = ['hot']*3 + ['mild'] + ['cool']*3+ ['mild'] + ['cool'] + ['mild']*3 + ['hot', 'mild']
humidity = ['high']*4 + ['normal']*3 + ['high'] + ['normal']*3 + ['high', 'normal', 'high']
windy = ['false', 'true'] + ['false'] * 3 + ['true']*2 + ['false']*3 + ['true']*2 + ['false', 'true']
play = ['0', '0', '1', '1', '1', '0', '1', '0', '1', '1', '1', '1', '1', '0']

df = pd.DataFrame(list(zip(outlook, temperature, humidity, windy, play)), columns = ['outlook', 'temperature', 'humidity', 'windy', 'play'])
df

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,hot,high,False,0
1,sunny,hot,high,True,0
2,overcast,hot,high,False,1
3,rain,mild,high,False,1
4,rain,cool,normal,False,1
5,rain,cool,normal,True,0
6,overcast,cool,normal,True,1
7,sunny,mild,high,False,0
8,sunny,cool,normal,False,1
9,rain,mild,normal,False,1


In [6]:
# for each input column: get the number of counts, the user plays
features = ['outlook', 'temperature', 'humidity', 'windy']

for feature in features:
    print(df.groupby('play')[feature].value_counts())

play  outlook 
0     sunny       3
      rain        2
1     overcast    4
      rain        3
      sunny       2
Name: outlook, dtype: int64
play  temperature
0     hot            2
      mild           2
      cool           1
1     mild           4
      cool           3
      hot            2
Name: temperature, dtype: int64
play  humidity
0     high        4
      normal      1
1     normal      6
      high        3
Name: humidity, dtype: int64
play  windy
0     true     3
      false    2
1     false    6
      true     3
Name: windy, dtype: int64


In [7]:
# get the conditional probabilities by dividing over all entries
for feature in features:
    print(df.groupby('play')[feature].value_counts() / df.groupby('play')[feature].count())

play  outlook 
0     sunny       0.600000
      rain        0.400000
1     overcast    0.444444
      rain        0.333333
      sunny       0.222222
Name: outlook, dtype: float64
play  temperature
0     hot            0.400000
      mild           0.400000
      cool           0.200000
1     mild           0.444444
      cool           0.333333
      hot            0.222222
Name: temperature, dtype: float64
play  humidity
0     high        0.800000
      normal      0.200000
1     normal      0.666667
      high        0.333333
Name: humidity, dtype: float64
play  windy
0     true     0.600000
      false    0.400000
1     false    0.666667
      true     0.333333
Name: windy, dtype: float64


In [8]:
# the marginal probabilities of 'play'
df['play'].value_counts()/ df['play'].count()

1    0.642857
0    0.357143
Name: play, dtype: float64

# 2. Predicting with sklearn's naive bayes


## plain categorical naive bayes
### preprocessing

In [9]:
# in sklearn, we need to transform all categories to integers. Therefore, we use its' "labelencoder"

from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder

def get_encoded_df(df, columns=None):
    if columns == None:
        columns = df.columns

    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

    return df


df_encoded = get_encoded_df(df)
df_encoded.head()
#

Unnamed: 0,outlook,temperature,humidity,windy,play
0,2,1,0,0,0
1,2,1,0,1,0
2,0,1,0,0,1
3,1,2,0,0,1
4,1,0,1,0,1


### initiating the model

In [10]:
features = ['outlook', 'temperature', 'humidity', 'windy']
labels = 'play'

X = df_encoded[features]
y = df_encoded[labels]

clf = CategoricalNB()
clf.fit(X, y)

CategoricalNB()

### making a prediction

In [11]:
x_new = np.array([[2, 0, 0, 1]])

clf.predict(x_new)

array([0])

### Laplacian smoothing is the default:

In [12]:
CategoricalNB?

## Gaussian Naive Bayes
is implemented in sklearn, but we can use only numerical features, therefore we generate new "random" data

In [13]:
from numpy.random import default_rng
rng = default_rng()
vals = rng.standard_normal(10)

# for each input column: get the number of counts, the user plays
features = ['number of clouds', 'temperature', 'humidity', 'wind speed']
n_samples = 100

n_clouds = 3 + rng.standard_normal(n_samples)*1
temperatures_numeric = 20 + rng.standard_normal(n_samples) * 2
humidity = 50 + rng.standard_normal(n_samples) * 20
wind_speed = 15 + rng.standard_normal(n_samples) * 5

X = np.array([n_clouds, temperatures_numeric, humidity, wind_speed]).T
print(X[:10])

# do not play, if the number of clouds is larger than 3, if the humidity is higher than 80 % and if the wind speed is larger than 20 km/h
y = np.ones((X.shape[0]))
y[X[:, 0] > 3] = 0
y[X[:, 2] > 80] = 0
y[X[:, 3] > 20] = 0
print(y[:10])

[[ 2.40515148 18.768406   80.79978688 15.01540824]
 [ 1.40554339 20.44641532 77.42156633 16.31462848]
 [ 3.25795038 20.14599138 -5.28858414 19.50406665]
 [ 2.29606264 21.6165749  42.28306827 23.07014771]
 [ 4.74826428 19.67959497 32.51911147 13.92230838]
 [ 2.50021424 18.62827249 91.96338489 12.80844948]
 [ 1.67911885 19.72777735 51.89456853 16.22988241]
 [ 2.46482375 20.69439061 78.63792283 13.65927081]
 [ 4.29051697 20.85406145 43.30385356 12.07912857]
 [ 3.31911481 23.22137124 49.58757578 13.68215535]]
[0. 1. 0. 0. 0. 0. 1. 1. 0. 0.]


In [14]:
# create a split to have prediction data
X_train, y_train = X[:79], y[:79]
X_test, y_test = X[79:], y[79:]

In [15]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB()

In [16]:
gnb.predict(X_test)

array([0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0.,
       1., 0., 0., 1.])

In [17]:
y_test

array([0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0.,
       1., 0., 0., 1.])

### Result: 

The classifier is fine, there is only one wrong prediction.