In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Regression

In [None]:
# Use sklearn to fit a linear regression via the closed-form solution

In [None]:
# Load the regression data; the cells below read its 'x' and 'y' columns.
df = pd.read_csv('linear1.csv')
df.head()

In [None]:
# Scatter plot of the raw (x, y) points on a square 10x10 inch figure.
_, ax = plt.subplots(figsize=(10, 10))

ax.scatter(df['x'], df['y'])

In [None]:
from sklearn.linear_model import LinearRegression

# scikit-learn expects a 2-D feature matrix, so the single 'x' column is
# turned into an (n_samples, 1) array; the target stays a 1-D Series.
X = df['x'].values[:, np.newaxis]
y = df['y']

# fit() returns the estimator itself, so construction and fitting can be chained.
linreg = LinearRegression().fit(X, y)
linreg

In [None]:
# Show the fitted coefficient array and intercept of the linear model.
linreg.coef_, linreg.intercept_

In [None]:
# Overlay the fitted regression line on the data.
_, ax = plt.subplots(figsize=(10, 10))

ax.scatter(df['x'], df['y'], alpha=0.7)

# Evaluate the model on 1000 evenly spaced points over [-2, 2] to draw the line.
# NOTE: this rebinds the notebook-level names `x` and `y`; after this cell `y`
# holds the predictions on the grid, not the training targets.
x = np.linspace(-2, 2, 1000).reshape(-1, 1)
y = linreg.predict(x)

ax.plot(x, y, color='red', linewidth=4)

In [None]:
# Use gradient descent for linear regression

In [None]:
# Fit y ~ m*x + c by batch gradient descent on the mean squared error,
# drawing the current line after every update so convergence is visible.

m = 0.
c = 0.

alpha = 0.1   # learning rate
epoch = 100   # number of gradient steps (one subplot per step)

X = df['x']
y = df['y']

# Number of training samples. The gradients below are averages over the data,
# so this must be the dataset size, not the length of a plotting grid.
n = float(len(X))

# One panel per epoch, 10 per row (10x10 for the default 100 epochs).
n_cols = 10
n_rows = max(1, -(-epoch // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows), squeeze=False)
axes = axes.reshape(-1)

# x-grid used to draw the fitted line; it does not change between iterations.
xs = np.linspace(-2, 2, 1000).reshape(-1, 1)

for i in range(epoch):
    y_pred = m*X + c

    # Gradient of (1 / 2n) * sum((y - y_pred)**2) with respect to m and c.
    delta_m = -np.sum(X * (y - y_pred)) / n
    delta_c = -np.sum(y - y_pred) / n

    m = m - (alpha * delta_m)
    c = c - (alpha * delta_c)

    ax = axes[i]
    ax.scatter(df['x'], df['y'], alpha=0.5)

    # Draw the line for the parameters obtained after this update.
    ys = m*xs + c
    ax.plot(xs, ys, color='red', linewidth=4)

    # Fixed two-decimal formatting keeps the sign and magnitude readable.
    title = 'iter {}\nm {:.2f} c {:.2f}'.format(i, m, c)
    ax.set_title(title)

# Hide panels left over when epoch is not a multiple of n_cols.
for unused_ax in axes[epoch:]:
    unused_ax.axis('off')

plt.tight_layout()
print(m, c)

# Classification

In [None]:
# Use sklearn logistic regression to classify easy dataset

In [None]:
# Load the first classification dataset; later cells use its 'x', 'y' and
# 'label' columns.
easy = pd.read_csv('easy_dataset.csv')
easy.head()

In [None]:
def plot_dataset(df, ax=None):
    """Scatter the rows of ``df``, red for label 0 and green for label 1.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'x', 'y' and 'label'.
    ax : matplotlib Axes, optional
        Axes to draw on. When None, a new 10x10 inch figure is created.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 10))

    # Draw label 0 first, then label 1, each in its own colour.
    for label_value, colour in ((0, 'red'), (1, 'green')):
        subset = df[df['label'] == label_value]
        ax.scatter(x='x', y='y', data=subset, color=colour)

    return ax

In [None]:
# Plot the easy dataset, coloured by label, on a new figure.
plot_dataset(easy)

In [None]:
from sklearn.linear_model import LogisticRegression

# Unfitted classifier with default settings.
logreg = LogisticRegression()

# Features are the two coordinates; the target is the class label.
X = easy[['x', 'y']]
y = easy['label']

In [None]:
# Train the classifier on the current notebook-level features X and labels y.
logreg.fit(X=X, y=y)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy measured on the same X/y the model was fitted on.
# accuracy_score takes the true labels first, then the predictions.
prediction = logreg.predict(X)
accuracy_score(y, prediction)

In [None]:
def plot_decision_boundary(df, model=None, func=None, ax=None):
    """Plot a labelled dataset and, if a model is given, its decision regions.

    The data are drawn with ``plot_dataset``. When ``model`` is provided it is
    evaluated on a 200 x 200 grid spanning the min/max of ``df['x']`` and
    ``df['y']``, and the predictions are drawn as contours on the same axes.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'x', 'y' and 'label'.
    model : object, optional
        Anything with a ``predict`` method that accepts the grid DataFrame.
        If None, only the scatter plot is drawn.
    func : callable, optional
        Applied to the grid DataFrame (columns 'x' and 'y') before it is
        passed to ``model.predict``; its return value is what gets predicted on.
    ax : matplotlib Axes, optional
        Passed through to ``plot_dataset``.

    Returns
    -------
    The grid predictions reshaped to (200, 200), or None when ``model`` is None.
    """
    ax = plot_dataset(df, ax)

    if model is None:
        return None

    max_x = np.max(df['x'])
    max_y = np.max(df['y'])

    min_x = np.min(df['x'])
    min_y = np.min(df['y'])

    # Regular grid covering the bounding box of the data.
    n_sample = 200
    xs = np.linspace(min_x, max_x, n_sample)
    ys = np.linspace(min_y, max_y, n_sample)

    xx, yy = np.meshgrid(xs, ys)

    # One row per grid point, with the same column names as the training data.
    grid = pd.DataFrame({'x': xx.reshape(-1), 'y': yy.reshape(-1)})

    if func is not None:
        grid = func(grid)

    # Predictions are reshaped back onto the grid so they can be contoured.
    zz = model.predict(grid).reshape(n_sample, n_sample)
    ax.contour(xx, yy, zz, levels=(-1, 0, 1), linewidths=5, colors='black', alpha=0.95)
    ax.contourf(xx, yy, zz, levels=(-1, 0, 1), colors=('#ff0000', '#00ff00'), alpha=0.2)

    ax.set_xlim((min_x, max_x))
    ax.set_ylim((min_y, max_y))

    return zz

In [None]:
# Draw the easy dataset with the fitted model's decision regions; the trailing
# ';' keeps the returned prediction grid out of the cell output.
plot_decision_boundary(easy, logreg);

In [None]:
# Use sklearn logistic regression to classify medium dataset

In [None]:
# Load the medium dataset and plot its points coloured by label.
medium = pd.read_csv('medium_dataset.csv')
plot_dataset(medium)

In [None]:
# Rebind X/y to the medium dataset and fit a fresh logistic regression on the
# raw coordinates.
X = medium[['x', 'y']]
y = medium['label']

logreg = LogisticRegression()
logreg.fit(X, y)

In [None]:
# Accuracy measured on the same X/y the model was fitted on.
# accuracy_score takes the true labels first, then the predictions.
prediction = logreg.predict(X)
accuracy_score(y, prediction)

In [None]:
# Decision regions of the logistic regression fitted on the raw medium-dataset
# coordinates.
plot_decision_boundary(medium, logreg);

In [None]:
def transform(X):
    """Return a copy of ``X`` extended with squared-coordinate features.

    Parameters
    ----------
    X : DataFrame
        Must provide the columns 'x' and 'y'. It is not modified.

    Returns
    -------
    DataFrame
        A deep copy of ``X`` with two extra columns: 'x2' (``x`` squared) and
        'y2' (``y`` squared).
    """
    squares = {'x2': X['x']**2, 'y2': X['y']**2}
    return X.copy(deep=True).assign(**squares)

In [None]:
# Add the engineered columns to the features, then refit the existing logreg
# estimator on them (this replaces its previous fit).
X_transformed = transform(X)

logreg.fit(X_transformed, y)

# Accuracy on the data used for fitting; accuracy_score takes the true labels
# first, then the predictions.
prediction = logreg.predict(X_transformed)
accuracy_score(y, prediction)

In [None]:
# Decision regions of the refitted model; `transform` is passed so the plotting
# grid gets the same extra columns before prediction.
plot_decision_boundary(medium, logreg, transform);

In [None]:
# Use sklearn logistic regression to classify hard dataset

In [None]:
# Load the hard dataset and plot its points coloured by label.
hard = pd.read_csv('hard_dataset.csv')
plot_dataset(hard)

In [None]:
# Rebind X/y to the hard dataset and fit a fresh logistic regression on the
# raw coordinates.
X = hard[['x', 'y']]
y = hard['label']

logreg = LogisticRegression()
logreg.fit(X, y)

# Accuracy on the data used for fitting; accuracy_score takes the true labels
# first, then the predictions.
prediction = logreg.predict(X)
accuracy_score(y, prediction)

In [None]:
# Decision regions of the logistic regression fitted on the raw hard-dataset
# coordinates.
plot_decision_boundary(hard, logreg);

In [None]:
def transform(X):
    """Return a copy of ``X`` extended with high-power coordinate features.

    This redefines the notebook-level ``transform`` used for the medium
    dataset; from this cell on, the name refers to this version.

    Parameters
    ----------
    X : DataFrame
        Must provide the columns 'x' and 'y'. It is not modified.

    Returns
    -------
    DataFrame
        A deep copy of ``X`` with two extra columns. Despite their names,
        'x2' holds ``x`` to the 12th power and 'y2' holds ``y`` to the 45th.
    """
    powers = {'x2': X['x']**12, 'y2': X['y']**45}
    return X.copy(deep=True).assign(**powers)

# Add the engineered columns to the features, then refit the existing logreg
# estimator on them (this replaces its previous fit).
X_transformed = transform(X)

logreg.fit(X_transformed, y)

# Accuracy on the data used for fitting; accuracy_score takes the true labels
# first, then the predictions.
prediction = logreg.predict(X_transformed)
accuracy_score(y, prediction)

In [None]:
# Decision regions of the refitted model on the hard dataset. The trailing ';'
# keeps the returned 200x200 prediction grid out of the cell output, matching
# the other plot_decision_boundary calls in this notebook.
plot_decision_boundary(hard, logreg, transform);

In [None]:
# EXERCISE: manually find a transformation that lets logistic regression classify the hard dataset!

In [None]:
# Use decision tree to classify hard dataset

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default settings on the current X/y, which were
# last bound to the hard dataset's coordinates and labels.
dtc = DecisionTreeClassifier()
dtc.fit(X, y)

# Accuracy on the data used for fitting; accuracy_score takes the true labels
# first, then the predictions.
prediction = dtc.predict(X)
accuracy_score(y, prediction)

In [None]:
# Decision regions of the fitted decision tree over the hard dataset.
plot_decision_boundary(hard, dtc);

In [None]:
# Test different max_depth hyperparam

In [None]:
# Fit one entropy-criterion tree per max_depth value 1..16 and show each
# tree's decision regions in its own panel of a 4x4 grid.
fig, axes = plt.subplots(4, 4, figsize=(20, 20))

for i, ax in enumerate(axes.reshape(-1)):
    depth = i + 1

    dec = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    dec.fit(X=hard[['x', 'y']], y=hard['label'])

    plot_decision_boundary(hard, dec, ax=ax)
    ax.set_title('Maximum depth {}'.format(depth))

In [None]:
# EXERCISE

In [None]:
# Split the data into train and test sets to assess performance validly

In [None]:
# Use cross-validation to find the best hyperparameters