# Clustering and Regression

In [None]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LinearRegression
from sklearn.cluster import DBSCAN

%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Colour the Clusters
def set_colors(labels, colors = "rgbykcm"):
    """Map every cluster label to one colour code taken from ``colors``.

    In: labels = iterable of integer cluster labels
        colors = indexable sequence of colour codes (default: seven
                 single-letter codes)
    Out: list holding one colour per given label, in the same order

    The label is wrapped modulo ``len(colors)``. Labels that fit inside
    ``colors`` (including negative ones such as -1, which select from the
    end just like plain negative indexing) get the same colour as with
    direct indexing; larger labels reuse colours cyclically instead of
    raising IndexError.
    """
    n_colors = len(colors)
    return [colors[label % n_colors] for label in labels]

Now we generate some sample data.

In [None]:
## Create some synthetic data
from scipy.stats import multivariate_normal

np.random.seed(100)

# The three clusters share one covariance and differ only in their centre.
# NOTE(review): this covariance literal is not symmetric (0.5 above the
# diagonal, 0.0 below) - confirm which off-diagonal value was intended.
shared_cov = [[0.5, 0.5], [0.0, 0.1]]
cluster_centres = [[0, 0], [1, 5], [2, 10]]
samples_per_cluster = 150

data = []
for centre in cluster_centres:
    dist = multivariate_normal(mean = centre, cov = shared_cov)
    # One rvs() call per sample: each call returns a single (x1, x2) draw.
    data.extend(dist.rvs() for _ in range(samples_per_cluster))

# Peek at the first few samples (they all come from the first cluster).
print(data[0:5])

# Collect the samples in a two-column dataframe
df = pd.DataFrame(data, columns = ["x1", "x2"])
df.head()

In [None]:
# Scatter the raw samples: x1 on the horizontal axis, x2 on the vertical.
x1_values, x2_values = df["x1"], df["x2"]
plt.scatter(x1_values, x2_values)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

## Find Clusters

In [None]:
# Cluster the samples with DBSCAN.
# Clustering is UNsupervised: only the two input variables are handed to
# fit(), there is no known output (y).
feature_names = ["x1", "x2"]
X = df[feature_names]
estimator = DBSCAN(eps = 0.8, min_samples = 10)
estimator.fit(X)

In [None]:
# The fitted estimator exposes one cluster label per row in labels_.
# Store the labels on the frame, then derive a plotting colour for each row.
cluster_labels = estimator.labels_
df["label"] = cluster_labels
df["color"] = set_colors(df["label"])
df.head()

In [None]:
# One scatter call per cluster so that every cluster gets its own legend entry.
for cluster_id, members in df.groupby("label"):
    legend_text = "Cluster %d" % cluster_id
    plt.scatter(members["x1"], members["x2"], c = members["color"], label = legend_text)
plt.xlabel("x1")
plt.ylabel("x2")
plt.legend(loc = "upper left")
plt.show()

## Add Cluster Labels back to the Data Frame and Fit a Linear Model

In [None]:
# Turn the cluster label into indicator columns prefixed with "cluster"
# and append them to the frame as extra columns.
cluster_indicators = pd.get_dummies(df["label"], prefix = "cluster")
df = pd.concat([df, cluster_indicators], axis = 1)
df.head()

In [None]:
# Regress x2 on x1 together with the three cluster indicator columns.
predictor_names = ["x1", "cluster_0", "cluster_1", "cluster_2"]
X = df[predictor_names]
y = df["x2"]

model = LinearRegression()
model.fit(X, y)

# The score is computed on the same rows the model was fitted to.
fit_score = model.score(X, y)
print(fit_score)

In [None]:
# Plot the model
# Observed samples first, coloured by cluster ...
plt.scatter(df["x1"], df["x2"], c = df["color"])
# ... then the model's predictions for the same rows, in black.
fitted_values = model.predict(X)
plt.scatter(df["x1"], fitted_values, color = "black")
plt.xlabel("x")
plt.ylabel("y")
plt.show()

## Another example

In [None]:
## Create some synthetic data
from scipy.stats import multivariate_normal

np.random.seed(200)

# (mean, covariance) for each of the three clusters; unlike the first
# example, every cluster has its own covariance.
# NOTE(review): none of these covariance literals is symmetric - confirm
# which off-diagonal values were intended.
cluster_params = [
    ([0, 0], [[0.1, 0.5], [0.0, 0.2]]),
    ([1, 5], [[0.6, 0.0], [0.2, 0.1]]),
    ([2, 10], [[0.5, 0.5], [0.0, 0.1]]),
]
samples_per_cluster = 150

data = []
for mean, cov in cluster_params:
    dist = multivariate_normal(mean = mean, cov = cov)
    # Exactly one draw per stored row; no sample is drawn and thrown away.
    for _ in range(samples_per_cluster):
        data.append(dist.rvs())

# Collect the samples in a two-column dataframe
df = pd.DataFrame(data, columns = ["x1", "x2"])
df.head()

In [None]:
# Scatter the raw samples: x1 on the horizontal axis, x2 on the vertical.
x1_values, x2_values = df["x1"], df["x2"]
plt.scatter(x1_values, x2_values)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

In [None]:
# Cluster the new samples with DBSCAN, using only the two input columns.
feature_names = ["x1", "x2"]
X = df[feature_names]
estimator = DBSCAN(eps = 0.8, min_samples = 10)
estimator.fit(X)

In [None]:
# The fitted estimator exposes one cluster label per row in labels_.
# Store the labels on the frame, then derive a plotting colour for each row.
cluster_labels = estimator.labels_
df["label"] = cluster_labels
df["color"] = set_colors(df["label"])
df.head()

In [None]:
# One scatter call per cluster so that every cluster gets its own legend entry.
for cluster_id, members in df.groupby("label"):
    legend_text = "Cluster %d" % cluster_id
    plt.scatter(members["x1"], members["x2"], c = members["color"], label = legend_text)
plt.xlabel("x1")
plt.ylabel("x2")
plt.legend(loc = "upper left")
plt.show()

## Modeling
This time we have to fit a separate model to each cluster, because the clusters are no longer the same shape shifted by an offset.

In [None]:
from collections import Counter

# Count how many rows were assigned to each cluster label.
label_column = df["label"]
counts = Counter(label_column)
print(counts)

In [None]:
# Plot the raw data
plt.scatter(df["x1"], df["x2"], c = df["color"])
plt.xlabel("x1")
plt.ylabel("x2")

# Fit a model to each cluster
# models maps a cluster label to the regression fitted on that cluster's rows.
models = dict()
for label in set(df["label"]):
    # Only labels with more than 10 rows get a model.
    if counts[label] <= 10:
        continue
    subdf = df[df["label"] == label]
    X = subdf[["x1"]]
    y = subdf[["x2"]]
    model = LinearRegression()
    model.fit(X, y)
    # Key by the label value (not a fixed string) so that every cluster
    # keeps its own entry instead of overwriting a single shared one.
    models[label] = model
    # Overlay this cluster's fitted values in black.
    plt.scatter(X, model.predict(X), color = "black")
plt.show()