# HW1 - Data Exploration and Preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

## Part 1: Data Loading and First Look

## Loading the dataset

In [None]:
!rm *.csv

from google.colab import files
print("Please load the raw csv data")
uploadedData = files.upload()
filename = list(uploadedData)[0]

In [None]:
# Read the uploaded CSV into a DataFrame; `dataset` is the table the cells below work on.
dataset = pd.read_csv(filename)

In [None]:
# Q1: number of rows and columns of the dataset, as a (rows, cols) tuple
dataset.shape

In [None]:
# Q2: how often each distinct conversations_per_day value occurs
dataset.conversations_per_day.value_counts()

In [None]:
# Q3: inspect the features of the data — the column names, the value
# frequencies of PCR_10, and the first rows of the table
print(dataset.columns)
print(dataset.PCR_10.value_counts())
dataset.head()

In [None]:
# Q4: partition the data into a training set and a test set (test_size=0.2).
# random_state is fixed so the same split is produced on every run.
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=8)

## Part 2: Missing Values

In [None]:
# Q5: count the missing entries of every column, separately for each split
for split_name, split_df in (("training set", train_set), ("test set", test_set)):
    null_counts = split_df.isnull().sum()
    print(f"{split_name} null values: \n", null_counts, "\n")

In [None]:
# Q6: distribution of household_income in the training set, with a KDE curve on top
income_ax = sns.histplot(train_set.household_income, kde=True)
income_ax.grid()
income_ax.set_title(r'Histogram of household income')

In [None]:
# Q7: candidate statistics for replacing the missing household_income values
train_income = train_set.household_income
print("mean is: ", train_income.mean())
print("median is: ", train_income.median())

In [None]:
# Q7 - Task A: fill missing household_income values with the median.
# The median is computed on the training set only and reused for the test set,
# so no statistic of the test data leaks into the preprocessing.
income_median = train_set.household_income.median()
filled_train_set = train_set.fillna({"household_income": income_median})
filled_test_set = test_set.fillna({"household_income": income_median})
print("mean is: ", filled_train_set.household_income.mean())
print("median is: ", filled_train_set.household_income.median())

## Part 3: Warming up with k-Nearest Neighbors

In [None]:
# Task B: pairwise plots of three PCR features, colored by the spread label
pcr_grid = sns.pairplot(
    train_set,
    vars=["PCR_04", "PCR_07", "PCR_09"],
    hue="spread",
    palette="tab10",
    plot_kws={"s": 12},
)
pcr_grid.fig.suptitle("Pairplot of PCR_04, PCR_07 and PCR_09 according to spread", y=1.04)
for panel in pcr_grid.axes.flat:
  panel.grid(alpha=0.5)
pcr_grid.fig.set_size_inches(12, 8)

In [None]:
# Q8: the same pairplot restricted to PCR_04 and PCR_09
pcr_grid = sns.pairplot(
    train_set,
    vars=["PCR_04", "PCR_09"],
    hue="spread",
    palette="tab10",
    plot_kws={"s": 12},
)
pcr_grid.fig.suptitle("Pairplot of PCR_04 and PCR_09 according to spread", y=1.04)
for panel in pcr_grid.axes.flat:
  panel.grid(alpha=0.5)
pcr_grid.fig.set_size_inches(12, 8)

In [None]:
def visualize_clf(clf, X, Y, title,
                  xlabel, ylabel,
                  marker_size=50,
                  grid_length=300,
                  linewidths=None):
    """Plot the decision regions of a fitted binary classifier over 2-D data.

    The classifier is evaluated on a ``grid_length`` x ``grid_length`` mesh that
    spans the range of the first two columns of ``X`` (padded by 10% of the
    range on each side). The predictions are drawn as filled contours and the
    samples are scattered on top with one color/marker per class.

    Parameters
    ----------
    clf : object exposing ``predict`` for an array with two columns.
    X : DataFrame or array; columns 0 and 1 are the plotted features.
    Y : labels, either {0, 1} or {-1, 1}.
    title, xlabel, ylabel : text for the figure title and the axis labels.
    marker_size : value passed as ``s`` to ``plt.scatter``.
    grid_length : number of mesh points along each axis.
    linewidths : forwarded to ``plt.scatter``.

    Raises
    ------
    AssertionError
        If ``Y`` does not hold exactly two distinct labels, or the labels are
        neither {0, 1} nor {-1, 1}.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    from matplotlib.colors import ListedColormap

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()

    # For internal use here, make sure labels are 0 and 1
    Y = np.ravel(Y).astype(int)
    labels = set(Y)
    assert len(labels) == 2, "Can only visualize two unique labels"

    if labels == set([-1, 1]):
        # Map -1 -> 0 and 1 -> 1
        Y = (Y + 1) // 2
        labels = set(Y)

    assert labels == set([0, 1]), "Could not handle given labels"

    plt.figure(figsize=(8, 8))

    # Parameters
    markers = ["D", "o"]
    palette = sns.color_palette("hls", 2)
    custom_cmap = ListedColormap(palette.as_hex())

    # Bounds of the evaluation grid: data range plus a 10% margin per side
    x_lo, x_hi = X[:, 0].min(), X[:, 0].max()
    y_lo, y_hi = X[:, 1].min(), X[:, 1].max()
    x_delta = np.abs(x_hi - x_lo) * 0.1
    y_delta = np.abs(y_hi - y_lo) * 0.1
    x_min, x_max = x_lo - x_delta, x_hi + x_delta
    y_min, y_max = y_lo - y_delta, y_hi + y_delta
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, grid_length),
                         np.linspace(y_min, y_max, grid_length))
    # NOTE(review): this runs before any axes are drawn — confirm it has the
    # intended effect on the final layout.
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

    # Classify every grid point and reshape back to the mesh for contouring
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=custom_cmap, alpha=0.35)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(alpha=0.75)

    # Plot the training points. Iterate the labels in sorted order so that the
    # label -> (color, marker) pairing does not depend on set iteration order.
    for label, color, marker in zip(sorted(labels), palette, markers):
        mask = Y == label
        plt.scatter(X[mask, 0], X[mask, 1], color=color,
                    marker=marker,
                    edgecolor='white', s=marker_size,
                    linewidths=linewidths)

    plt.title(title, fontsize=20)
    plt.axis("tight")
    plt.show()


In [None]:
# Task C: k-NN implementation
from sklearn.base import BaseEstimator, ClassifierMixin
import scipy as sp

class kNN(BaseEstimator, ClassifierMixin):
  """k-nearest-neighbours classifier for labels in {-1, +1}.

  A sample is predicted as the sign of the sum of the labels of its
  ``n_neighbors`` closest training samples (distances computed with scipy's
  ``cdist`` and its default metric). A zero sum (tie) is predicted as +1.
  """

  def __init__(self, n_neighbors: int = 3):
    self.n_neighbors = n_neighbors

  def fit(self, X, y):
    """Store the training samples ``X`` and their labels ``y``; returns self."""
    self.X_train = X
    self.Y_train = y
    return self

  def predict(self, X):
    """Return a -1/+1 prediction for every row of ``X``.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is smaller than 1 or larger than the number of
        stored training samples.
    """
    # Import the function directly: `scipy.spatial` is a submodule and is not
    # guaranteed to be loaded by a bare `import scipy`.
    from scipy.spatial.distance import cdist

    k = self.n_neighbors
    train_labels = np.asarray(self.Y_train)
    n_train = train_labels.shape[0]
    if not 1 <= k <= n_train:
      raise ValueError(
          f"n_neighbors must be between 1 and the number of training samples "
          f"({n_train}), got {k}")

    distances = cdist(X, self.X_train)
    # Partitioning around position k-1 puts the k smallest distances of each row
    # in the first k columns. Using k itself would be out of bounds when
    # k equals the number of training samples.
    neighbor_ind = np.argpartition(distances, k - 1, axis=1)[:, :k]
    votes = np.sum(train_labels[neighbor_ind], axis=1)
    # Negative vote sum -> -1, otherwise (including ties) -> +1
    predictions = np.where(votes < 0, -1, 1).astype(votes.dtype)
    return predictions

In [None]:
# Task D: fit a 1-NN model on two raw (not normalized) features and plot its decision regions
tmp = train_set[["PCR_04", "PCR_09", "spread"]]
clf = kNN(1)
X = tmp[["PCR_04", "PCR_09"]].values
y = tmp.spread.values
clf.fit(X,y)
visualize_clf(clf, X, y, "1-NN Model - Decision regions", "PCR_04", "PCR_09")

# Same feature / label extraction for the test split
tmp_test = test_set[["PCR_04", "PCR_09", "spread"]]
X_test = tmp_test[["PCR_04", "PCR_09"]].values
y_test = tmp_test.spread.values

# kNN defines no score() of its own; the one called here comes from its base classes.
print(f"Trainig accurecy: {clf.score(X, y): .3f}", )
print(f"Test accurecy: {clf.score(X_test, y_test): .3f}")

In [None]:
# Q11 - Normalization using min-max scaling
from sklearn.preprocessing import MinMaxScaler
# y (spread) is already min-max normalized, so there is no need to normalize it again.
y_normed = tmp.spread.values
scaler = MinMaxScaler((-1, 1))
X = tmp[["PCR_04", "PCR_09"]].values
# The scaling parameters are learned from the training data only.
X_normed = scaler.fit_transform(X)

clf.fit(X_normed,y_normed)
visualize_clf(clf, X_normed, y_normed, "1-NN Model with normelaized data - Decision regions", "PCR_04", "PCR_09")

tmp_test = test_set[["PCR_04", "PCR_09", "spread"]]
y_test_normed = tmp_test.spread.values

X_test = tmp_test[["PCR_04", "PCR_09"]].values
# Apply the training-set scaling to the test data. Calling fit_transform here
# would re-fit the scaler on the test set, so train and test points would be
# mapped with different parameters and the test set would leak into preprocessing.
X_test_normed = scaler.transform(X_test)

print(f"Trainig accurecy: {clf.score(X_normed,y_normed)}")
print(f"Test accurecy: {clf.score(X_test_normed, y_test_normed)}")

In [None]:
# Q12: 5-NN on the normalized features
clf = kNN(5)
clf.fit(X_normed,y_normed)
visualize_clf(clf, X_normed, y_normed, "5-NN Model with normelaized data - Decision regions", "PCR_04", "PCR_09")


y_test_normed = tmp_test.spread.values

# Only transform the test features: re-fitting the scaler here (fit_transform)
# would replace its parameters with ones computed from the test set.
X_test_normed = scaler.transform(X_test)

print(f"Trainig accurecy: {clf.score(X_normed,y_normed)}")
print(f"Test accurecy: {clf.score(X_test_normed, y_test_normed)}")

## Part 4: Data exploration

In [None]:
# Task E: add a boolean SpecialProperty column that is True for blood types O+ and B+.
# assign() builds a new frame instead of writing into the frame returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
train_set = train_set.assign(SpecialProperty=train_set['blood_type'].isin(["O+", "B+"]))
train_set.head()

In [None]:
# Drop blood_type by reassigning rather than mutating in place, and ignore an
# already-missing column so that re-running this cell does not raise a KeyError.
train_set = train_set.drop(columns="blood_type", errors="ignore")
train_set.head()

In [None]:
# List the current columns of the training set
train_set.columns

In [None]:
# One histogram row per feature: left panel colored by risk, right panel by spread.
# The two targets and the columns listed in NON_NUMERIC_VALUE_COLS are excluded.
NON_NUMERIC_VALUE_COLS = ["sex", "current_location", "pcr_date", "SpecialProperty"]
COL_NAME = [col for col in train_set.columns if col not in ["spread", "risk"] and col not in NON_NUMERIC_VALUE_COLS]
COLS, ROWS = (2, len(COL_NAME))
plt.figure(figsize=(5 * COLS, 4 * ROWS))

for row, column in enumerate(COL_NAME):
  # Float columns get a KDE curve and layered bars; other columns get dodged bars.
  # This depends only on the column, so it is computed once per row.
  isContinuous = "float" in train_set[column].dtype.name

  for j, cls in enumerate(["risk", "spread"]):
    plt.subplot(ROWS, COLS, row * COLS + 1 + j)

    sns.histplot(data=train_set, x=column, hue=cls, line_kws={"linewidth": 3}, kde=isContinuous, multiple="layer" if isContinuous else "dodge")
    plt.grid(alpha=0.5)
    plt.title(f"Patients {cls} by {column}")

# Lay out the full grid once, after every subplot exists, instead of
# recomputing the layout after each row.
plt.tight_layout()

Q16

In [None]:
# Training rows whose SpecialProperty flag is set
has_special_property = train_set["SpecialProperty"] == True
special_prop_data = train_set.loc[has_special_property]
special_prop_data

In [None]:
# Pairwise plots of four PCR features for the special-property rows, colored by risk
risk_pair_cols = ["PCR_01", "PCR_03", "PCR_05", "PCR_10", "risk"]
sns.pairplot(special_prop_data[risk_pair_cols], hue="risk", palette="tab10", plot_kws={"s": 3})

In [None]:
# Training rows whose SpecialProperty flag is not set
lacks_special_property = train_set["SpecialProperty"] == False
non_special_prop_data = train_set.loc[lacks_special_property]
non_special_prop_data

In [None]:
# Pairwise plots of four PCR features for the non-special-property rows, colored by risk
risk_pair_cols = ["PCR_01", "PCR_03", "PCR_05", "PCR_10", "risk"]
sns.pairplot(non_special_prop_data[risk_pair_cols], hue="risk", palette="tab10", plot_kws={"s": 3})

In [None]:
# Q17: joint plot of PCR_03 against PCR_10 for the special-property rows, colored by risk
joint_grid = sns.jointplot(
    data=special_prop_data,
    x="PCR_03",
    y="PCR_10",
    hue="risk",
    palette="tab10",
)
joint_fig = joint_grid.fig
joint_fig.suptitle("PCR_03 vs. PCR_10 - Special property boold type")
joint_fig.subplots_adjust(top=0.95)  # leave room for the title
joint_grid.ax_joint.grid(alpha=0.5)

In [None]:
# Joint plot of PCR_03 against PCR_10 for the non-special-property rows, colored by risk
joint_grid = sns.jointplot(
    data=non_special_prop_data,
    x="PCR_03",
    y="PCR_10",
    hue="risk",
    palette="tab10",
)
joint_fig = joint_grid.fig
joint_fig.suptitle("PCR_03 vs. PCR_10 - Non special property boold type")
joint_fig.subplots_adjust(top=0.95)  # leave room for the title
joint_grid.ax_joint.grid(alpha=0.5)

In [None]:
# Joint plot of PCR_03 against PCR_10 for the whole training set, colored by risk
joint_grid = sns.jointplot(
    data=train_set,
    x="PCR_03",
    y="PCR_10",
    hue="risk",
    palette="tab10",
)
joint_fig = joint_grid.fig
joint_fig.suptitle("PCR_03 vs. PCR_10 - All boold types")
joint_fig.subplots_adjust(top=0.95)  # leave room for the title
joint_grid.ax_joint.grid(alpha=0.5)

In [None]:
def plot3d(df, colX, colY, colZ, title, hue=None, s=1):
  """Draw the same 3-D scatter of three columns from four viewing angles.

  Four 3-D subplots are placed side by side; subplot ``i`` is viewed at
  elevation 5 and azimuth ``45*i + 15``.

  Parameters
  ----------
  df : DataFrame holding the plotted columns.
  colX, colY, colZ : names of the columns mapped to the x, y and z axes.
  title : figure title.
  hue : optional column name. Rows where ``df[hue] == True`` are drawn in the
      first palette color and all other rows in the second; with ``None``
      every row uses the first color.
  s : marker size passed to ``scatter3D``.
  """
  # The backend is left to the notebook: an IPython line magic is not valid
  # Python inside a function body and does not need to run on every call.
  fig = plt.figure(figsize=(20, 4))
  axes = [fig.add_subplot(1, 4, pos, projection='3d') for pos in range(1, 5)]

  palette = sns.color_palette("hls", 2)

  # Split the rows once; the same groups are drawn in each of the four views.
  if hue is None:
    groups = [(df, palette[0])]
  else:
    idx = df[hue] == True
    groups = [(df[idx], palette[0]), (df[~idx], palette[1])]

  for i, ax in enumerate(axes):
    for part, color in groups:
      ax.scatter3D(part[colX], part[colY], part[colZ], color=color, s=s)

    # Only the second and fourth views carry x/y labels; the others drop their
    # x/y tick labels.
    if i == 3 or i == 1:
      ax.set_xlabel(colX, labelpad=20)
      ax.set_ylabel(colY, labelpad=20)
    else:
      ax.xaxis.set_ticklabels([])
      ax.yaxis.set_ticklabels([])

    ax.view_init(elev=5., azim=45*i+15)

  axes[1].set_zlabel("\n" + colZ, labelpad=1)

  plt.suptitle(title)
  plt.subplots_adjust(wspace=0.1, top=0.99)
  plt.show()

In [None]:
# 3-D view of PCR_03 (x), SpecialProperty (y) and PCR_10 (z) on the training set, split by risk
plot3d(train_set, "PCR_03", "SpecialProperty", "PCR_10", title="Patients risk by PCR_03, PCR_10 and Special Property blood type", hue="risk", s=5)

# How to download from Colab?

In [None]:
from google.colab import files
outputPath = "saved_file.csv"
# Write the `dataset` frame to a CSV file, then hand that file to Colab's download helper.
dataset.to_csv(outputPath)
files.download(outputPath)