In [14]:
import scipy
import sklearn
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import multivariate_normal
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

pd.options.plotting.backend = "plotly"

# Qeustion 1

Loading dataset

In [2]:
heart_df = pd.read_csv('data/heart.csv')
train, test = train_test_split(heart_df, train_size=0.8, test_size=0.2)

Define continuous and discrete features

In [3]:
CONTINUOUS_FEATURES = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
DISCRETE_FEATURES = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
EPSILON = 0.001

## Part A

Defining naive bayes classifier

In [4]:
class DependantVariableNaiveBayesClassifier:
    def __init__(self, dataset, train, continuous_features=CONTINUOUS_FEATURES, discrete_features=DISCRETE_FEATURES):
        self.continuous_features = continuous_features
        self.discrete_features = discrete_features
        # Ppos: Probability of positive class occurrence,
        # Pneg: Probability of negative class occurrence
        plus_count, neg_count = train['target'].value_counts()
        self.Ppos = plus_count / len(train['target'])
        self.Pneg = neg_count / len(train['target'])

        # Conditional Probability for discrete features
        self.P_disc = {'+': dict(), '-': dict()}
        for feature in self.discrete_features:
            self.P_disc['+'][feature] = dict()
            self.P_disc['-'][feature] = dict()
            for value in pd.unique(dataset[feature]):
                self.P_disc['+'][feature][value] = \
                    train[(train['target'] == 1) & (train[feature] == value)][feature].count() / plus_count
                self.P_disc['-'][feature][value] = \
                    train[(train['target'] == 0) & (train[feature] == value)][feature].count() / neg_count

        # Conditional Probability for continuous features
        self.P_cont = {
            '+': multivariate_normal(
                mean=train[self.continuous_features][(train['target'] == 1)].mean(),
                cov=train[self.continuous_features][(train['target'] == 1)].cov()),
            '-': multivariate_normal(
                mean=train[self.continuous_features][(train['target'] == 0)].mean(),
                cov=train[self.continuous_features][(train['target'] == 0)].cov())
            }

    def predict(self, sample):
        # Initiazling probability based on prior probability
        pos_probability = self.Ppos
        neg_probability = self.Pneg

        # Applying probability of continuous features
        pos_probability = self.P_cont['+'].pdf(sample[self.continuous_features])
        neg_probability = self.P_cont['-'].pdf(sample[self.continuous_features])

        # Applying probability of discrete features
        for (feature, value) in sample.items():
            if feature in self.discrete_features:
                pos_probability *= self.P_disc['+'][feature][value] * EPSILON
                neg_probability *= self.P_disc['-'][feature][value] * EPSILON

        return int(pos_probability > neg_probability)

Learning the model

In [5]:
dependant_nbc = DependantVariableNaiveBayesClassifier(heart_df, train)

Testing trained model.

In [6]:
y_true = train['target']
y_pred = list()
for _, sample in train.iterrows():
    y_pred.append(dependant_nbc.predict(sample))

print("Metrics based on Train Dataset")
pd.DataFrame(
    {
        'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
        'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
        'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
        'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
    }
)

Metrics based on Train Dataset


Unnamed: 0,Precision,Recall,F1,Accuracy
0,0.855814,0.87619,0.865882,0.860976


In [7]:
y_true = test['target']
y_pred = list()
for _, sample in test.iterrows():
    y_pred.append(dependant_nbc.predict(sample))

print("Metrics based on Test Dataset")
pd.DataFrame(
    {
        'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
        'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
        'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
        'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
    }
)

Metrics based on Test Dataset


Unnamed: 0,Precision,Recall,F1,Accuracy
0,0.845455,0.869159,0.857143,0.84878


## Part B

In [7]:
class IndependantVariableNaiveBayesClassifier:
    def __init__(self, dataset, train, contiuous_features=CONTINUOUS_FEATURES, discrete_features=DISCRETE_FEATURES):
        self.continuous_features = contiuous_features
        self.discrete_features = discrete_features
        # Ppos: Probability of positive class occurrence,
        # Pneg: Probability of negative class occurrence
        plus_count, neg_count = train['target'].value_counts()
        self.Ppos = plus_count / len(train['target'])
        self.Pneg = neg_count / len(train['target'])

        # Conditional Probability for discrete features
        self.P_disc = {'+': dict(), '-': dict()}
        for feature in self.discrete_features:
            self.P_disc['+'][feature] = dict()
            self.P_disc['-'][feature] = dict()
            for value in pd.unique(dataset[feature]):
                self.P_disc['+'][feature][value] = \
                    train[(train['target'] == 1) & (train[feature] == value)][feature].count() / plus_count
                self.P_disc['-'][feature][value] = \
                    train[(train['target'] == 0) & (train[feature] == value)][feature].count() / neg_count

        # Conditional Probability for continuous features
        self.P_cont = {'+': dict(), '-': dict()}
        for feature in self.continuous_features:
            self.P_cont['+'][feature] = \
                scipy.stats.norm(loc=train[(train['target'] == 1)][feature].mean(),
                                 scale=train[(train['target'] == 1)][feature].cov(train[(train['target'] == 1)][feature]))
            self.P_cont['-'][feature] = \
                scipy.stats.norm(loc=train[(train['target'] == 0)][feature].mean(),
                                 scale=train[(train['target'] == 0)][feature].cov(train[(train['target'] == 0)][feature]))

    def predict(self, sample):
        # Initiazling probability based on prior probability
        pos_probability = self.Ppos
        neg_probability = self.Pneg

        for (feature, value) in sample.items():
            # Applying probability of discrete features
            if feature in self.discrete_features:
                pos_probability *= self.P_disc['+'][feature][value]
                neg_probability *= self.P_disc['-'][feature][value]
            # Applying probability of continuous features
            elif feature in self.continuous_features:
                pos_probability *= self.P_cont['+'][feature].pdf(value) * EPSILON
                neg_probability *= self.P_cont['-'][feature].pdf(value) * EPSILON

        return int(pos_probability > neg_probability)

Learning the model

In [8]:
independant_nbc = IndependantVariableNaiveBayesClassifier(heart_df, train)

Testing trained model.

In [9]:
y_true = train['target']
y_pred = list()
for _, sample in train.iterrows():
    y_pred.append(independant_nbc.predict(sample))

print("Metrics based on Train Dataset")
pd.DataFrame(
    {
        'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
        'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
        'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
        'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
    }
)

Metrics based on Train Dataset


Unnamed: 0,Precision,Recall,F1,Accuracy
0,0.824834,0.885714,0.854191,0.845122


In [10]:
y_true = test['target']
y_pred = list()
for _, sample in test.iterrows():
    y_pred.append(independant_nbc.predict(sample))

print("Metrics based on Test Dataset")
pd.DataFrame(
    {
        'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
        'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
        'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
        'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
    }
)

Metrics based on Test Dataset


Unnamed: 0,Precision,Recall,F1,Accuracy
0,0.807018,0.867925,0.836364,0.82439


## Part C

Removing the `chol` feature and training based on this feature

In [11]:
dependant_nbc = DependantVariableNaiveBayesClassifier(heart_df, train,continuous_features=['age', 'trestbps', 'thalach', 'oldpeak'])
y_true = train['target']
y_pred = list()
for _, sample in train.iterrows():
    y_pred.append(dependant_nbc.predict(sample))

print("Metrics for Train Dataset")
print(
    pd.DataFrame(
        {
            'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
            'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
            'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
            'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
        }
    )
)

y_true = test['target']
y_pred = list()
for _, sample in test.iterrows():
    y_pred.append(dependant_nbc.predict(sample))

print("Metrics for Test Dataset")
print(
    pd.DataFrame(
        {
            'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
            'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
            'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
            'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
        }
    )
)

Metrics for Train Dataset
   Precision    Recall        F1  Accuracy
0   0.853147  0.871429  0.862191  0.857317
Metrics for Test Dataset
   Precision    Recall        F1  Accuracy
0   0.842593  0.858491  0.850467  0.843902


Removing the `oldpeak` feature and training based on this feature

In [13]:
dependant_nbc = DependantVariableNaiveBayesClassifier(heart_df, train,continuous_features=['age', 'trestbps', 'chol', 'thalach'])
y_true = train['target']
y_pred = list()
for _, sample in train.iterrows():
    y_pred.append(dependant_nbc.predict(sample))

print("Metrics for Train Dataset")
print(
    pd.DataFrame(
        {
            'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
            'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
            'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
            'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
        }
    )
)

y_true = test['target']
y_pred = list()
for _, sample in test.iterrows():
    y_pred.append(dependant_nbc.predict(sample))

print("Metrics for Test Dataset")
print(
    pd.DataFrame(
        {
            'Precision': [sklearn.metrics.precision_score(y_true, y_pred)],
            'Recall': [sklearn.metrics.recall_score(y_true, y_pred)],
            'F1': [sklearn.metrics.f1_score(y_true, y_pred)],
            'Accuracy': [sklearn.metrics.accuracy_score(y_true, y_pred)]
        }
    )
)

Metrics for Train Dataset
   Precision  Recall        F1  Accuracy
0   0.858173    0.85  0.854067   0.85122
Metrics for Test Dataset
   Precision    Recall        F1  Accuracy
0   0.833333  0.801887  0.817308  0.814634


# Question 2

Loadind from file

In [15]:
train = pd.read_csv('data/titanic/train.csv')
test = pd.read_csv('data/titanic/test.csv')

## Part A

Checking missing values in Train and Test dataset

In [16]:
pd.concat(
    [
        train.isnull().sum().rename('Train Quantity'), 
        train.isnull().sum().rename('Train Percent') / len(train),
        test.isnull().sum().rename('Test Quantity'),
        test.isnull().sum().rename('Test Percent') / len(test)
    ], 
    axis=1)

Unnamed: 0,Train Quantity,Train Percent,Test Quantity,Test Percent
PassengerId,0,0.0,0.0,0.0
Survived,0,0.0,,
Pclass,0,0.0,0.0,0.0
Name,0,0.0,0.0,0.0
Sex,0,0.0,0.0,0.0
Age,177,0.198653,86.0,0.205742
SibSp,0,0.0,0.0,0.0
Parch,0,0.0,0.0,0.0
Ticket,0,0.0,0.0,0.0
Fare,0,0.0,1.0,0.002392


We do following Cleanups:

    - Removing `Cabin` column because most of the rows are missing.
    - Filling `Age` missing column with mean of the known values.
    - Filling `Fare` missing column with mean of the known values.
    - Filling `Embarked` with mode of the known values

In [17]:
train = train.drop(['Cabin'], axis=1)
train['Age'].fillna(train['Age'].mean(), inplace=True)
train['Embarked'].fillna(train['Embarked'].mode().iloc[0], inplace=True)

test = test.drop(['Cabin'], axis=1)
test['Age'].fillna(test['Age'].mean(), inplace=True)
test['Fare'].fillna(test['Fare'].mean(), inplace=True)

## Part B

Dropping unneccessary columns and training the DecisionTreeClassifier

In [18]:
x_train = train.loc[:, (train.columns!='Survived') & (train.columns!='PassengerId') & (train.columns!='Name') & (train.columns!='Ticket')]
x_train.loc[:,'Sex']=x_train['Sex'].map({'male':0, 'female': 1})
x_train.loc[:,'Embarked']=x_train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
y_train = train.loc[:, train.columns=='Survived']
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
predictions = clf.predict(x_train).astype('bool')
np.savetxt("train_predictions.csv", predictions, delimiter=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## Part C

Clearing up the test dataset and predicting the survivals.

In [19]:
x_test = test.loc[:, (test.columns!='Survived') & (test.columns!='PassengerId') & (test.columns!='Name') & (test.columns!='Ticket')]
x_test.loc[:,'Sex']=x_test['Sex'].map({'male':0, 'female': 1})
x_test.loc[:,'Embarked']=x_test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
predictions = clf.predict(x_test).astype('bool')
np.savetxt("test_predictions.csv", predictions, delimiter=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


# Question 3

Loading the dataset

In [44]:
iris = pd.read_csv('data/iris.csv')

Implementing of KNN algorithm.

In [55]:
def KNN(k, train, test):
    confusion_matrix = {
        'Iris-setosa': {
            'Iris-setosa': 0,
            'Iris-versicolor': 0,
            'Iris-virginica': 0
        },
        'Iris-versicolor': {
            'Iris-setosa': 0,
            'Iris-versicolor': 0,
            'Iris-virginica': 0
        },
        'Iris-virginica': {
            'Iris-setosa': 0,
            'Iris-versicolor': 0,
            'Iris-virginica': 0
        }
    }

    x_train = train.loc[:, train.columns != 'class'].to_numpy()  # Train dataset features
    y_train = train['class']  # Train dataset target
    for _, p in test.iterrows():
        class_p = p['class']
        x_p = p.drop('class').to_numpy('float')
        # Find distance from the point p to other points
        distances = np.linalg.norm(x_train - x_p, axis=1)
        # Sort distances and get corresponding indices
        top_nearest_neighbors = np.argsort(distances)
        # Find class of the top nearest neighbors
        label_of_top_nearst_neighbors = np.take(y_train, top_nearest_neighbors)
        # Select top K neighbors and find the mode of the lables
        knn_predicted_class = scipy.stats.mode(label_of_top_nearst_neighbors[:k])[0][0]
        confusion_matrix[class_p][knn_predicted_class] += 1
    return confusion_matrix


# Part A

Implementing 10-fold cross validation to determine best value for K in KNN.

In [56]:
folds_number = 10
dataset_length = len(iris)

# Making folds
step = dataset_length//folds_number
folds = [iris[i*step:(i+1)*step] for i in range(folds_number)]

# Determining error on train and test data
accuracy = {"k": list(), "accuracy": list()}
for k in range(1, 30, 2):
    fold_mean_accuracy = 0
    for i, fold in enumerate(folds):
        # Specify train dataset
        train = pd.concat(folds[:i] + folds[i+1:]).reset_index(drop=True)
        # Specify test dataset
        test = folds[i]
        confusion_matrix = KNN(k,train,test)
        fold_accuracy = (confusion_matrix['Iris-setosa']['Iris-setosa'] + \
                         confusion_matrix['Iris-versicolor']['Iris-versicolor'] + \
                         confusion_matrix['Iris-virginica']['Iris-virginica']) / len(test)
        fold_mean_accuracy += (fold_accuracy/folds_number)
    accuracy["k"].append(k)
    accuracy["accuracy"].append(round(fold_mean_accuracy,4))

In [57]:
fig = px.line(pd.DataFrame(accuracy), x='k', y='accuracy', text="accuracy",
        title='Check Impact of K in KNN', markers=True,
        template=dict(layout=go.Layout(xaxis=dict(dtick=2))))
fig.show()

## Part B

In [58]:
train, test = train_test_split(iris, train_size=0.8, test_size=0.2)

In [61]:
pd.DataFrame(KNN(1,train,train))

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,38,0,0
Iris-versicolor,0,42,0
Iris-virginica,0,0,40


In [60]:
pd.DataFrame(KNN(1,train,test))

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,12,0,0
Iris-versicolor,0,8,1
Iris-virginica,0,0,9
