In [71]:
import scipy
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import multivariate_normal
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

pd.options.plotting.backend = "plotly"

# Qeustion 1

Loading dataset

In [38]:
heart_df = pd.read_csv('data/heart.csv')
train, test = train_test_split(heart_df, train_size=0.8, test_size=0.2)

## Part A

Training naive bayes classifier

In [6]:
y_train = train['target']
y_test = test['target']
continuous_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
discrete_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [36]:
class PartaNaiveBayesClassifier:
    def __init__(self, train):
        self.positive_class_quantity, self.negative_class_quantity = train['target'].value_counts()
        self.positive_class_probability, self.negative_class_sample = self.positive_class_quantity/ len(train['target']), self.negative_class_quantity / len(train['target'])
        self.P = dict()
        for feature in discrete_features:
            self.P[feature] = dict()
            for value in pd.unique(heart_df[feature]):
                self.P[feature][value] = dict()
                self.P[feature][value]['positive'] = train[(train['target'] == 1) & (train[feature] == value)] / self.positive_class_quantity
                self.P[feature][value]['negative'] = train[(train['target'] == 0) & (train[feature] == value)] / self.negative_class_quantity

        continuous_features_mean = train[continuous_features].mean()
        continuous_featuers_cov = train[continuous_features].cov()
        self.continuous_features_probability = multivariate_normal(continuous_features_mean, continuous_featuers_cov)

    def predict(self, sample):
        positive_class = self.continuous_features_probability.pdf(sample[continuous_features])
        negative_class = self.continuous_features_probability.pdf(sample[continuous_features])
        for (feature, value) in sample.items():
            if feature in discrete_features:
                positive_class *= self.P[feature][value]['positive']
                negative_class *= self.P[feature][value]['positive']
        return int(positive_class > negative_class)

IndentationError: expected an indented block (33307734.py, line 3)

In [None]:
number_of_true_predicted = 0
total_number_of_instances = len(test)
for sample in test:
    y_real = sample['target']
    y_predicted = PartaNaiveBayesClassifier.naive_bayes_classifier(sample)
    if y_real == y_predicted:
        number_of_true_predicted += 1

# Question 2

Loadind from file

In [62]:
train = pd.read_csv('data/titanic/train.csv')
test = pd.read_csv('data/titanic/test.csv')

## Part A

In [63]:
pd.concat(
    [
        train.isnull().sum().rename('Train Quantity'), 
        train.isnull().sum().rename('Train Percent') / len(train),
        test.isnull().sum().rename('Test Quantity'),
        test.isnull().sum().rename('Test Percent') / len(test)
    ], 
    axis=1)

Unnamed: 0,Train Quantity,Train Percent,Test Quantity,Test Percent
PassengerId,0,0.0,0.0,0.0
Survived,0,0.0,,
Pclass,0,0.0,0.0,0.0
Name,0,0.0,0.0,0.0
Sex,0,0.0,0.0,0.0
Age,177,0.198653,86.0,0.205742
SibSp,0,0.0,0.0,0.0
Parch,0,0.0,0.0,0.0
Ticket,0,0.0,0.0,0.0
Fare,0,0.0,1.0,0.002392


In [64]:
train = train.drop(['Cabin'], axis=1)
train['Age'].fillna(train['Age'].mean(), inplace=True)
train['Embarked'].fillna(train['Embarked'].mode().iloc[0], inplace=True)

test = test.drop(['Cabin'], axis=1)
test['Age'].fillna(test['Age'].mean(), inplace=True)
test['Fare'].fillna(test['Fare'].mean(), inplace=True)

## Part B and C

Dropping unneccessary columns and training the DecisionTreeClassifier

In [69]:
x_train = train.loc[:, (train.columns!='Survived') & (train.columns!='PassengerId') & (train.columns!='Name') & (train.columns!='Ticket')]
x_train.loc[:,'Sex']=x_train['Sex'].map({'male':0, 'female': 1})
x_train.loc[:,'Embarked']=x_train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
y_train = train.loc[:, train.columns=='Survived']
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


DecisionTreeClassifier()

Clearing up the test dataset and predicting the survivals.

In [72]:
x_test = test.loc[:, (test.columns!='Survived') & (test.columns!='PassengerId') & (test.columns!='Name') & (test.columns!='Ticket')]
x_test.loc[:,'Sex']=x_test['Sex'].map({'male':0, 'female': 1})
x_test.loc[:,'Embarked']=x_test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
predictions = clf.predict(x_test)
print(predictions)

[0 0 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0
 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 0 0 1 1 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0
 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0
 0 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0
 1 1 1 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1 1
 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 1
 0 0 1 0 1 0 0 1 0 0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## Part C

In [79]:
predictions = predictions.astype('bool')
np.savetxt("predictions.csv", predictions, delimiter=",")

# Question 3

Loading the dataset

In [44]:
iris = pd.read_csv('data/iris.csv')

Implementing of KNN algorithm.

In [55]:
def KNN(k, train, test):
    confusion_matrix = {
        'Iris-setosa': {
            'Iris-setosa': 0,
            'Iris-versicolor': 0,
            'Iris-virginica': 0
        },
        'Iris-versicolor': {
            'Iris-setosa': 0,
            'Iris-versicolor': 0,
            'Iris-virginica': 0
        },
        'Iris-virginica': {
            'Iris-setosa': 0,
            'Iris-versicolor': 0,
            'Iris-virginica': 0
        }
    }

    x_train = train.loc[:, train.columns != 'class'].to_numpy()  # Train dataset features
    y_train = train['class']  # Train dataset target
    for _, p in test.iterrows():
        class_p = p['class']
        x_p = p.drop('class').to_numpy('float')
        # Find distance from the point p to other points
        distances = np.linalg.norm(x_train - x_p, axis=1)
        # Sort distances and get corresponding indices
        top_nearest_neighbors = np.argsort(distances)
        # Find class of the top nearest neighbors
        label_of_top_nearst_neighbors = np.take(y_train, top_nearest_neighbors)
        # Select top K neighbors and find the mode of the lables
        knn_predicted_class = scipy.stats.mode(label_of_top_nearst_neighbors[:k])[0][0]
        confusion_matrix[class_p][knn_predicted_class] += 1
    return confusion_matrix


# Part A

Implementing 10-fold cross validation to determine best value for K in KNN.

In [56]:
folds_number = 10
dataset_length = len(iris)

# Making folds
step = dataset_length//folds_number
folds = [iris[i*step:(i+1)*step] for i in range(folds_number)]

# Determining error on train and test data
accuracy = {"k": list(), "accuracy": list()}
for k in range(1, 30, 2):
    fold_mean_accuracy = 0
    for i, fold in enumerate(folds):
        # Specify train dataset
        train = pd.concat(folds[:i] + folds[i+1:]).reset_index(drop=True)
        # Specify test dataset
        test = folds[i]
        confusion_matrix = KNN(k,train,test)
        fold_accuracy = (confusion_matrix['Iris-setosa']['Iris-setosa'] + \
                         confusion_matrix['Iris-versicolor']['Iris-versicolor'] + \
                         confusion_matrix['Iris-virginica']['Iris-virginica']) / len(test)
        fold_mean_accuracy += (fold_accuracy/folds_number)
    accuracy["k"].append(k)
    accuracy["accuracy"].append(round(fold_mean_accuracy,4))

In [57]:
fig = px.line(pd.DataFrame(accuracy), x='k', y='accuracy', text="accuracy",
        title='Check Impact of K in KNN', markers=True,
        template=dict(layout=go.Layout(xaxis=dict(dtick=2))))
fig.show()

## Part B

In [58]:
train, test = train_test_split(iris, train_size=0.8, test_size=0.2)

In [61]:
pd.DataFrame(KNN(1,train,train))

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,38,0,0
Iris-versicolor,0,42,0
Iris-virginica,0,0,40


In [60]:
pd.DataFrame(KNN(1,train,test))

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,12,0,0
Iris-versicolor,0,8,1
Iris-virginica,0,0,9
