In [6]:
import numpy as np
import pandas as pd
import scipy.stats as sstats
import csv
from sklearn import datasets
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# Zadanie 1

## a)

In [3]:
def knn(trainX, trainY, testX, ks):
    """Classify test points with k-nearest-neighbours for several k at once.

    Squared Euclidean distances are obtained from the expansion
    ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, so the whole
    (n_test, n_train) distance matrix comes from one matrix product.

    Parameters:
        trainX: 2-D array of training samples, one row per sample.
        trainY: 1-D array of training labels aligned with ``trainX``.
        testX:  2-D array of samples to classify.
        ks:     iterable of neighbourhood sizes.

    Returns:
        dict mapping each k to a 1-D array holding, for every test row,
        the most frequent label among its k closest training samples.
    """
    train_sq = np.sum(trainX**2, axis=1).reshape(1, -1)
    test_sq = np.sum(testX**2, axis=1).reshape(-1, 1)
    dists = train_sq + test_sq
    dists -= 2 * testX.dot(trainX.T)

    # Training indices ordered from nearest to farthest, per test row.
    neighbour_order = np.argsort(dists, axis=1)
    neighbour_labels = trainY[neighbour_order]

    # The first k columns are the k nearest neighbours; vote by mode.
    return {
        k: sstats.mode(neighbour_labels[:, :k], axis=1)[0].ravel()
        for k in ks
    }

## b)

In [4]:
# Load the iris data set and keep features and labels under short names.
iris = datasets.load_iris()
iris_data, iris_target = iris.data, iris.target

In [5]:
# Repeated random hold-out validation of k-NN on iris: N random splits,
# error rate per k on each split, then the average error per k.
N = 500
ks = [3, 5, 7, 9, 11, 13, 15, 17, 19]

columns = ["iter " + str(i) for i in range(N)]
index = ["k=" + str(k) for k in ks]

n_samples = len(iris_target)
n_train = 100  # the remaining samples form the test set

# One row of per-k error rates per iteration; the DataFrame is built once
# at the end instead of inserting N columns one at a time.
errs_per_iter = []
for i in range(N):
    idxs = np.arange(n_samples)
    np.random.shuffle(idxs)

    train_idx = idxs[:n_train]
    test_idx = idxs[n_train:]

    preds = knn(iris_data[train_idx, :], iris_target[train_idx], iris_data[test_idx, :], ks)

    true_targets = iris_target[test_idx]
    # Index predictions by k so every entry is guaranteed to line up with `index`.
    errs_per_iter.append(
        [1 - np.sum(np.equal(true_targets, preds[k])) / len(test_idx) for k in ks]
    )

# Rows: one per k; columns: one per iteration.
df = pd.DataFrame(np.array(errs_per_iter).T, index=index, columns=columns)

res_df = pd.DataFrame(index=index)
res_df['average error'] = df.mean(numeric_only=True, axis=1).round(3)
res_df

Unnamed: 0,average error
k=3,0.042
k=5,0.039
k=7,0.037
k=9,0.034
k=11,0.035
k=13,0.038
k=15,0.042
k=17,0.044
k=19,0.049


## c)

In [6]:
# Read both optdigits files (no header row) and pool them into one data set:
# the first 64 columns are the features, column 64 is the label.
digits_tra = pd.read_csv('optdigits.tra', header=None)
digits_tes = pd.read_csv('optdigits.tes', header=None)
digits_data = pd.concat(
    [part.iloc[:, :64] for part in (digits_tra, digits_tes)],
    ignore_index=True,
)
digits_target = pd.concat(
    [part.iloc[:, 64] for part in (digits_tra, digits_tes)],
    ignore_index=True,
)

In [10]:
# Repeated random hold-out validation of k-NN on the pooled digits data:
# N random 2/3 : 1/3 splits, error rate per k, averaged over the splits.
N = 100
ks = [3, 5, 7, 9, 11, 13, 15, 17, 19]

columns = ["iter " + str(i) for i in range(N)]
index = ["k=" + str(k) for k in ks]
threshold = len(digits_data) * 2 // 3

# Convert to NumPy once; the conversion does not depend on the iteration.
digits_X = digits_data.to_numpy()
digits_y = digits_target.to_numpy()
n_test = len(digits_y) - threshold

# One row of per-k error rates per iteration; the DataFrame is built once
# at the end instead of inserting N columns one at a time.
errs_per_iter = []
for i in range(N):
    idxs = np.arange(len(digits_y))
    np.random.shuffle(idxs)

    train_idx = idxs[:threshold]
    test_idx = idxs[threshold:]
    trainX = digits_X[train_idx, :]
    trainY = digits_y[train_idx]
    testX = digits_X[test_idx, :]

    preds = knn(trainX, trainY, testX, ks)

    true_targets = digits_y[test_idx]
    # Index predictions by k so every entry is guaranteed to line up with `index`.
    errs_per_iter.append(
        [1 - np.sum(np.equal(true_targets, preds[k])) / n_test for k in ks]
    )

# Rows: one per k; columns: one per iteration.
df = pd.DataFrame(np.array(errs_per_iter).T, index=index, columns=columns)

res_df = pd.DataFrame(index=index)
res_df['average error'] = df.mean(numeric_only=True, axis=1).round(3)
res_df

Unnamed: 0,average error
k=3,0.014
k=5,0.015
k=7,0.016
k=9,0.016
k=11,0.017
k=13,0.018
k=15,0.019
k=17,0.02
k=19,0.021


## d)

In [92]:
def knn_cross_valid(dataX, dataY, N=10):
    """Estimate the k-NN error rate for k = 3, 5, ..., 19 by N-fold cross-validation.

    The samples are shuffled with the global NumPy random state and cut into
    N consecutive folds. Each fold serves once as the test set while the
    remaining folds form the training set. When the number of samples is not
    divisible by N the folds differ in size by at most one sample; the
    reported value is the unweighted mean of the per-fold error rates.

    Parameters:
        dataX: 2-D NumPy array of samples, one row per sample.
        dataY: 1-D NumPy array of labels aligned with ``dataX``.
        N:     number of folds (at least 2, at most the number of samples).

    Returns:
        pandas.DataFrame indexed by "k=<k>" with a single column
        'average error' (mean error over the folds, rounded to 3 decimals).

    Raises:
        ValueError: if N is smaller than 2 or larger than the number of samples.
    """
    ks = [3, 5, 7, 9, 11, 13, 15, 17, 19]
    index = ["k=" + str(k) for k in ks]

    n_samples = len(dataY)
    if not 2 <= N <= n_samples:
        raise ValueError(
            "N must be between 2 and the number of samples (%d), got %r" % (n_samples, N)
        )

    idxs = np.arange(n_samples)
    np.random.shuffle(idxs)
    # array_split tolerates a sample count that N does not divide evenly.
    foldsX = np.array_split(dataX[idxs, :], N)
    foldsY = np.array_split(dataY[idxs], N)

    fold_errs = {}
    for i in range(N):
        # Every fold except the i-th is used for training.
        trainX = np.vstack(foldsX[:i] + foldsX[i + 1:])
        trainY = np.hstack(foldsY[:i] + foldsY[i + 1:])
        testX, testY = foldsX[i], foldsY[i]

        preds = knn(trainX, trainY, testX, ks)

        # Index predictions by k so every entry lines up with `index`.
        fold_errs["iter " + str(i)] = [
            1 - np.sum(np.equal(testY, preds[k])) / testY.shape[0] for k in ks
        ]

    # Build the per-fold table in one go (rows: k, columns: folds).
    df = pd.DataFrame(fold_errs, index=index)

    res_df = pd.DataFrame(index=index)
    res_df['average error'] = df.mean(numeric_only=True, axis=1).round(3)
    return res_df

In [93]:
# Cross-validated k-NN error on iris, using the default N=10 folds.
knn_cross_valid(iris_data, iris_target)

Unnamed: 0,average error
k=3,0.04
k=5,0.033
k=7,0.04
k=9,0.033
k=11,0.04
k=13,0.027
k=15,0.027
k=17,0.027
k=19,0.027


In [94]:
# Cross-validated k-NN error on the pooled digits data, using the default N=10 folds.
# knn_cross_valid indexes NumPy arrays, hence the explicit to_numpy() conversions.
knn_cross_valid(digits_data.to_numpy(), digits_target.to_numpy())

Unnamed: 0,average error
k=3,0.012
k=5,0.013
k=7,0.014
k=9,0.014
k=11,0.014
k=13,0.015
k=15,0.015
k=17,0.016
k=19,0.017


# Zadanie 2

In [139]:
# Read the data, drop attributes that are irrelevant for classification,
# then discard every row that still has a missing value.
titanic_df = (
    pd.read_csv('titanic.csv')
    .drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    .dropna()
)
# Re-encode the nominal attributes as integer category codes.
titanic_df['Sex'] = pd.Categorical(titanic_df['Sex']).codes
titanic_df['Embarked'] = pd.Categorical(titanic_df['Embarked']).codes
# Split into the class label (y) and the passenger description (X).
titanic_target = titanic_df['Survived']
titanic_data = titanic_df.drop('Survived', axis=1)

## a)

In [140]:
# Fit an unrestricted tree with the default split criterion ('gini') on the
# whole titanic set and write it out in Graphviz .dot format.
t_gini = tree.DecisionTreeClassifier().fit(titanic_data, titanic_target)
with open('titanic_gini.dot', "w") as f:
    tree.export_graphviz(
        t_gini,
        out_file=f,
        feature_names=titanic_data.columns,
    )

In [141]:
# Same experiment as the gini tree, but splitting on information gain.
t_entropy = tree.DecisionTreeClassifier(criterion='entropy')
t_entropy = t_entropy.fit(titanic_data, titanic_target) # criterion explicitly set to 'entropy' above
# Write the fitted tree in Graphviz .dot format.
with open('titanic_entropy.dot', "w") as f:
    tree.export_graphviz(t_entropy, out_file=f, feature_names=titanic_data.columns)

## b)

In [142]:
# One random 65% / 35% train/test split of the titanic data; the resulting
# trainX / trainY / testX / testY are reused by the following cells.
M = len(titanic_data)
threshold = round(M * 0.65)
idxs = np.arange(M)
np.random.shuffle(idxs)

train_idx, test_idx = idxs[:threshold], idxs[threshold:]
trainX, trainY = titanic_data.iloc[train_idx, :], titanic_target.iloc[train_idx]
testX, testY = titanic_data.iloc[test_idx, :], titanic_target.iloc[test_idx]

# Unrestricted tree: score on the training part, then on the held-out part.
t = tree.DecisionTreeClassifier().fit(trainX, trainY)

print(t.score(trainX, trainY))
print(t.score(testX, testY))

0.9892008639308856
0.7590361445783133


## c)

In [143]:
# Train/test score of the tree as a function of the max_depth limit (2..19),
# evaluated on the fixed split prepared earlier.
D = np.arange(2, 20)

df = pd.DataFrame(index=['train score', 'test score'], columns=D)

for depth in D:
    t = tree.DecisionTreeClassifier(max_depth=depth).fit(trainX, trainY)
    train_score = t.score(trainX, trainY)
    test_score = t.score(testX, testY)
    df[depth] = [train_score, test_score]

df.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
train score,0.788337,0.814255,0.846652,0.863931,0.87473,0.889849,0.920086,0.933045,0.956803,0.965443,0.971922,0.980562,0.984881,0.984881,0.989201,0.989201,0.989201,0.989201
test score,0.763052,0.827309,0.779116,0.795181,0.803213,0.807229,0.787149,0.7751,0.742972,0.742972,0.746988,0.726908,0.759036,0.75502,0.730924,0.738956,0.730924,0.726908


## d)

In [144]:
# Train/test score of the tree as a function of min_samples_split (5, 10, ..., 45),
# evaluated on the fixed split prepared earlier.
L = np.arange(5, 50, 5)

df = pd.DataFrame(index=['train score', 'test score'], columns=L)

for min_split in L:
    t = tree.DecisionTreeClassifier(min_samples_split=min_split).fit(trainX, trainY)
    train_score = t.score(trainX, trainY)
    test_score = t.score(testX, testY)
    df[min_split] = [train_score, test_score]

df.head()

Unnamed: 0,5,10,15,20,25,30,35,40,45
train score,0.941685,0.907127,0.887689,0.87041,0.87041,0.866091,0.853132,0.848812,0.848812
test score,0.75502,0.831325,0.815261,0.811245,0.811245,0.795181,0.803213,0.795181,0.795181


## e)

In [14]:
def tree_cross_valid(tree, dataX, dataY, N=10, show_conf_mtrx=False):
    """Evaluate an estimator with N-fold cross-validation.

    The samples are shuffled with the global NumPy random state and cut into
    N consecutive folds. Each fold serves once as the test set while the
    estimator is refitted on the remaining folds. The sample count does not
    have to be divisible by N; fold sizes then differ by at most one sample.

    Parameters:
        tree: estimator exposing ``fit(X, y)``, ``predict(X)`` and
            ``score(X, y)``. Inside this function the name shadows the
            imported ``sklearn.tree`` module. The object is refitted in
            place and is left fitted on the last training set.
        dataX: 2-D NumPy array of samples, one row per sample.
        dataY: 1-D NumPy array of labels aligned with ``dataX``.
        N: number of folds; must not exceed the number of samples.
        show_conf_mtrx: when true, print the confusion matrix of every fold.

    Returns:
        1-D NumPy array with the value of ``tree.score`` on each fold.
    """
    assert dataX.shape[0] == dataY.shape[0]
    assert N <= dataX.shape[0], "more folds requested than samples available"

    idxs = np.arange(len(dataY))
    np.random.shuffle(idxs)
    # array_split tolerates a sample count that N does not divide evenly.
    foldsX = np.array_split(dataX[idxs, :], N)
    foldsY = np.array_split(dataY[idxs], N)

    # Fix the label set up front so every printed confusion matrix has the
    # same shape, even when a fold happens to miss one of the classes.
    labels = np.unique(dataY) if show_conf_mtrx else None

    score = []
    for i in range(N):
        # Every fold except the i-th is used for training.
        trainX = np.vstack(foldsX[:i] + foldsX[i + 1:])
        trainY = np.hstack(foldsY[:i] + foldsY[i + 1:])
        testX, testY = foldsX[i], foldsY[i]

        tree.fit(trainX, trainY)

        if show_conf_mtrx:
            predY = tree.predict(testX)
            print(confusion_matrix(testY, predY, labels=labels))

        score.append(tree.score(testX, testY))

    return np.array(score)

In [148]:
# Compare four decision-tree configurations on titanic with 8-fold
# cross-validation; each line prints the mean of the per-fold scores.
titanic_X = titanic_data.to_numpy()
titanic_y = titanic_target.to_numpy()

t_gini = tree.DecisionTreeClassifier()
print("Tree with Gini criterion:", tree_cross_valid(t_gini, titanic_X, titanic_y, N=8).mean())

t_entropy = tree.DecisionTreeClassifier(criterion='entropy')
print("Tree with Entropy criterion:", tree_cross_valid(t_entropy, titanic_X, titanic_y, N=8).mean())

t_max_depth = tree.DecisionTreeClassifier(max_depth=5)
print("Tree with max_depth=5:", tree_cross_valid(t_max_depth, titanic_X, titanic_y, N=8).mean())

t_min_samples_splt = tree.DecisionTreeClassifier(min_samples_split=35)
# Evaluate the min_samples_split tree itself here, not t_max_depth.
print("Tree with min_samples_split=35:", tree_cross_valid(t_min_samples_splt, titanic_X, titanic_y, N=8).mean())

Tree with Gini criterion: 0.7514044943820225
Tree with Entropy criterion: 0.7514044943820224
Tree with max_depth=5: 0.7949438202247191
Tree with min_samples_split=35: 0.7963483146067416


# Zadanie 3

In [153]:
# Random forest with 20 trees, evaluated on iris via cross_val_score (cv=10).
rf_clf = RandomForestClassifier(n_estimators=20)

scores = cross_val_score(rf_clf, iris_data, iris_target, cv=10)
# Only the mean of the per-fold scores is reported.
print(scores.mean())

0.9533333333333334


In [154]:
# Extra-trees ensemble with 20 trees, evaluated on iris via cross_val_score (cv=10).
extra_rf_clf = ExtraTreesClassifier(n_estimators=20)
scores = cross_val_score(extra_rf_clf, iris_data, iris_target, cv=10)
# Only the mean of the per-fold scores is reported.
print(scores.mean())

0.9466666666666667


In [155]:
# Random forest with 20 trees, evaluated on titanic via cross_val_score (cv=10).
rf_clf = RandomForestClassifier(n_estimators=20)

scores = cross_val_score(rf_clf, titanic_data.to_numpy(), titanic_target.to_numpy(), cv=10)
# Only the mean of the per-fold scores is reported.
print(scores.mean())

0.799459534987704


In [156]:
# Extra-trees ensemble with 20 trees, evaluated on titanic via cross_val_score (cv=10).
extra_rf_clf = ExtraTreesClassifier(n_estimators=20)
scores = cross_val_score(extra_rf_clf, titanic_data.to_numpy(), titanic_target.to_numpy(), cv=10)
# Only the mean of the per-fold scores is reported.
print(scores.mean())

0.788190811535882


# Zadanie 4

In [175]:
# Column names for the header-less mushroom file; "target" is the class label.
columns = [
 "target", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor", 
 "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", 
 "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", 
 "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", 
 "ring-number", "ring-type", "spore-print-color", "population", "habitat", ]

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
mushroom_df = pd.read_csv(url, header=None, names=columns)

# Replace every column (label included) by its integer category codes.
for name in columns:
    mushroom_df[name] = pd.Categorical(mushroom_df[name]).codes

# 'stalk-root' is excluded from the feature set.
mushroom_df = mushroom_df.drop('stalk-root', axis=1)

# Separate the class label from the describing attributes.
mushroom_target = mushroom_df['target']
mushroom_data = mushroom_df.drop('target', axis=1)

In [176]:
# Unrestricted decision tree on the mushroom data, 12-fold cross-validation.
# show_conf_mtrx=True prints one confusion matrix per fold before the mean score.
t = tree.DecisionTreeClassifier()
scores = tree_cross_valid(t, mushroom_data.to_numpy(), mushroom_target.to_numpy(), N=12, show_conf_mtrx=True)
print(scores.mean())

[[354   0]
 [  0 323]]
[[341   0]
 [  0 336]]
[[339   0]
 [  0 338]]
[[351   0]
 [  0 326]]
[[348   0]
 [  0 329]]
[[363   0]
 [  0 314]]
[[353   0]
 [  0 324]]
[[357   0]
 [  0 320]]
[[340   0]
 [  0 337]]
[[360   0]
 [  0 317]]
[[334   0]
 [  0 343]]
[[368   0]
 [  0 309]]
1.0


In [183]:
# Same 12-fold evaluation on the mushroom data with the tree depth limited to 5.
# show_conf_mtrx=True prints one confusion matrix per fold before the mean score.
t_max_depth = tree.DecisionTreeClassifier(max_depth=5)
scores = tree_cross_valid(t_max_depth, mushroom_data.to_numpy(), mushroom_target.to_numpy(), N=12, show_conf_mtrx=True)
print(scores.mean())

[[338   4]
 [  2 333]]
[[335  13]
 [  6 323]]
[[335   9]
 [  3 330]]
[[336  14]
 [  1 326]]
[[333  14]
 [  4 326]]
[[339   7]
 [  9 322]]
[[358  10]
 [  4 305]]
[[342  10]
 [  7 318]]
[[350   9]
 [  2 316]]
[[349  10]
 [  7 311]]
[[344   6]
 [  1 326]]
[[329  14]
 [  2 332]]
0.9793205317577548


# Zadanie 5

In [128]:
# Column names for the header-less car evaluation file; "target" is the class label.
columns = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "target"]

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
car_df = pd.read_csv(url, header=None, names=columns)


# Replace every column (label included) by its integer category codes.
for name in columns:
    car_df[name] = pd.Categorical(car_df[name]).codes
    
# Separate the describing attributes from the class label.
car_data = car_df.drop('target', axis = 1)
car_target = car_df['target']

In [166]:
# Unrestricted decision tree on the car data, 12-fold cross-validation.
# show_conf_mtrx=True prints one confusion matrix per fold before the mean score.
t = tree.DecisionTreeClassifier()
scores = tree_cross_valid(t, car_data.to_numpy(), car_target.to_numpy(), N=12, show_conf_mtrx=True)
print(scores.mean())

[[ 28   0   1   0]
 [  1   7   0   0]
 [  0   0 102   0]
 [  0   0   0   5]]
[[30  0  1  0]
 [ 0  7  0  0]
 [ 2  0 99  0]
 [ 0  0  0  5]]
[[37  1  0  1]
 [ 0  3  1  0]
 [ 0  1 96  0]
 [ 0  0  0  4]]
[[ 24   0   1   0]
 [  0   5   0   0]
 [  0   0 110   0]
 [  0   0   0   4]]
[[ 30   0   0   0]
 [  0   7   0   0]
 [  1   0 100   0]
 [  1   1   0   4]]
[[29  0  0  0]
 [ 1  6  0  0]
 [ 0  0 99  0]
 [ 0  0  0  9]]
[[ 33   1   4   0]
 [  1   1   0   0]
 [  1   0 100   0]
 [  0   0   0   3]]
[[27  0  1  0]
 [ 1 12  0  0]
 [ 0  0 95  0]
 [ 0  0  0  8]]
[[41  0  0  0]
 [ 0  3  0  0]
 [ 2  0 92  0]
 [ 1  0  0  5]]
[[ 31   0   0   1]
 [  0   1   0   0]
 [  1   0 106   0]
 [  0   0   0   4]]
[[ 27   0   1   0]
 [  0   7   0   0]
 [  2   0 100   0]
 [  0   0   0   7]]
[[34  0  0  0]
 [ 0  5  0  0]
 [ 1  1 99  0]
 [ 0  0  0  4]]
0.9809027777777777


In [180]:
# Same 12-fold evaluation on the car data with the tree depth limited to 6.
# show_conf_mtrx=True prints one confusion matrix per fold before the mean score.
t_max_depth = tree.DecisionTreeClassifier(max_depth=6)
scores = tree_cross_valid(t_max_depth, car_data.to_numpy(), car_target.to_numpy(), N=12, show_conf_mtrx=True)
print(scores.mean())

[[23  0  0  1]
 [ 6  0  0  0]
 [14  0 89  0]
 [ 0  0  0 11]]
[[23  3  0  2]
 [ 2  1  0  1]
 [13  1 91  0]
 [ 0  0  0  7]]
[[26  0  2  1]
 [ 5  0  0  0]
 [11  0 95  0]
 [ 0  0  0  4]]
[[28  4  0  4]
 [ 3  2  0  0]
 [ 8  0 88  0]
 [ 0  0  0  7]]
[[37  6  0  2]
 [ 6  2  0  0]
 [ 6  0 82  0]
 [ 0  0  0  3]]
[[27  0  0  1]
 [ 5  0  0  0]
 [12  0 95  0]
 [ 0  0  0  4]]
[[23  0  5  3]
 [ 4  0  0  1]
 [ 6  0 95  0]
 [ 0  0  0  7]]
[[ 22   0   0   3]
 [  5   0   0   2]
 [  8   0 100   0]
 [  0   0   0   4]]
[[32  2  0  1]
 [ 6  1  0  1]
 [ 6  1 90  0]
 [ 0  0  0  4]]
[[29  3  5  3]
 [ 4  2  0  3]
 [ 1  0 89  0]
 [ 0  0  0  5]]
[[31  0  7  0]
 [ 3  0  0  0]
 [ 5  0 92  0]
 [ 0  0  0  6]]
[[ 24   0   0   1]
 [  3   0   0   1]
 [ 11   0 101   0]
 [  0   0   0   3]]
0.8709490740740743


# Zadanie 6

In [29]:
# Reference list of the columns in the file (kept as a comment; the names are read from the header row):
# columns = ["age", "job", "marital", "education", "default", "balance", "housing", "loan", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"]

# Columns that get re-encoded as integer category codes; "y" is the class label.
categorical_cols = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome", "y"]

# The file is ';'-separated and its first row holds the column names.
bank_df = pd.read_csv('bank/bank-full.csv', sep=';', header=0)

print(bank_df.shape)

# Columns not listed in categorical_cols are left untouched.
for name in categorical_cols:
    bank_df[name] = pd.Categorical(bank_df[name]).codes
# Separate the describing attributes from the class label.
bank_data = bank_df.drop('y', axis = 1)
bank_target = bank_df['y']

(45211, 17)


In [30]:
# Depth-limited tree (max_depth=5) on the bank data with 29-fold cross-validation;
# 29 divides the 45211 rows reported above evenly, so all folds have equal size.
# show_conf_mtrx=True prints one confusion matrix per fold before the mean score.
t = tree.DecisionTreeClassifier(max_depth=5)
scores = tree_cross_valid(t, bank_data.to_numpy(), bank_target.to_numpy(), N=29, show_conf_mtrx=True)
print(scores.mean())

[[1345   48]
 [ 104   62]]
[[1337   46]
 [ 117   59]]
[[1316   42]
 [ 109   92]]
[[1321   35]
 [ 132   71]]
[[1322   45]
 [ 134   58]]
[[1332   41]
 [ 105   81]]
[[1348   45]
 [ 112   54]]
[[1332   54]
 [  91   82]]
[[1339   56]
 [  89   75]]
[[1314   59]
 [ 105   81]]
[[1336   42]
 [ 116   65]]
[[1342   44]
 [ 109   64]]
[[1302   57]
 [ 120   80]]
[[1336   39]
 [ 123   61]]
[[1328   44]
 [ 130   57]]
[[1329   38]
 [ 125   67]]
[[1343   32]
 [ 116   68]]
[[1329   45]
 [ 118   67]]
[[1308   41]
 [ 142   68]]
[[1340   39]
 [ 114   66]]
[[1358   45]
 [  98   58]]
[[1335   51]
 [ 105   68]]
[[1316   57]
 [ 116   70]]
[[1329   56]
 [  99   75]]
[[1333   49]
 [ 111   66]]
[[1323   51]
 [ 121   64]]
[[1356   41]
 [ 106   56]]
[[1317   40]
 [ 131   71]]
[[1336   38]
 [ 121   64]]
0.8973922275552412


# Zadanie 7

In [12]:
# Load the UCI HAR training split directly as numeric arrays.
# X_train.txt holds one whitespace-separated feature vector per line and
# y_train.txt one label per line. Parsing with np.loadtxt gives a float
# feature matrix up front instead of a nested list of strings that each
# estimator would have to convert on every fit.
human_data = np.loadtxt('UCI HAR Dataset/train/X_train.txt')
# NOTE(review): assumes the labels are integer activity codes - confirm
# against the data set description.
human_target = np.loadtxt('UCI HAR Dataset/train/y_train.txt', dtype=int)

# Every feature row needs exactly one label.
assert human_data.shape[0] == human_target.shape[0]

In [18]:
# Depth-limited tree (max_depth=5) on the HAR training data, 8-fold cross-validation.
# show_conf_mtrx=True prints one confusion matrix per fold before the mean score.
t = tree.DecisionTreeClassifier(max_depth=5)
scores = tree_cross_valid(t, np.array(human_data), np.array(human_target), N=8, show_conf_mtrx=True)
print(scores.mean())

[[131  10  13   0   0   0]
 [ 10 130   6   0   0   0]
 [  3  12  91   0   0   0]
 [  0   0   0 146   7   0]
 [  0   0   0  23 143   0]
 [  0   0   0   0   0 194]]
[[139  18   5   0   0   0]
 [ 14 107   8   0   0   0]
 [  8   9 114   0   0   0]
 [  0   0   0 140  32   0]
 [  0   0   0   8 152   0]
 [  0   0   0   0   0 165]]
[[155   9   4   0   0   0]
 [  9 115   5   0   0   0]
 [ 10  12  87   0   0   0]
 [  0   0   0 163  24   0]
 [  0   0   0   9 149   0]
 [  0   0   0   0   0 168]]
[[158   9   1   0   0   0]
 [ 13 127   9   0   0   0]
 [  7  13  98   0   0   0]
 [  0   0   0 130  14   0]
 [  0   0   0  11 162   0]
 [  0   0   0   0   0 167]]
[[122   4   7   0   0   0]
 [ 14 107   5   0   0   0]
 [  6   5 115   0   0   0]
 [  0   0   0 130  28   0]
 [  0   0   0  11 161   0]
 [  0   0   0   0   0 204]]
[[132  13   0   0   0   0]
 [  5 113   5   0   0   0]
 [ 12  20 123   0   0   0]
 [  0   1   0 143  12   0]
 [  0   0   0  23 146   0]
 [  0   0   0   0   0 171]]
[[133  10   3   0   0 

In [19]:
# Random forest with 20 trees on the HAR training data via cross_val_score (cv=8).
rf_clf = RandomForestClassifier(n_estimators=20)

scores = cross_val_score(rf_clf, np.array(human_data), np.array(human_target), cv=8)
# Only the mean of the per-fold scores is reported.
print(scores.mean())

0.9150161119089815


In [21]:
# Extra-trees ensemble with 20 trees on the HAR training data via cross_val_score (cv=8).
extra_rf_clf = ExtraTreesClassifier(n_estimators=20)
scores = cross_val_score(extra_rf_clf, np.array(human_data), np.array(human_target), cv=8)
# Only the mean of the per-fold scores is reported.
print(scores.mean())

0.9305170908111609
