In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pprint
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
import seaborn as sns
import math
import copy
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.utils import check_random_state

## Загрузим дату титаника и удалим все лишнее

In [2]:
d1 = pd.read_csv('titanic_data.csv')
d2 = pd.read_csv('titanic_surv.csv')
data = pd.concat([d1, d2], axis=1)

In [3]:
data = data.drop(columns=['Cabin'])

In [4]:
mean = data['Age'].mean()
std = data['Age'].std()
number_of_nulls = data['Age'].isnull().sum()
random_ages = np.random.randint(mean - std, mean + std, size=number_of_nulls)

In [5]:
new_ages = data['Age'].copy()
new_ages[np.isnan(new_ages)] = random_ages
data['Age'] = new_ages

In [6]:
data = data[data['Embarked'].notnull()]

In [7]:
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({"S": 0, "C": 1, "Q": 2})

In [8]:
data = data.drop(columns=['Name', 'PassengerId', 'Ticket'])

In [9]:
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,0,22.0,1,0,7.25,0,0
1,1,1,38.0,1,0,71.2833,1,1
2,3,1,26.0,0,0,7.925,0,1
3,1,1,35.0,1,0,53.1,0,1
4,3,0,35.0,0,0,8.05,0,0


## Разделим на тестовую и обучающую выборки

In [10]:
train, test = train_test_split(data, test_size=0.2)

In [11]:
train.shape, test.shape

((711, 8), (178, 8))

In [12]:
x_train = train.drop(columns=['Survived'])
y_train = train['Survived']

In [13]:
x_test = test.drop(columns=['Survived'])
y_test = test['Survived']

## Логическая регрессия

In [55]:
class LR:
    def __init__(self, step=10**-2, it_c=1):
        self.step = step
        self.it_c = it_c

    def get_coeff(self, x, y):
        err = 0
        x = x.to_numpy()
        y = y.to_numpy()
        
        for i in range(len(y)): 
            if self.predict(x[i]) == y[i]:
                err += 1
        return err / len(y)
    
    def fit(self, x, y):
        x = x.to_numpy()
        y = y.to_numpy()
        np.c_[x, np.ones(len(x))] 
        self.w = np.zeros(x.shape[1])
        for i in range(self.it_c):
            z = [sum([x[i][j]*self.w[j] for j in range(len(x[i]))]) for i in range(len(x))]
            res = np.array([1 / (1 + np.exp(-a)) for a in z])
            grad = np.dot(x.T, res - y) / y.size
            self.w -= self.step * grad
    def predict(self, x):
        np.append(x,1)
        return (1 / (1 + np.exp(-np.dot(x, self.w)))).round()

In [56]:
mlg = LR()
hist = mlg.fit(x_train, y_train)
print(f"моя LogReg: {mlg.get_coeff(x_test, y_test)}")

моя LogReg: 0.6573033707865169


In [54]:
sklg = LogisticRegression()
sklg.fit(x_train, y_train)
print(f"sklearn: {sklg.score(x_test, y_test)}")

sklearn: 0.7808988764044944


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## SVM - Support Vector Machine

In [250]:
def projection_simplex(v, z=1):
    n_features = v.shape[0]
    u = np.sort(v)[::-1]
    cssv = np.cumsum(u) - z
    ind = np.arange(n_features) + 1
    cond = u - cssv / ind > 0
    rho = ind[cond][-1]
    theta = cssv[cond][-1] / float(rho)
    w = np.maximum(v - theta, 0)
    return w

In [432]:
class SVM:
    def __init__(self, C=1, max_iter=100, eps=0.01, random_state=None, verbose=0):
        self.C = C
        self.max_iter = max_iter
        self.eps = eps
        self.random_state = random_state
        self.verbose = verbose

    def partial_gradient(self,f, t, i):
#         print(f[0],self.coef.T)
        f = f.to_numpy()
        g = np.dot(f[i], self.coef.T) + 1
        g[int(t[i])] -= 1
        return g

    def violation(self, g, t, i):
        smallest = np.inf
        for k in range(g.shape[0]):
            if k == t[i] and self.dual_coef[k, i] >= self.C:
                continue
            elif k != t[i] and self.dual_coef[k, i] >= 0:
                continue

            smallest = min(smallest, g[k])
        return g.max() - smallest

    def solver(self, g, t, norms, i):
        Ci = np.zeros(g.shape[0])
        Ci[int(t[i])] = self.C
        beta_hat = norms[i] * (Ci - self.dual_coef[:, i]) + g / norms[i]
        z = self.C * norms[i]
        beta = projection_simplex(beta_hat, z)
        return Ci - self.dual_coef[:, i] - beta / norms[i]

    def fit(self,f, t):
        n_samples, n_features =f.shape
        n_classes = 4
        self.dual_coef = np.zeros((n_classes, n_samples), dtype=np.float64)
        self.coef = np.zeros((n_classes, n_features))
        norms = np.sqrt(np.sum(f ** 2, axis=1))
        rs = check_random_state(self.random_state)
        ind = np.arange(n_samples)
        rs.shuffle(ind)
        violation_init = None
        for it in range(self.max_iter):
            violation_sum = 0
            for idx in range(n_samples):
                i = ind[idx]

                if norms[i] == 0:
                    continue
                g = self.partial_gradient(f, t, i)
                v = self.violation(g, t, i)
                violation_sum += v
                if v < 1e-12:
                    continue
                delta = self.solver(g, t, norms, i)
                self.coef += (delta *f[i][:, np.newaxis]).T
                self.dual_coef[:, i] += delta
            if it == 0:
                violation_init = violation_sum
            vratio = violation_sum / violation_init
            if self.verbose >= 1:
                print("iter", it + 1, "violation", vratio)
            if vratio < self.eps:
                if self.verbose >= 1:
                    print("Converged")
                break
        return self

    def predict(self,f):
        decision = np.dot(f, self.coef.T)
        pred = decision.argmax(axis=0)
        return pred

    def get_coeff(self, features, target):
        cnt = 0
        for i in range(target.shape[0]):
            if self.predict(features[i]) == target[i]:
                cnt += 1
        return cnt / target.shape[0]

In [434]:
svm = SVM()
# svm.fit(x_train, y_train)
# print('Результат собственной реализации дерева решений: {}'
#       .format(svm.get_coeff(x_train, y_train) * 100))

In [435]:
svc = SVC(decision_function_shape='ovr')
svc.fit(x_train, y_train)
print('Результат реализации sklearn: {}'
.format(svc.score(x_train, y_train) * 100))

Результат реализации sklearn: 67.65119549929676


In [440]:
class SVM:
    def __init__(self, err=10**-2, step=1000):
        self.err=err
        self.step=1000
    
    def fit(self,x,y):
        self.x = x.to_numpy().tolist()
        self.y = y.to_numpy().tolist()
        
        
        
        
        
        
    def predict(self,x):
        pass
    def predictAll(self,x,y):
        pass

## Дерево решений

In [404]:
class Node:
    def __init__(self, x, y, deep=0, mDeep=3):
        self.x = copy.deepcopy(x)
        self.y = copy.deepcopy(y)
        self.leaf = False
        self.deep = deep
        if deep != mDeep and max([self.entropy(x,y,j) for j in range(len(x))]) != 0:
#             print(f"start Node d:{self.deep}, x:{len(x)}")
            self.make()
        else:
            self.leaf = True
        
    def split(self,x,y,j):
        lx,ly,rx,ry = [],[],[],[]
        mean = sum([x[i][j] for i in range(len(x))]) / len(x)
        for i in range(len(x)):
            if x[i][j] <= mean:
                lx.append(copy.copy(x[i]))
                ly.append(y[i])
            else:
                rx.append(copy.copy(x[i]))
                ry.append(y[i])
        return lx,ly,rx,ry
    
    def make(self):
        assert len(self.x) > 0
        r = [self.gain(self.x,self.y,j) for j in range(len(self.x[0]))]
        m = r.index(max(r)) # индекс максимального выигрыша по которому будем сплитить дерево
        
        lx, ly, rx, ry = self.split(self.x,self.y,m)
        self.indexOfSplit = m
        self.mean = sum([self.x[i][m] for i in range(len(self.x))]) / len(self.x)
#         print(f"made Node d:{self.deep}, l:{len(lx)}, r:{len(rx)}, m:{m}")
        
        self.l = Node(lx,ly,self.deep+1)
        self.r = Node(rx,ry,self.deep+1)

    def entropy(self,x,y,j):
        assert len(x) > 0
        r = [0,0]
        for i in range(len(x)):
            r[y[i]] += 1
        return -(r[0]/len(x) * math.log2(r[0]/len(x)) + r[1]/len(x) * math.log2(r[1]/len(x))) # число от 0 до 1
    
    def gain(self,x,y,j):
        lx,ly,rx,ry = self.split(x,y,j)
        if len(lx) == 0 or len(rx) == 0:
            return 0
        return self.entropy(x,y,j) - (len(lx) / len(x)) * self.entropy(lx,ly,j) - (len(rx) / len(x)) * self.entropy(rx,ry,j) # надо сплитить там где это число максимальное
    
    def display(self,deep=0):
        if self.leaf:
            print('\t'*deep + f"d:{self.deep},x:{len(self.x)}")
        else:
            print('\t'*deep + f"d:{self.deep},m:{self.indexOfSplit}")
            self.r.display(deep+1)
            self.l.display(deep+1)
            
    def predict(self,x):
        if self.leaf:
            r = [0,0]
            for i in range(len(x)):
                r[self.y[i]] += 1
            return r.index(max(r))
        else:
            if x[self.indexOfSplit] <= self.mean:
                return self.l.predict(x)
            else:
                return self.r.predict(x)

In [409]:
class dTree:
    def __init__(self,x,y):
        self.x = x.to_numpy().tolist()
        self.y = y.to_numpy().tolist()
        self.root = Node(self.x,self.y)
        print()
        self.root.display()
    
    def predict(self,x):
        return self.root.predict(x)
    
    def predictAll(self,x,y):
        res = [self.predict(a) for a in x.to_numpy().tolist()]
        y = y.to_numpy().tolist()
        err = 0
        for i in range(len(res)):
            if res[i] == y[i]:
                err += 1
        return err/len(res)

In [410]:
dtr = dTree(x_train,y_train)


d:0,m:1
	d:1,m:0
		d:2,m:6
			d:3,x:49
			d:3,x:67
		d:2,m:5
			d:3,x:49
			d:3,x:78
	d:1,m:5
		d:2,m:3
			d:3,x:26
			d:3,x:96
		d:2,m:4
			d:3,x:28
			d:3,x:318


In [416]:
print(f"мое дерево: {dtr.predictAll(x_test,y_test)}")

мое дерево: 0.7752808988764045


In [417]:
dt = DecisionTreeClassifier(max_depth=7)
dt.fit(x_train, y_train)
dt.score(x_test, y_test)
print(f"sklearn: {dt.score(x_test, y_test)}")

sklearn: 0.7921348314606742
