In [196]:
import numpy as np

In [197]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise.

    The value is computed through ``exp(-|x|)``, which always lies in
    ``[0, 1]``, so inputs of large magnitude cannot overflow the exponential
    (the naive form overflows, and warns, for very negative ``x``).

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Sigmoid of ``x`` with the same shape as the input.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # in [0, 1] for every finite x
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically the same.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional results as arrays.
    return result[()]

In [198]:
def loss(y, y_hat, eps=1e-15):
    """Summed binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : numpy.ndarray or scalar
        Ground-truth labels (0 or 1).
    y_hat : numpy.ndarray or scalar
        Predicted probabilities, broadcast-compatible with ``y``.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs so
        that a saturated prediction (exactly 0 or 1) yields a large finite
        loss instead of ``inf``/``nan``.

    Returns
    -------
    float
        ``-sum(y * log(y_hat) + (1 - y) * log(1 - y_hat))``.
    """
    # Guard against log(0): without clipping, 0 * log(0) evaluates to nan.
    y_hat = np.clip(y_hat, eps, 1 - eps)
    positive_term = np.multiply(y, np.log(y_hat))
    negative_term = np.multiply(1 - y, np.log(1 - y_hat))
    return -np.sum(positive_term + negative_term)

In [199]:
def hypothesis(X, W):
    """Logistic-regression prediction: sigmoid applied to the product ``X . W``.

    Parameters
    ----------
    X : array-like
        Feature matrix (or a single row of features).
    W : array-like
        Weight vector/matrix compatible with ``np.dot(X, W)``.

    Returns
    -------
    Result of ``sigmoid`` applied to the linear scores.
    """
    linear_scores = np.dot(X, W)
    return sigmoid(linear_scores)

In [200]:
def GD(X, y, learning_rate=0.0001, iterations=10):
    """Fit logistic-regression weights with full-batch gradient descent.

    Parameters
    ----------
    X : array-like, shape (N, m)
        Feature matrix.
    y : array-like, shape (N,) or (N, 1)
        Binary labels. They are reshaped to a column so that a 1-D label
        vector does not broadcast against the (N, 1) predictions into an
        (N, N) matrix.
    learning_rate : float, optional
        Step size applied to the gradient.
    iterations : int, optional
        Maximum number of update steps.

    Returns
    -------
    W : numpy.ndarray, shape (m, 1)
        Weights after the last update performed.
    loss_history : numpy.ndarray, shape (iterations,)
        Loss measured before each update; entries for iterations that were
        skipped by early stopping remain 0.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly N values.
    """
    X = np.asarray(X)
    N, m = np.shape(X)
    y = np.reshape(y, (N, 1))

    # Weights start from a random normal draw; only the current weights are
    # kept because nothing else in the loop needs the earlier ones.
    W = np.random.randn(m, 1)
    loss_history = np.zeros(iterations)

    for i in range(iterations):
        y_hat = hypothesis(X, W)
        loss_history[i] = loss(y, y_hat)

        W_next = W - learning_rate * np.dot(X.T, (y_hat - y))
        step_size = np.linalg.norm(W_next - W)
        W = W_next

        # Stop once an update barely moves the weights.
        if step_size < 1e-3:
            break
    return W, loss_history

In [201]:
def SGD(X, y, learning_rate=0.01, iterations=1000):
    """Fit logistic-regression weights with stochastic gradient descent.

    Each epoch visits the samples in their stored order and updates the
    weights after every single sample, so each step starts from the weights
    produced by the previous one.

    Parameters
    ----------
    X : array-like, shape (N, m)
        Feature matrix.
    y : array-like, shape (N,) or (N, 1)
        Binary labels (reshaped internally to a column).
    learning_rate : float, optional
        Step size applied to each per-sample gradient.
    iterations : int, optional
        Maximum number of epochs.

    Returns
    -------
    W : numpy.ndarray, shape (m, 1)
        Weights after the last epoch performed.
    loss_history : numpy.ndarray, shape (iterations,)
        Per-epoch sum of the per-sample losses; entries for epochs skipped by
        early stopping remain 0.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly N values.
    """
    X = np.asarray(X)
    N, m = np.shape(X)
    y = np.reshape(y, (N, 1))

    W = np.random.randn(m, 1)
    loss_history = np.zeros(iterations)

    for i in range(iterations):
        W_epoch_start = W
        epoch_loss = 0.0

        # One epoch: a gradient step per sample, applied immediately so the
        # next sample is evaluated with the updated weights.
        for index in range(N):
            x_i = X[index].reshape(1, -1)
            y_i = y[index].reshape(1, 1)
            y_hat = hypothesis(x_i, W)
            epoch_loss += loss(y_i, y_hat)
            W = W - learning_rate * np.dot(x_i.T, (y_hat - y_i))

        loss_history[i] = epoch_loss

        # Stop once a whole epoch barely moves the weights.
        if np.linalg.norm(W - W_epoch_start) < 1e-3:
            break
    return W, loss_history


In [202]:
def get_batchs(X, y, batch_size):
    """Yield consecutive ``(x_batch, y_batch)`` slices of ``X`` and ``y``.

    Slices start at 0, ``batch_size``, ``2 * batch_size``, ... up to
    ``len(X)``; the final slice may be shorter than ``batch_size``.
    """
    for start in range(0, len(X), batch_size):
        stop = start + batch_size
        yield X[start:stop], y[start:stop]
    
    
def Mini_batch(X, y, learning_rate=0.01, iterations=1000, batch_size=28):
    """Fit logistic-regression weights with mini-batch gradient descent.

    Each epoch walks through the data in consecutive batches and updates the
    weights after every batch, so each step starts from the weights produced
    by the previous one.

    Parameters
    ----------
    X : array-like, shape (N, m)
        Feature matrix.
    y : array-like, shape (N,) or (N, 1)
        Binary labels (reshaped internally to a column so they line up with
        the (batch, 1) predictions).
    learning_rate : float, optional
        Step size applied to each batch gradient.
    iterations : int, optional
        Maximum number of epochs.
    batch_size : int, optional
        Number of samples per batch; the last batch may be smaller.

    Returns
    -------
    W : numpy.ndarray, shape (m, 1)
        Weights after the last epoch performed.
    loss_history : numpy.ndarray, shape (iterations,)
        Per-epoch sum of the batch losses; entries for epochs skipped by
        early stopping remain 0.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly N values.
    """
    X = np.asarray(X)
    N, m = np.shape(X)
    y = np.reshape(y, (N, 1))

    W = np.random.randn(m, 1)
    loss_history = np.zeros(iterations)

    for i in range(iterations):
        W_epoch_start = W
        epoch_loss = 0.0

        # One epoch: a gradient step per batch, applied immediately so the
        # next batch is evaluated with the updated weights.
        for x_batch, y_batch in get_batchs(X, y, batch_size):
            y_hat = hypothesis(x_batch, W)
            epoch_loss += loss(y_batch, y_hat)
            W = W - learning_rate * np.dot(x_batch.T, (y_hat - y_batch))

        loss_history[i] = epoch_loss

        # Stop once a whole epoch barely moves the weights.
        if np.linalg.norm(W - W_epoch_start) < 1e-3:
            break
    return W, loss_history

## Bài tập

In [203]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [204]:
# Load the training split (path is relative to the working directory) and
# preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column -- presumably a row
# index that was written out with the CSV; confirm before treating it as a feature.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,84,0.0,0.0,0.0,0.0,0.304,21,0
1,9,112,82.0,24.0,0.0,28.2,1.282,50,1
2,1,139,46.0,19.0,83.0,28.7,0.654,22,0
3,0,161,50.0,0.0,0.0,21.9,0.254,65,0
4,6,134,80.0,37.0,370.0,46.2,0.238,46,1


In [205]:
# Load the held-out test split (path is relative to the working directory) and
# preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column -- presumably a row
# index that was written out with the CSV; confirm before treating it as a feature.
test_df = pd.read_csv(filepath_or_buffer='test.csv')
test_df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58.0,33.0,190.0,34.0,0.43,43,0
1,2,112,75.0,32.0,0.0,35.7,0.148,21,0
2,2,108,64.0,0.0,0.0,30.8,0.158,21,0
3,8,107,80.0,0.0,0.0,24.6,0.856,34,0
4,7,136,90.0,0.0,0.0,29.9,0.21,50,0


In [206]:
def missing_value(X, y):
    """Fill NaN entries in selected feature columns of ``X`` (in place).

    Columns are addressed by position; each one gets its own ``SimpleImputer``
    that is fitted on the column as passed in, so every array handed to this
    function is imputed from its own statistics.

    Parameters
    ----------
    X : numpy.ndarray, shape (N, >= 7)
        Feature matrix. It is modified in place and also returned.
        NOTE(review): the column labels below assume the layout starts at
        Pregnancies (index 0) with no leading index column -- confirm against
        how ``X`` is built.
    y : any
        Labels; returned untouched.

    Returns
    -------
    (X, y)
        The imputed feature matrix and the unchanged labels.
    """
    # (column index, imputation strategy)
    column_strategies = (
        (2, 'mean'),            # BloodPressure
        (3, 'most_frequent'),   # SkinThickness
        (4, 'most_frequent'),   # Insulin
        (5, 'mean'),            # BMI
        (6, 'most_frequent'),   # DiabetesPedigreeFunction
    )

    for column, strategy in column_strategies:
        # SimpleImputer's parameters are keyword-only in current scikit-learn,
        # so missing_values must be passed by name.
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        # The imputer expects 2-D input; X[:, [column]] keeps the column axis.
        X[:, column] = imputer.fit_transform(X[:, [column]]).ravel()

    return X, y

In [207]:
# The CSVs carry a leftover row-index column ('Unnamed: 0'). Keeping it would
# feed the row number to the model as a feature and shift every positional
# column index by one, so the positions used in missing_value
# (2 = BloodPressure ... 6 = DiabetesPedigreeFunction) would hit the wrong
# columns. Drop it before slicing; errors='ignore' keeps this cell working if
# the column is absent.
train_data = train_df.drop(columns='Unnamed: 0', errors='ignore')
test_data = test_df.drop(columns='Unnamed: 0', errors='ignore')

# Last column is the target (Outcome); everything before it is a feature.
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

In [208]:
# Impute missing feature values in each split; the labels are passed through
# and returned unchanged by missing_value.
X_train, y_train = missing_value(X=X_train, y=y_train)
X_test, y_test = missing_value(X=X_test, y=y_test)

In [209]:
# Train the scikit-learn baseline; fit() returns the estimator itself, so the
# fitted model can be bound in one step and then displayed.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [210]:
# Score the fitted classifier on the held-out test split.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy_score:", test_accuracy)

Accuracy_score: 0.7467532467532467
