In [5]:
import numpy as np
import pandas as pd
import sys
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, fbeta_score

In [6]:
def read_data(path='census.csv'):
    """Load the census CSV and separate the target column from the features.

    Parameters
    ----------
    path : str or path-like, default 'census.csv'
        Location of the CSV file. It must contain an ``income`` column.

    Returns
    -------
    data : pandas.DataFrame
        The full table exactly as read from disk.
    features_raw : pandas.DataFrame
        A new frame holding every column of ``data`` except ``income``.
    income_raw : pandas.Series
        The ``income`` column of ``data``.

    Raises
    ------
    KeyError
        If the file has no ``income`` column.
    """
    data = pd.read_csv(path)

    income_raw = data['income']
    # drop() returns a new frame, so ``data`` itself keeps the income column.
    features_raw = data.drop(columns='income')

    return data, features_raw, income_raw

In [7]:
def pre_data(features_raw, data):
    """Log-transform the skewed columns, then min-max scale the numeric columns.

    The previous version fitted the scaler on the untransformed
    ``data[numerical]``, which overwrote the log-transformed values it had
    just stored for the skewed columns. The log step now feeds into the
    scaling step, so both transformations are applied.

    Parameters
    ----------
    features_raw : pandas.DataFrame
        Feature table to update. It is modified in place and also returned.
    data : pandas.DataFrame
        Source table the numeric values are read from. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``features_raw`` with the five numeric columns replaced by their
        transformed values (MinMaxScaler's default output range is [0, 1]).
    """
    skewed = ['capital-gain', 'capital-loss']
    numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

    # astype() returns a copy, so ``data`` is left untouched.
    numeric = data[numerical].astype(float)
    # log(1 + x) compresses the long right tail while keeping zeros at zero.
    numeric[skewed] = np.log1p(numeric[skewed])

    # Scale the already log-transformed values, not the raw ones.
    features_raw[numerical] = MinMaxScaler().fit_transform(numeric)

    return features_raw

In [8]:
def one_hot_encoding(features_raw, income_raw):
    """One-hot encode the categorical features and binarise the income labels.

    Parameters
    ----------
    features_raw : pandas.DataFrame
        Feature table; its categorical columns are expanded by
        ``pandas.get_dummies``.
    income_raw : pandas.Series
        Labels, expected to be the strings ``'>50K'`` and ``'<=50K'``.

    Returns
    -------
    features : pandas.DataFrame
        Encoded features. Dummy columns are float 0.0/1.0.
    income : pandas.Series
        ``1`` for ``'>50K'`` and ``0`` for ``'<=50K'``. Any other value is
        passed through unchanged.
    """
    # Request float dummies explicitly: recent pandas versions default to bool,
    # and a frame mixing bool and float columns converts to an object array.
    features = pd.get_dummies(features_raw, dtype=float)

    # An explicit lookup avoids Series.replace, whose implicit object -> int
    # downcast is deprecated in recent pandas releases.
    label_to_int = {'>50K': 1, '<=50K': 0}
    income = income_raw.map(lambda label: label_to_int.get(label, label))
    # Make sure an all-integer result is stored with a numeric dtype.
    income = income.infer_objects()

    return features, income

In [9]:
def split_dataset(features, income, test_size=0.2, random_state=0):
    """Split features and labels into training and test partitions.

    Parameters
    ----------
    features : array-like
        Feature rows.
    income : array-like
        Labels aligned with ``features``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 0
        Forwarded to ``train_test_split``. A fixed value makes the split
        reproducible.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y)``.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        features, income, test_size=test_size, random_state=random_state
    )

    return train_x, test_x, train_y, test_y

In [10]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Parameters
    ----------
    x : scalar, numpy.ndarray or numpy.matrix
        Input value(s).

    Returns
    -------
    Same kind of container as ``x``, holding floats in [0, 1].
    """
    # logaddexp(0, -x) is log(1 + exp(-x)) evaluated without forming exp(-x)
    # on its own, so large negative inputs no longer overflow to inf and
    # raise a RuntimeWarning.
    return np.exp(-np.logaddexp(0, -x))

In [17]:
def create_model(x, y, alpha=0.01, max_cycles=1000):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    x : array-like of shape (m, n)
        Feature rows. Values are converted to float.
    y : array-like of m labels
        Binary targets (0/1), one per row of ``x``.
    alpha : float, default 0.01
        Step size applied to the gradient on every cycle.
    max_cycles : int, default 1000
        Number of full-batch updates to perform.

    Returns
    -------
    numpy.matrix of shape (n, 1)
        The learned weights. There is no intercept term.

    Raises
    ------
    ValueError
        If ``y`` does not have one label per row of ``x``.
    """
    # np.mat is no longer available in NumPy 2.x, so the arithmetic is done on
    # plain float ndarrays with the @ operator. Forcing dtype=float also copes
    # with frames that would otherwise convert to object arrays.
    design = np.atleast_2d(np.asarray(x, dtype=float))
    targets = np.asarray(y, dtype=float).reshape(-1, 1)

    m, n = design.shape
    if targets.shape[0] != m:
        raise ValueError(
            'x has %d rows but y has %d labels' % (m, targets.shape[0])
        )

    weights = np.ones((n, 1))

    # NOTE(review): the gradient is summed over all rows, not averaged, so the
    # effective step size grows with the number of samples. Confirm that
    # ``alpha`` is tuned for the size of the training set.
    for _ in range(max_cycles):
        h = sigmoid(design @ weights)
        weights = weights - alpha * (design.T @ (h - targets))

    # Callers have always received a matrix; keep that return type.
    return np.asmatrix(weights)

In [12]:
def _predict_labels(weights, features):
    """Return a 1-D float array of hard 0.0/1.0 predictions, one per feature row."""
    coefficients = np.asarray(weights, dtype=float).reshape(-1, 1)
    scores = np.atleast_2d(np.asarray(features, dtype=float)) @ coefficients
    probabilities = np.asarray(sigmoid(scores)).ravel()
    # Strictly greater than 0.5 counts as the positive class.
    return (probabilities > 0.5).astype(float)


def get_results(weights, train_x, test_x, train_y, test_y):
    """Score the fitted weights on the training and test partitions.

    Parameters
    ----------
    weights : array-like of shape (n, 1)
        Logistic-regression weights, e.g. the value returned by
        ``create_model``.
    train_x, test_x : array-like of shape (rows, n)
        Feature rows of each partition.
    train_y, test_y : array-like
        True 0/1 labels of each partition.

    Returns
    -------
    dict
        Keys ``'train_accuracy'``, ``'test_accuracy'``, ``'train_fbeta'`` and
        ``'test_fbeta'``. The F-beta scores use ``beta=0.5``.
    """
    # np.mat is no longer available in NumPy 2.x; the helper works on plain
    # ndarrays and thresholds all rows at once instead of looping in Python.
    train_pred = _predict_labels(weights, train_x)
    test_pred = _predict_labels(weights, test_x)

    results = {}
    results['train_accuracy'] = accuracy_score(train_y, train_pred)
    results['test_accuracy'] = accuracy_score(test_y, test_pred)
    results['train_fbeta'] = fbeta_score(train_y, train_pred, beta=0.5)
    results['test_fbeta'] = fbeta_score(test_y, test_pred, beta=0.5)

    return results

In [13]:
def main():
    """Run the pipeline end to end and print the evaluation metrics.

    Steps, in order: load the CSV, transform the numeric columns, one-hot
    encode, split into train/test partitions, fit the weights, and print the
    metrics dictionary produced by ``get_results``.
    """
    raw_table, raw_features, raw_labels = read_data()
    scaled_features = pre_data(raw_features, raw_table)
    encoded_features, encoded_labels = one_hot_encoding(scaled_features, raw_labels)

    partitions = split_dataset(encoded_features, encoded_labels)
    x_train, x_test, y_train, y_test = partitions

    fitted_weights = create_model(x_train, y_train)

    print(get_results(fitted_weights, x_train, x_test, y_train, y_test))

In [18]:
# Run the whole pipeline; main() prints the metrics dictionary built by get_results.
# NOTE(review): the saved output under this cell (a printed array followed by
# SystemExit) does not match what main() prints as written here. It looks
# stale; re-run from a fresh kernel to confirm.
main()

[[0.99992597]
 [0.99991473]
 [0.99988593]
 ...
 [0.99995046]
 [0.99992253]
 [0.99989341]]


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
