# Logistic Regression

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country', 'income']

data



Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [11]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [12]:
# Age is going to be the attribute and Income the target
data1 = data[['age', 'income']]
data2 = data[['age', 'hours-per-week', 'income']]

In [13]:
# 'Income is categorical. We convert it to numerical_ 1 for >50k, 0for <=50k
data1['income'] = np.where(data['income'] == ' >50K', 1, 0)
data2['income'] = np.where(data['income'] == ' >50K', 1, 0)

# Split the dataset into training and testing sets
train_data1, test_data1 = train_test_split(data1, test_size=0.2, random_state=0)
train_data2, test_data2 = train_test_split(data2, test_size=0.2, random_state=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['income'] = np.where(data['income'] == ' >50K', 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['income'] = np.where(data['income'] == ' >50K', 1, 0)


In [14]:
def sigmoid(z):
    """Compute the sigmoid function."""
    return 1 / (1 + np.exp(-z))

def logistic_regression(train_data,num_iterations, learning_rate):
    
    # Initialize the parameters
    m, n = train_data.shape
    w = np.zeros(n - 1)
    b = 0
    
    # Training
    for i in range(num_iterations):
        # Logits
        z = np.dot(train_data.iloc[:, :-1], w) + b
        
        # Predicted Probabilities
        y_pred = sigmoid(z)
        
        # Compute the cost function
        
        J = -(1 / m) * np.sum(train_data.iloc[:, -1] * np.log(y_pred) + (1 - train_data.iloc[:, -1]) * np.log(1 - y_pred))
        
        # Optimizing MLE
        dw = (1 / m) * np.dot(train_data.iloc[:, :-1].T, (y_pred - train_data.iloc[:, -1]))
        db = (1 / m) * np.sum(y_pred - train_data.iloc[:, -1])
        
        # Update the parameters
        w = w - learning_rate * dw
        b = b - learning_rate * db
        
    return w,b

In [15]:
def predict(X, w, b):
    """
    Predict class labels for input data X using trained parameters w and b.
    """
    y_pred = sigmoid(np.dot(X, w) + b)
    y_pred[y_pred >= 0.5] = 1
    y_pred[y_pred < 0.5] = 0
    
    return y_pred.astype(int)



In [16]:
w,b=logistic_regression(train_data2, 1000, 0.1)
print(w,b)

  J = -(1 / m) * np.sum(train_data.iloc[:, -1] * np.log(y_pred) + (1 - train_data.iloc[:, -1]) * np.log(1 - y_pred))


[ 0.43589981 -0.17239898] -3.182462812548914


In [17]:
# Predict on test data
X_test = test_data2.iloc[:, :-1].values
y_test = test_data2.iloc[:, -1].values
y_pred = predict(X_test, w, b)
    
# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print(accuracy)

0.34285275602640874


With two attributes