In [133]:
import pandas as pd
import numpy as np

# Load the training and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [134]:
# Peek at the first record to check which columns were loaded.
print(train_df.iloc[:1])

   PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp  \
0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1   

   Parch     Ticket  Fare Cabin Embarked  
0      0  A/5 21171  7.25   NaN        S  


## Data Preprocessing

In [135]:
def _fill_from_group_median(frame, column, group_median, fallback):
    """Fill NaNs in ``frame[column]`` in place from a (Sex, Pclass) median table, else ``fallback``."""
    missing = frame[column].isna()
    if not missing.any():
        return
    keys = zip(frame.loc[missing, 'Sex'], frame.loc[missing, 'Pclass'])
    # .get falls back to NaN for (Sex, Pclass) combinations unseen in training.
    estimates = [group_median.get(key, np.nan) for key in keys]
    # A group whose values are all missing has a NaN median, so fall back again.
    frame.loc[missing, column] = [fallback if pd.isna(value) else value for value in estimates]


def preprocess_data(train=None, test=None):
    """Build the model feature matrices from the raw train/test frames.

    The inputs are copied first, so the raw frames are never modified and the
    function can be re-run safely (re-running on already-processed frames would
    otherwise zero out ``Age_Missing``/``Sex`` and standardize twice).

    Steps (all statistics are fitted on the training frame only):
      1. Add ``Age_Missing`` (1 where Age was NaN), then fill missing Age with the
         training median of the passenger's (Sex, Pclass) group, falling back to
         the overall training median.
      2. Fill missing Fare the same way.
      3. Add ``Family_Size`` = Parch + SibSp + 1.
      4. Reduce Cabin to its first letter; missing Cabin/Embarked become 'U'.
      5. Encode Sex as 1 for 'female', 0 otherwise.
      6. One-hot encode Cabin and Embarked; test columns are aligned to the
         training columns (extra test categories dropped, absent ones set to 0).
      7. Standardize Age, Fare and Family_Size with training mean/std (ddof=0).

    Parameters
    ----------
    train, test : pandas.DataFrame, optional
        Raw frames containing Sex, Pclass, Age, SibSp, Parch, Fare, Cabin and
        Embarked. Default to the module-level ``train_df`` / ``test_df``.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        Feature frames with identical columns: Sex, Age, Age_Missing,
        Family_Size, Fare, followed by the Cabin_* and Embarked_* dummies.
    """
    train = (train_df if train is None else train).copy()
    test = (test_df if test is None else test).copy()

    # The missing-age flag has to be recorded before Age is imputed.
    for frame in (train, test):
        frame['Age_Missing'] = frame['Age'].isna().astype(int)

    # Impute Age and Fare. Medians are taken from the training frame before it is
    # filled, while Sex still holds the original string labels.
    for column in ('Age', 'Fare'):
        fallback = train[column].median()
        group_median = train.groupby(['Sex', 'Pclass'])[column].median()
        for frame in (train, test):
            _fill_from_group_median(frame, column, group_median, fallback)

    for frame in (train, test):
        frame['Family_Size'] = frame['Parch'] + frame['SibSp'] + 1
        # Keep only the first letter of Cabin; 'U' marks an unknown value.
        frame['Cabin'] = frame['Cabin'].str[0].fillna('U')
        frame['Embarked'] = frame['Embarked'].fillna('U')
        # female = 1, male = 0
        frame['Sex'] = (frame['Sex'] == 'female').astype(int)

    # One-hot encode the remaining categorical columns, aligning test to train.
    cat_cols = ['Cabin', 'Embarked']
    x_train_cat = pd.get_dummies(train[cat_cols], prefix=cat_cols, dummy_na=False)
    x_test_cat = pd.get_dummies(test[cat_cols], prefix=cat_cols, dummy_na=False)
    x_test_cat = x_test_cat.reindex(columns=x_train_cat.columns, fill_value=0)

    # Standardize with training statistics; a zero std is replaced by 1 so a
    # constant column does not cause a division by zero.
    cols_to_standardize = ['Age', 'Fare', 'Family_Size']
    mu = train[cols_to_standardize].mean()
    std = train[cols_to_standardize].std(ddof=0)
    std = std.replace(0, 1.0)
    train[cols_to_standardize] = (train[cols_to_standardize] - mu) / std
    test[cols_to_standardize] = (test[cols_to_standardize] - mu) / std

    # Numerical columns used by the logistic regression.
    num_cols = ['Sex', 'Age', 'Age_Missing', 'Family_Size', 'Fare']
    x_train_num = train[num_cols].astype(float)
    x_test_num = test[num_cols].astype(float)

    x_train = pd.concat([x_train_num, x_train_cat], axis=1)
    x_test = pd.concat([x_test_num, x_test_cat], axis=1)

    return x_train, x_test
    

## Performing data preprocessing and converting to NumPy

In [136]:
x_train_df, x_test_df = preprocess_data()

# Feature frames and the Survived labels as float64 NumPy arrays for the model code.
x_train, x_test = (
    features.to_numpy(dtype=np.float64, copy=False)
    for features in (x_train_df, x_test_df)
)
y_train = train_df['Survived'].to_numpy(dtype=np.float64, copy=False)


## Logistic Regression From Scratch

In [127]:
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); lists and integer arrays are converted to float.

    Returns
    -------
    NumPy float scalar for scalar input, otherwise an ndarray of the same shape as ``z``.

    Only exp(-|z|) is evaluated, which lies in (0, 1], so large-magnitude
    inputs cannot overflow (the direct formula overflows in exp for very
    negative z and emits a RuntimeWarning).
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same value, no overflow.
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as they are.
    return out[()]

def cost_function(X, y, w, b):
    """Mean logistic (cross-entropy) loss of the linear model ``X @ w + b`` against labels ``y``.

    Evaluated from the logits as log(1 + e^z) - y*z, using ``np.logaddexp``
    for the first term.
    """
    logits = np.matmul(X, w) + b
    per_sample_loss = np.logaddexp(0, logits) - y * logits
    return per_sample_loss.mean()

def compute_gradient(X, y, w, b):
    """Gradient of the mean logistic loss with respect to ``w`` and ``b``.

    Returns ``(d_j_w, d_j_b)``: X^T (sigmoid(Xw + b) - y) / m and the mean
    residual, where m is the number of rows of ``X``.
    """
    n_samples, _ = X.shape
    residual = sigmoid(X @ w + b) - y
    grad_w = (X.T @ residual) / n_samples
    grad_b = residual.mean()
    return grad_w, grad_b


def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters):
    """Run batch gradient descent and return the fitted parameters.

    Parameters
    ----------
    X, y : passed through unchanged to ``cost_function`` and ``gradient_function``.
    w_in : array-like
        Initial weights; copied, so the caller's array is not modified.
    b_in : scalar
        Initial bias.
    cost_function : callable
        ``cost_function(X, y, w, b)`` returning a scalar cost.
    gradient_function : callable
        ``gradient_function(X, y, w, b)`` returning ``(d_j_w, d_j_b)``.
    alpha : float
        Learning rate (step size).
    num_iters : int
        Number of update steps; zero or a negative value performs no updates.

    Returns
    -------
    (w, b, cost_history) : tuple
        Final weights (ndarray), final bias (float) and a list holding the
        cost after each update.
    """
    w = np.array(w_in, dtype=float)  # np.array copies, protecting the caller's w_in
    b = float(b_in)
    cost_history = []
    for _ in range(num_iters):
        d_j_w, d_j_b = gradient_function(X, y, w, b)
        # Rebind instead of updating in place so w_in is never aliased.
        w = w - alpha * d_j_w
        b = b - alpha * d_j_b
        cost_history.append(float(cost_function(X, y, w, b)))
    return w, b, cost_history


""" TODOs: """
def cost_function_with_regularization():
    """Placeholder: not implemented yet; takes no arguments and returns None.

    NOTE(review): presumably intended as a regularized variant of the cost;
    the signature and penalty term are still to be decided.
    """
    pass

def gradient_descent_with_regularization():
    """Placeholder: not implemented yet; takes no arguments and returns None.

    NOTE(review): presumably intended as gradient descent with a
    regularization term; the signature is still to be decided.
    """
    pass


In [124]:
# Show the shapes of the feature matrix and the label vector, one per line.
print(x_train.shape, y_train.shape, sep='\n')

(891, 18)
(891,)


In [110]:
# Initialize parameters: zero weights and bias, plus the gradient-descent step size.
m, n = x_train.shape
w = np.zeros(n)
b = 0
alpha = 0.001

# Model: f_wb(x) = sigmoid(w . x + b)

# Cost of the untrained model on the training set
# (every logit is 0 when w and b are zero, so this equals ln 2).
cost = cost_function(x_train, y_train, w, b)

# TODO: update w and b using gradient descent.

# TODO: compute train accuracy.

AttributeError: 'list' object has no attribute 'x'

## SKLearn baseline + tuning