In [104]:
import pandas as pd
import numpy as np

# Load the train and test splits from the working directory.
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [105]:
# Show the first row to get a feel for the available columns.
print(train_df.iloc[:1])

   PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp  \
0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1   

   Parch     Ticket  Fare Cabin Embarked  
0      0  A/5 21171  7.25   NaN        S  


In [106]:
def preprocess_data(train=None, test=None):
    """Build all-float feature frames for the train and test passengers.

    Every statistic used for imputation or scaling (medians, means, standard
    deviations) is computed on the training frame only and then applied to
    both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame, optional
        Frames providing the columns 'Age', 'Sex', 'Pclass', 'SibSp', 'Parch',
        'Fare', 'Cabin' and 'Embarked'. When omitted, the notebook-level
        ``train_df`` / ``test_df`` are used. The inputs are never modified:
        the function works on copies, so calling it repeatedly yields the
        same result.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        Float frames with identical columns: 'Sex' (female=1, male=0), 'Age',
        'Age_Missing', 'Family_Size', 'Fare', followed by one-hot columns for
        'Cabin' (first letter, 'U' when missing) and 'Embarked' ('U' when
        missing). The one-hot columns are those seen in the training frame.
    """
    # Copy first: mutating the shared frames would make a second call see
    # already-encoded 'Sex' and already-filled 'Age' and silently produce
    # all-zero 'Sex' / 'Age_Missing' features.
    train = (train_df if train is None else train).copy()
    test = (test_df if test is None else test).copy()

    def impute_by_group(frame, col, group_median, fallback):
        """Fill NaNs in frame[col] with the (Sex, Pclass) median, else fallback."""
        keys = zip(frame['Sex'], frame['Pclass'])
        estimate = pd.Series(
            [group_median.get(key, np.nan) for key in keys],
            index=frame.index,
            dtype=float,
        )
        return frame[col].fillna(estimate).fillna(fallback)

    # Age: flag missing values, then fill them from train-set medians.
    # The medians are taken before any filling and while 'Sex' is still a
    # string column.
    train['Age_Missing'] = train['Age'].isna().astype(int)
    test['Age_Missing'] = test['Age'].isna().astype(int)
    global_age = train['Age'].median()
    group_age = train.groupby(['Sex', 'Pclass'])['Age'].median().to_dict()
    train['Age'] = impute_by_group(train, 'Age', group_age, global_age)
    test['Age'] = impute_by_group(test, 'Age', group_age, global_age)

    # Family size: siblings/spouses + parents/children + the passenger.
    train['Family_Size'] = train['Parch'] + train['SibSp'] + 1
    test['Family_Size'] = test['Parch'] + test['SibSp'] + 1

    # Fare: same group-median strategy as for Age.
    global_fare = train['Fare'].median()
    group_fare = train.groupby(['Sex', 'Pclass'])['Fare'].median().to_dict()
    train['Fare'] = impute_by_group(train, 'Fare', group_fare, global_fare)
    test['Fare'] = impute_by_group(test, 'Fare', group_fare, global_fare)

    # Cabin: keep only the first letter, 'U' when missing.
    train['Cabin'] = train['Cabin'].str[0].fillna('U')
    test['Cabin'] = test['Cabin'].str[0].fillna('U')

    # Embarked: 'U' when missing.
    train['Embarked'] = train['Embarked'].fillna('U')
    test['Embarked'] = test['Embarked'].fillna('U')

    # Sex: female=1, male=0.
    train['Sex'] = (train['Sex'] == 'female').astype(int)
    test['Sex'] = (test['Sex'] == 'female').astype(int)

    # One-hot encode the remaining categorical columns; align the test
    # columns to the ones seen in training (unseen categories are dropped,
    # absent ones become all-zero columns).
    cat_cols = ['Cabin', 'Embarked']
    x_train_cat = pd.get_dummies(train[cat_cols], prefix=cat_cols, dummy_na=False)
    x_test_cat = pd.get_dummies(test[cat_cols], prefix=cat_cols, dummy_na=False)
    x_test_cat = x_test_cat.reindex(columns=x_train_cat.columns, fill_value=0)

    # Standardize with train-set statistics; a zero std is replaced by 1 to
    # avoid division by zero on constant columns.
    cols_to_standardize = ['Age', 'Fare', 'Family_Size']
    mu = train[cols_to_standardize].mean()
    std = train[cols_to_standardize].std(ddof=0).replace(0, 1.0)
    train[cols_to_standardize] = (train[cols_to_standardize] - mu) / std
    test[cols_to_standardize] = (test[cols_to_standardize] - mu) / std

    # Assemble the final matrices. The cast to float keeps the one-hot
    # columns numeric regardless of pandas version (newer releases emit
    # bool dummies, which would turn `.values` into an object array).
    num_cols = ['Sex', 'Age', 'Age_Missing', 'Family_Size', 'Fare']
    x_train = pd.concat([train[num_cols], x_train_cat], axis=1).astype(float)
    x_test = pd.concat([test[num_cols], x_test_cat], axis=1).astype(float)

    return x_train, x_test
    

In [107]:
# Build the train/test feature frames with the preprocessing function defined above.
x_train, x_test = preprocess_data()

## Logistic Regression From Scratch

In [108]:
# Inspect the first ten rows of the engineered training features.
print(x_train.iloc[:10])

   Sex       Age  Age_Missing  Family_Size      Fare  Cabin_A  Cabin_B  \
0  0.0 -0.534891          0.0     0.059160 -0.502445        0        0   
1  1.0  0.668392          0.0     0.059160  0.786845        0        0   
2  1.0 -0.234070          0.0    -0.560975 -0.488854        0        0   
3  1.0  0.442776          0.0     0.059160  0.420730        0        0   
4  0.0  0.442776          0.0    -0.560975 -0.486337        0        0   
5  0.0 -0.309276          1.0    -0.560975 -0.478116        0        0   
6  0.0  1.871675          0.0    -0.560975  0.395814        0        0   
7  0.0 -2.038995          0.0     1.919564 -0.224083        0        0   
8  1.0 -0.158865          0.0     0.679295 -0.424256        0        0   
9  1.0 -1.136533          0.0     0.059160 -0.042956        0        0   

   Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  Embarked_C  \
0        0        0        0        0        0        0        1           0   
1        1        0      

In [None]:
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); lists and tuples are accepted as well as arrays.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1] with the same shape as ``z``.
    """
    # Convert up front: with a plain list, `-1 * z` would be an empty list.
    z = np.asarray(z, dtype=float)
    # log(1 + exp(-z)) is evaluated via logaddexp so that large negative
    # inputs do not overflow exp(); sigmoid(z) = exp(-log(1 + exp(-z))).
    return np.exp(-np.logaddexp(0.0, -z))

def cost_function():
    """Placeholder, not implemented yet: takes no arguments and returns None.

    Presumably intended to hold the logistic-regression loss -- TODO confirm.
    """
    return None

def gradient_descent():
    """Placeholder, not implemented yet: takes no arguments and returns None.

    Presumably intended to hold the parameter-update loop -- TODO confirm.
    """
    return None

def logistic_regression():
    """Placeholder, not implemented yet: takes no arguments and returns None.

    Presumably intended as the training entry point -- TODO confirm.
    """
    return None

In [83]:
# Quick sanity check of sigmoid on the integers 1..3.
arr = np.arange(1, 4)

print(sigmoid(arr))

[0.73105858 0.88079708 0.95257413]


## SKLearn baseline + tuning