In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import math

In [None]:
#!pip install seaborn

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print(df_train.shape)

df_train_top = df_train.columns
print(df_train_top)

# how many missing values in each column
df_train.isna().sum()


(891, 12)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
df_train.head()

In [4]:
# calculate the mean age for each sex and store the values in a variable
df_mean = df_train[['Age', 'Sex']].groupby('Sex').mean()
mean_male = df_mean.iloc[1][0]
mean_female = df_mean.iloc[0][0]
# replace the NA values with its respective mean values
female_fill =  df_train[df_train['Sex'] == 'female']['Age'].fillna(mean_female)
male_fill = df_train[df_train['Sex'] == 'male']['Age'].fillna(mean_male)
# join the Series along the index number and replace the NA containing Age column of df_train
Age_fill = pd.concat([female_fill, male_fill], axis=0, join='outer')
df_train['Age'] = Age_fill

In [5]:
# calculate the mean age for each sex and store the values in a variable
df_mean_test = df_test[['Age', 'Sex']].groupby('Sex').mean()
mean_male_test = df_mean_test.iloc[1][0]
mean_female_test = df_mean_test.iloc[0][0]
# replace the NA values with its respective mean values
female_fill_test =  df_test[df_test['Sex'] == 'female']['Age'].fillna(mean_female)
male_fill_test = df_test[df_test['Sex'] == 'male']['Age'].fillna(mean_male)
# join the Series along the index number and replace the NA containing Age column of df_train
Age_fill_test = pd.concat([female_fill_test, male_fill_test], axis=0, join='outer')
df_test['Age'] = Age_fill_test

In [6]:
# calculate the mean age for each sex and store the values in a variable
df_mean_test_fare = df_test[['Fare', 'Pclass']].groupby('Pclass').mean()
first_c = df_mean_test_fare.iloc[0][0]
second_c = df_mean_test_fare.iloc[1][0]
third_c = df_mean_test_fare.iloc[2][0]
first_c_fill = df_test[df_test['Pclass'] == 1]['Fare'].fillna(first_c)
second_c_fill = df_test[df_test['Pclass'] == 2]['Fare'].fillna(second_c)
third_c_fill = df_test[df_test['Pclass'] == 3]['Fare'].fillna(third_c)
Fare_fill_test = pd.concat([first_c_fill, second_c_fill, third_c_fill], axis=0, join='outer')
df_test['Fare'] = Fare_fill_test

In [7]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
df_train['Sex'] = df_train['Sex'].replace(['male', 'female'], [0, 1])
df_train['Embarked'] = df_train['Embarked'].replace(['C', 'S', 'Q'], [0, 1, 2])
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked       float64
dtype: object

In [9]:
df_test['Sex'] = df_test['Sex'].replace(['male', 'female'], [0, 1])
df_test['Embarked'] = df_test['Embarked'].replace(['C', 'S', 'Q'], [0, 1, 2])
df_test.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked         int64
dtype: object

In [10]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1.0


In [11]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,1


In [12]:
def normalization_z(x):
    """
    Compute the z-score of data

    Args:
        x (vector or Data.Series any size)
    Returns:
        z_score (vector or Data.Series any size)
         
    """
    m = len(x)
    z_score = []
    for i in range(m):
        z_score.append((x[i] - np.mean(x)) / np.std(x))
    return z_score

In [13]:
df_train['Age'] = normalization_z(df_train['Age'])
df_train['Fare'] = normalization_z(df_train['Fare'])

In [14]:
df_test['Age'] = normalization_z(df_test['Age'])
df_test['Fare'] = normalization_z(df_test['Fare'])

In [15]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,-0.594732,1,0,A/5 21171,-0.502445,,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0.635319,1,0,PC 17599,0.786845,C85,0.0
2,3,1,3,"Heikkinen, Miss. Laina",1,-0.28722,0,0,STON/O2. 3101282,-0.488854,,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0.404684,1,0,113803,0.42073,C123,1.0
4,5,0,3,"Allen, Mr. William Henry",0,0.404684,0,0,373450,-0.486337,,1.0


In [16]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,0.340531,0,0,330911,-0.497311,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,1.329961,1,0,363272,-0.512175,,1
2,894,2,"Myles, Mr. Thomas Francis",0,2.517277,0,0,240276,-0.463999,,2
3,895,3,"Wirz, Mr. Albert",0,-0.253127,0,0,315154,-0.482373,,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,-0.648899,1,1,3101298,-0.417392,,1


In [None]:
sns.scatterplot(data=df_train, x = 'Pclass', y = 'Age', style='Survived')

In [None]:
sns.scatterplot(data=df_train, x = 'SibSp', y = 'Parch', style='Survived')

In [None]:
sns.scatterplot(data=df_train, x = 'Pclass', y = 'Fare', style='Survived')

In [None]:
g = sns.FacetGrid(df_train, col='Survived', row='Pclass')
g.map(plt.hist, 'Age', bins = 20)

In [17]:
X_train = df_train.drop(['Cabin', 'Survived', 'Name', 'Ticket', 'PassengerId'], axis=1).to_numpy()
y_train = df_train['Survived'].to_numpy()

In [18]:
X_test = df_test.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1).to_numpy()

In [19]:
print ('The shape of X_train is: ' + str(X_train.shape))
print ('The shape of y_train is: ' + str(y_train.shape))
print ('We have m = %d training examples' % (len(y_train)))

The shape of X_train is: (891, 7)
The shape of y_train is: (891,)
We have m = 891 training examples


In [20]:
print ('The shape of X_test is: ' + str(X_test.shape))

The shape of X_test is: (418, 7)


In [21]:
X_train = np.nan_to_num(X_train, nan=0)
X_test = np.nan_to_num(X_test, nan=0)

In [22]:
def sigmoid(z):
    """
    Compute the sigmoid of z

    Args:
        z (ndarray): A scalar, numpy array of any size.

    Returns:
        g (ndarray): sigmoid(z), with the same shape as z
         
    """
          
    ### START CODE HERE ### 
    g = 1/(1+np.exp(-z))
    ### END SOLUTION ###  
    
    return g

In [23]:
# UNQ_C2
# GRADED FUNCTION: compute_cost
def compute_cost(X, y, w, b, lambda_= 1):
    """
    Computes the cost over all examples
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (array_like Shape (m,)) target value 
      w : (array_like Shape (n,)) Values of parameters of the model      
      b : scalar Values of bias parameter of the model
      lambda_: unused placeholder
    Returns:
      total_cost: (scalar)         cost 
    """

    m, n = X.shape
    
    ### START CODE HERE ###
    cost = 0
    cost_temp = 0
    for i in range(m):
       z = np.dot(w, X[i]) + b
       cost_temp = (-y[i] * np.log(sigmoid(z))) - ((1 - y[i]) * np.log(1 - sigmoid(z)))
       cost = cost + cost_temp

    total_cost = cost / m
    
    ### END CODE HERE ### 

    return total_cost

In [24]:
m, n = X_train.shape

# Compute and display cost with w initialized to zeroes
initial_w = np.zeros(n)
initial_b = 0.
cost = compute_cost(X_train, y_train, initial_w, initial_b)
print('Cost at initial w (zeros): {:.3f}'.format(cost))

Cost at initial w (zeros): 0.693


In [25]:
# UNQ_C3
# GRADED FUNCTION: compute_gradient
def compute_gradient(X, y, w, b, lambda_=None): 
    """
    Computes the gradient for logistic regression 
 
    Args:
      X : (ndarray Shape (m,n)) variable such as house size 
      y : (array_like Shape (m,1)) actual value 
      w : (array_like Shape (n,1)) values of parameters of the model      
      b : (scalar)                 value of parameter of the model 
      lambda_: unused placeholder.
    Returns
      dj_dw: (array_like Shape (n,1)) The gradient of the cost w.r.t. the parameters w. 
      dj_db: (scalar)                The gradient of the cost w.r.t. the parameter b. 
    """
    m, n = X.shape
    dj_dw = np.zeros(w.shape)
    dj_db = 0.

    ### START CODE HERE ### 
    for i in range(m):
        z = np.dot(w, X[i]) + b
        dj_db_temp = sigmoid(z) - y[i]
        dj_db = dj_db + dj_db_temp

        for j in range(n):
            dj_dw[j] = dj_dw[j] + (dj_db_temp * X[i,j])

    dj_db = dj_db/m
    dj_dw = dj_dw/m
                                          
        
    ### END CODE HERE ###

        
    return dj_db, dj_dw

In [26]:
# Compute and display gradient with w initialized to zeroes
initial_w = np.zeros(n)
initial_b = 0.

dj_db, dj_dw = compute_gradient(X_train, y_train, initial_w, initial_b)
print(f'dj_db at initial w (zeros):{dj_db}' )
print(f'dj_dw at initial w (zeros):{dj_dw.tolist()}' )

dj_db at initial w (zeros):0.11616161616161616
dj_dw at initial w (zeros):[0.4057239057239057, -0.08529741863075196, 0.03912606685364879, 0.07968574635241302, 0.012345679012345678, -0.12513313244901056, 0.13692480359147025]


In [27]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_): 
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X :    (array_like Shape (m, n)
      y :    (array_like Shape (m,))
      w_in : (array_like Shape (n,))  Initial values of parameters of the model
      b_in : (scalar)                 Initial value of parameter of the model
      cost_function:                  function to compute cost
      alpha : (float)                 Learning rate
      num_iters : (int)               number of iterations to run gradient descent
      lambda_ (scalar, float)         regularization constant
      
    Returns:
      w : (array_like Shape (n,)) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
    """
    
    # number of training examples
    m = len(X)
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w_history = []
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db, dj_dw = gradient_function(X, y, w_in, b_in, lambda_)   

        # Update Parameters using w, b, alpha and gradient
        w_in = w_in - alpha * dj_dw               
        b_in = b_in - alpha * dj_db              
       
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            cost =  cost_function(X, y, w_in, b_in, lambda_)
            J_history.append(cost)

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters/10) == 0 or i == (num_iters-1):
            w_history.append(w_in)
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")
        
    return w_in, b_in, J_history, w_history #return w and J,w history for graphing

In [28]:
np.random.seed(1)
initial_w = np.random.randint(-1,1,(n))
initial_b = 0.


# Some gradient descent settings
iterations = 10000
alpha = 0.1

w,b, J_history,_ = gradient_descent(X_train ,y_train, initial_w, initial_b, 
                                   compute_cost, compute_gradient, alpha, iterations, 0)

Iteration    0: Cost     0.74   
Iteration 1000: Cost     0.44   
Iteration 2000: Cost     0.44   
Iteration 3000: Cost     0.44   
Iteration 4000: Cost     0.44   
Iteration 5000: Cost     0.44   
Iteration 6000: Cost     0.44   
Iteration 7000: Cost     0.44   
Iteration 8000: Cost     0.44   
Iteration 9000: Cost     0.44   
Iteration 9999: Cost     0.44   


In [29]:
J_history[-10:]

[0.4417974162743413,
 0.4417974162740459,
 0.44179741627375235,
 0.4417974162734572,
 0.4417974162731638,
 0.4417974162728704,
 0.441797416272578,
 0.4417974162722864,
 0.4417974162719943,
 0.44179741627170344]

In [30]:
print('final w is: %a' % w)
print('final b is: %f' % b)

final w is: array([-1.0705825 ,  2.75551041, -0.52103169, -0.35204852, -0.11534832,
        0.12865737, -0.14601569])
final b is: 1.212619


In [31]:
# UNQ_C4
# GRADED FUNCTION: predict

def predict(X, w, b): 
    """
    Predict whether the label is 0 or 1 using learned logistic
    regression parameters w
    
    Args:
    X : (ndarray Shape (m, n))
    w : (array_like Shape (n,))      Parameters of the model
    b : (scalar, float)              Parameter of the model

    Returns:
    p: (ndarray (m,1))
        The predictions for X using a threshold at 0.5
    """
    # number of training examples
    m, n = X.shape   
    p = np.zeros(m)
   
    ### START CODE HERE ### 

    for i in range(m):
        z = np.dot(w, X[i]) + b
        fx = sigmoid(z)
        if fx >= 0.5:
            p[i] = 1
        else:
            p[i] = 0
        
    ### END CODE HERE ### 
    return p

In [32]:
y_predict = predict(X_train, w, b)
print('The accuracy of the logistic regression with gradient descent for the training set is: %f%%' % (np.mean(y_train == y_predict)*100))

The accuracy of the logistic regression with gradient descent is: 80.022447%


In [55]:
y_test = predict(X_test, w, b)
y_test_series = pd.Series(y_test, name='Survived')
df_submission = pd.concat([df_test['PassengerId'], y_test_series], axis=1)
print(df_submission.head())
df_submission.to_csv('Output/submission.csv', index=False)

   PassengerId  Survived
0          892       0.0
1          893       0.0
2          894       0.0
3          895       0.0
4          896       1.0
