In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics

In [None]:
# Columns removed from both frames before modelling.
DROPPED_COLUMNS = ['Name', 'Ticket', 'Cabin', 'PassengerId']


def preprocess_passengers(frame, age_fill, fare_fill=None):
    """Return a cleaned copy of a passenger frame; the input is not modified.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw frame containing at least the 'Age', 'Embarked', 'Sex' columns
        and every column listed in ``DROPPED_COLUMNS``.
    age_fill : float
        Value used to replace missing 'Age' entries.
    fare_fill : float, optional
        Value used to replace missing 'Fare' entries.  Skipped when ``None``
        or when the frame has no 'Fare' column.

    Returns
    -------
    pandas.DataFrame
        New frame with the dropped columns removed, missing values filled and
        'Sex' / 'Embarked' mapped to integer codes.  Column order of the
        remaining columns is unchanged.
    """
    cleaned = frame.drop(columns=DROPPED_COLUMNS)
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object and leaves the frame
    # untouched when pandas Copy-on-Write is active.
    cleaned['Age'] = cleaned['Age'].fillna(age_fill)
    cleaned['Embarked'] = cleaned['Embarked'].fillna('S').map({'C': 0, 'Q': 1, 'S': 2})
    cleaned['Sex'] = cleaned['Sex'].map({'male': 0, 'female': 1})
    if fare_fill is not None and 'Fare' in cleaned.columns:
        # A missing fare would otherwise reach the model as NaN.
        cleaned['Fare'] = cleaned['Fare'].fillna(fare_fill)
    return cleaned


train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

# Imputation values are taken from the training file and reused for the test
# file, mirroring how the scaler is fitted on training data only.
age_median = train_raw['Age'].median()
fare_median = train_raw['Fare'].median() if 'Fare' in train_raw.columns else None

train_data = preprocess_passengers(train_raw, age_median, fare_median)
test_data = preprocess_passengers(test_raw, age_median, fare_median)

# Define features and target for training data
X = train_data.drop('Survived', axis=1).values
y = train_data['Survived'].values

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
def logistic_regression_train(X_train, y_train, learning_rate=0.02, num_iterations=800):
    """Fit logistic-regression weights with batch gradient descent.

    No intercept column is added: ``theta`` holds exactly one weight per
    column of ``X_train``.  Prepend a column of ones to ``X_train`` if an
    intercept is wanted.

    Parameters
    ----------
    X_train : array-like, shape (m, n)
        Feature matrix.
    y_train : array-like, shape (m,) or (m, 1)
        Binary targets (0 or 1).  Column vectors are flattened.
    learning_rate : float, default 0.02
        Step size of each gradient-descent update.
    num_iterations : int, default 800
        Number of full-batch updates.

    Returns
    -------
    theta : numpy.ndarray, shape (n,)
        Weights after the last update.
    cost_history : list of float
        Cross-entropy cost measured after every update.

    Raises
    ------
    ValueError
        If ``X_train`` is not 2-D, is empty, or its row count differs from
        the number of targets.
    """
    X_train = np.asarray(X_train, dtype=float)
    # Flatten so an (m, 1) target cannot broadcast into an (m, m) residual.
    y_train = np.asarray(y_train, dtype=float).ravel()

    m, n = X_train.shape
    if m == 0:
        raise ValueError("X_train must contain at least one sample")
    if y_train.shape[0] != m:
        raise ValueError(
            f"X_train has {m} rows but y_train has {y_train.shape[0]} entries"
        )

    theta = np.zeros(n)
    cost_history = []

    def sigmoid(z):
        # Clipping keeps np.exp from overflowing; within [-500, 500] the
        # result is identical to the unclipped formula.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def compute_cost(X, y, theta):
        m = len(y)
        predictions = sigmoid(np.dot(X, theta))
        # The 1e-10 offset guards against log(0).
        cost = -1/m * (np.dot(y, np.log(predictions + 1e-10)) + np.dot((1 - y), np.log(1 - predictions + 1e-10)))
        return cost

    for i in range(num_iterations):
        predictions = sigmoid(np.dot(X_train, theta))
        gradients = np.dot(X_train.T, (predictions - y_train)) / m
        theta -= learning_rate * gradients

        # Compute and store the cost (after the update) for plotting
        cost = compute_cost(X_train, y_train, theta)
        cost_history.append(cost)

        # Print cost after every 100 iterations
        if (i+1) % 100 == 0:
            print(f"Iteration {i+1}/{num_iterations}, Cost: {cost}")

    return theta, cost_history


In [None]:
def logistic_regression_predict(X_test, theta):
    """Predict binary class labels from fitted logistic-regression weights.

    Parameters
    ----------
    X_test : array-like, shape (m, n) or (n,)
        Feature rows; a single 1-D sample is also accepted.
    theta : array-like, shape (n,)
        Weights, one per feature column (no intercept is added here).

    Returns
    -------
    list of int
        One label per row: 1 when the predicted probability is strictly
        greater than 0.5, otherwise 0.  Rows whose score is NaN yield 0.
    """
    linear_model = np.dot(np.asarray(X_test, dtype=float), np.asarray(theta, dtype=float))
    # Clip the scores so np.exp cannot overflow; the 0.5 threshold decision
    # is unaffected because clipped scores keep their sign.
    probabilities = 1 / (1 + np.exp(-np.clip(linear_model, -500, 500)))
    # Vectorised threshold; atleast_1d lets a single sample return [label].
    return (np.atleast_1d(probabilities) > 0.5).astype(int).tolist()


In [None]:
def plot_cost_function(cost_history):
    """Show a line chart of the recorded cost values against their index.

    Parameters
    ----------
    cost_history : sequence of float
        Cost values in the order they were recorded; the x axis is the
        position of each value in the sequence.
    """
    iteration_index = range(len(cost_history))

    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(iteration_index, cost_history, color='blue')
    ax.set_xlabel('Number of Iterations')
    ax.set_ylabel('Cost')
    ax.set_title('Cost Function Over Iterations')
    ax.grid(True)
    plt.show()

def plot_bias_term(theta):
    """Show every entry of ``theta`` as a connected marker plot.

    Despite the function name, all values in ``theta`` are drawn (x axis is
    the position within ``theta``), not a single bias value.

    Parameters
    ----------
    theta : sequence of float
        Parameter values to display.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(theta, 'o-', color='red')
    ax.set_xlabel('Feature Index')
    ax.set_ylabel('Parameter Value (Theta)')
    ax.set_title('Bias Term (Theta) Values')
    ax.grid(True)
    plt.show()

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the validation split and the test file.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test_data.values)

# Fit the model on the scaled training split.
# NOTE: this unpacks two return values, so it relies on the two-value
# definition of logistic_regression_train; a later cell redefines that
# function to return three values.
theta, cost_history = logistic_regression_train(
    X_train,
    y_train,
    learning_rate=0.02,
    num_iterations=800,
)

# Evaluate on the held-out validation split.
y_val_pred = logistic_regression_predict(X_val, theta)
validation_report = classification_report(y_val, y_val_pred)
print(validation_report)

# Show how the training cost evolved.
plot_cost_function(cost_history)

# Confusion matrix of the validation predictions.
confusion_matrix = metrics.confusion_matrix(y_val, y_val_pred)
print(confusion_matrix)

# Predicted labels for the rows of the test file.
test_predictions = logistic_regression_predict(X_test, theta)
print('Predictions:', test_predictions)

In [None]:
# NOTE: logistic_regression_predict is already defined in an earlier cell;
# this later definition is the one in effect once this cell has run.
def logistic_regression_predict(X_test, theta):
    """Predict binary class labels from fitted logistic-regression weights.

    Parameters
    ----------
    X_test : array-like, shape (m, n) or (n,)
        Feature rows; a single 1-D sample is also accepted.
    theta : array-like, shape (n,)
        Weights, one per feature column (no intercept is added here).

    Returns
    -------
    list of int
        One label per row: 1 when the predicted probability is strictly
        greater than 0.5, otherwise 0.  Rows whose score is NaN yield 0.
    """
    features = np.asarray(X_test, dtype=float)
    weights = np.asarray(theta, dtype=float)
    scores = np.dot(features, weights)
    # Bound the scores before exponentiating so np.exp cannot overflow;
    # clipping preserves the sign, hence the thresholded label.
    bounded_scores = np.clip(scores, -500, 500)
    probabilities = 1 / (1 + np.exp(-bounded_scores))
    # Vectorised threshold; atleast_1d lets a single sample return [label].
    is_positive = np.atleast_1d(probabilities) > 0.5
    return is_positive.astype(int).tolist()


In [None]:
# Take the first entry of the fitted parameter vector.
# NOTE(review): logistic_regression_train allocates one weight per column of
# X_train and adds no constant column, so theta[0] appears to be the weight of
# the first feature rather than a true intercept - confirm the intended label.
bias_term = theta[0]

# Print the value labelled as the bias term
print(f"Bias Term (Intercept): {bias_term}")

# Visualize every theta value (plot_bias_term draws the whole vector)
plot_bias_term(theta)


In [None]:
# NOTE: this cell redefines logistic_regression_train (first defined in an
# earlier cell).  This version returns three values instead of two.
def logistic_regression_train(X_train, y_train, learning_rate=0.02, num_iterations=800):
    """Fit logistic-regression weights by gradient descent, recording theta.

    No intercept column is added: ``theta`` holds exactly one weight per
    column of ``X_train``.

    Parameters
    ----------
    X_train : array-like, shape (m, n)
        Feature matrix.
    y_train : array-like, shape (m,) or (m, 1)
        Binary targets (0 or 1).  Column vectors are flattened.
    learning_rate : float, default 0.02
        Step size of each gradient-descent update.
    num_iterations : int, default 800
        Number of full-batch updates.

    Returns
    -------
    theta : numpy.ndarray, shape (n,)
        Weights after the last update.
    cost_history : list of float
        Cross-entropy cost measured after every update.
    theta_history : list of numpy.ndarray
        Independent copy of ``theta`` taken after every update.

    Raises
    ------
    ValueError
        If ``X_train`` is not 2-D, is empty, or its row count differs from
        the number of targets.
    """
    X_train = np.asarray(X_train, dtype=float)
    # Flatten so an (m, 1) target cannot broadcast into an (m, m) residual.
    y_train = np.asarray(y_train, dtype=float).ravel()

    m, n = X_train.shape
    if m == 0:
        raise ValueError("X_train must contain at least one sample")
    if y_train.shape[0] != m:
        raise ValueError(
            f"X_train has {m} rows but y_train has {y_train.shape[0]} entries"
        )

    theta = np.zeros(n)  # Initialize theta as a vector of zeros
    cost_history = []
    theta_history = []

    def sigmoid(z):
        # Clipping keeps np.exp from overflowing; within [-500, 500] the
        # result is identical to the unclipped formula.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def compute_cost(X, y, theta):
        m = len(y)
        predictions = sigmoid(np.dot(X, theta))
        # The 1e-10 offset guards against log(0).
        cost = -1/m * (np.dot(y, np.log(predictions + 1e-10)) + np.dot((1 - y), np.log(1 - predictions + 1e-10)))
        return cost

    for i in range(num_iterations):
        predictions = sigmoid(np.dot(X_train, theta))
        gradients = np.dot(X_train.T, (predictions - y_train)) / m
        theta -= learning_rate * gradients

        # Compute and store the cost (after the update) for plotting
        cost = compute_cost(X_train, y_train, theta)
        cost_history.append(cost)

        # Store a copy: theta is updated in place, so appending it directly
        # would make every history entry alias the final weights.
        theta_history.append(theta.copy())

        # Print cost and theta after every 100 iterations
        if (i+1) % 100 == 0:
            print(f"Iteration {i+1}/{num_iterations}, Cost: {cost}, Theta: {theta}")

    return theta, cost_history, theta_history

# Re-fit the model, this time keeping a snapshot of theta after every update.
theta, cost_history, theta_history = logistic_regression_train(
    X_train,
    y_train,
    learning_rate=0.02,
    num_iterations=800,
)

# First component of every snapshot.
# NOTE(review): the training function adds no constant column, so theta[0]
# appears to be the first feature's weight rather than an intercept - confirm
# the "Bias Term" labels below.
bias_history = [snapshot[0] for snapshot in theta_history]

# Trajectory of theta[0] across the updates.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(len(bias_history)), bias_history, color='green')
ax.set_xlabel('Number of Iterations')
ax.set_ylabel('Bias Term (Theta[0])')
ax.set_title('Bias Term (Theta[0]) Over Iterations')
ax.grid(True)
plt.show()

# Trajectories of the remaining components, theta[1] onwards, one line each.
fig, ax = plt.subplots(figsize=(10, 5))
for i in range(1, len(theta)):
    component_history = [snapshot[i] for snapshot in theta_history]
    ax.plot(range(len(theta_history)), component_history, label=f'Theta[{i}]')

ax.set_xlabel('Number of Iterations')
ax.set_ylabel('Theta Values')
ax.set_title('Parameter Vector (Theta) Changes Over Iterations')
ax.legend()
ax.grid(True)
plt.show()
