In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_score, recall_score
import statsmodels.api as sm
import sklearn.metrics as metrics

df = pd.read_csv('train.csv')

df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Fare'].fillna(df['Fare'].mean(), inplace=True)
df.drop(columns=['Cabin', 'Ticket'], inplace=True)
df.dropna(subset=['Embarked'], inplace=True)
df.drop_duplicates(inplace=True)

categorical_features = ['Sex', 'Embarked']
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']

X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_num = X_train[numerical_features]
X_test_num = X_test[numerical_features]
X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

std_scaler = StandardScaler().fit(X_train_num)
X_train_num_scaled = std_scaler.transform(X_train_num)
X_test_num_scaled = std_scaler.transform(X_test_num)

onehot_encoder = OneHotEncoder(sparse_output=False).fit(X_train_cat)
X_train_cat_encoded = onehot_encoder.transform(X_train_cat)
X_test_cat_encoded = onehot_encoder.transform(X_test_cat)

X_train_preprocessed = np.hstack((X_train[['Pclass']], X_train_num_scaled, X_train_cat_encoded))
X_test_preprocessed = np.hstack((X_test[['Pclass']], X_test_num_scaled, X_test_cat_encoded))

X_train_ols = sm.add_constant(X_train_preprocessed)
ols_model = sm.OLS(y_train, X_train_ols).fit()
X_test_ols = sm.add_constant(X_test_preprocessed)
y_pred_ols = ols_model.predict(X_test_ols)
y_pred_ols = np.where(y_pred_ols > 0.5, 1, 0)

lr_model = LinearRegression()
lr_model.fit(X_train_preprocessed, y_train)
y_pred_lr = lr_model.predict(X_test_preprocessed)
y_pred_lr = np.where(y_pred_lr > 0.5, 1, 0)

precision_ols = precision_score(y_test, y_pred_ols)
recall_ols = recall_score(y_test, y_pred_ols)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)

print(f"OLS Precision: {precision_ols:.2f}, Recall: {recall_ols:.2f}")
print(f"Linear Regression Precision: {precision_lr:.2f}, Recall: {recall_lr:.2f}")

if precision_ols > precision_lr and recall_ols > recall_lr:
    best_model = 'OLS Regression'
else:
    best_model = 'Linear Regression'

print(f"The best model is: {best_model}")

def regression_results(y_true, y_pred):
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))

regression_results(y_test, y_pred_lr)
regression_results(y_test, y_pred_ols)

OLS Precision: 0.71, Recall: 0.75
Linear Regression Precision: 0.71, Recall: 0.75
The best model is: Linear Regression
r2:  0.1007
MAE:  0.2135
MSE:  0.2135
RMSE:  0.462
r2:  0.1007
MAE:  0.2135
MSE:  0.2135
RMSE:  0.462
