In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score
import statsmodels.api as sm

df = pd.read_csv('train.csv')

df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Fare'].fillna(df['Fare'].mean(), inplace=True)
df.drop(columns=['Cabin', 'Ticket'], inplace=True)
df.dropna(subset=['Embarked'], inplace=True)

df.drop_duplicates(inplace=True)

categorical_features = ['Sex', 'Embarked']
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

X_train_ols = sm.add_constant(X_train)
ols_model = sm.OLS(y_train, X_train_ols).fit()
X_test_ols = sm.add_constant(X_test)
y_pred_ols = ols_model.predict(X_test_ols)
y_pred_ols = np.where(y_pred_ols > 0.5, 1, 0)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
y_pred_lr = np.where(y_pred_lr > 0.5, 1, 0)

precision_ols = precision_score(y_test, y_pred_ols)
recall_ols = recall_score(y_test, y_pred_ols)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)

print(f"OLS Precision: {precision_ols:.2f}, Recall: {recall_ols:.2f}")
print(f"Linear Regression Precision: {precision_lr:.2f}, Recall: {recall_lr:.2f}")

if precision_ols > precision_lr and recall_ols > recall_lr:
    best_model = 'OLS Regression'
else:
    best_model = 'Linear Regression'

print(f"The best model is: {best_model}")


OLS Precision: 0.74, Recall: 0.77
Linear Regression Precision: 0.74, Recall: 0.77
The best model is: Linear Regression
