In [None]:
import pandas as pd

In [None]:
# Download the raw Titanic passenger table; fields are split on ";".
titanic_in = pd.read_csv(
    filepath_or_buffer=(
        "https://public.opendatasoft.com/explore/dataset/titanic-passengers/download"
    ),
    delimiter=";",
)
# Bare last expression so the notebook renders the loaded frame.
titanic_in

In [None]:
# pre-process and assemble data
#
# Derives the modelling table `titanic` from the raw download `titanic_in`.
# Column mapping (new <- raw):
#   class          <- pclass
#   female         <- sex       ("female" -> True, "male" -> False)
#   age            <- age       (missing values replaced by the column mean)
#   family_members <- sibsp
#   parch          <- parch
#   fare           <- fare
#   survived       <- survived  ("Yes" -> True, "No" -> False)

# Series.map with an explicit lookup table rather than replace(): swapping
# strings for booleans via replace() depends on an implicit dtype downcast that
# pandas has deprecated. Any label missing from a table becomes NaN.
sex_to_female = {"female": True, "male": False}
outcome_to_survived = {"Yes": True, "No": False}

titanic = pd.DataFrame(
    {
        "class": titanic_in["pclass"],
        "female": titanic_in["sex"].map(sex_to_female),
        # Fill from the raw column and store the result. Calling
        # fillna(inplace=True) on titanic["age"] is chained assignment and is
        # not guaranteed to write back into the frame.
        "age": titanic_in["age"].fillna(titanic_in["age"].mean()),
        "family_members": titanic_in["sibsp"],
        "parch": titanic_in["parch"],
        "fare": titanic_in["fare"],
        "survived": titanic_in["survived"].map(outcome_to_survived),
    }
)

titanic

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# train model and inspect coefficients
#
# Fits an ordinary least-squares model on all six features with the boolean
# `survived` column as target, prints the fitted coefficients and the accuracy
# on the held-out split, then predicts for two hand-written example passengers.

feature_columns = ["class", "female", "age", "family_members", "parch", "fare"]
x = titanic.loc[:, feature_columns]
y = titanic.loc[:, "survived"]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

model = LinearRegression()
model.fit(x_train, y_train)
print(model.coef_)

# Turn the continuous regression output into a survived / not-survived label by
# thresholding at 0.5. Rounding instead would produce labels such as 2.0 or
# -1.0 for outputs outside [-0.5, 1.5], and those are always scored as wrong.
# An output of exactly 0.5 maps to "not survived", as it did with round().
y_pred = model.predict(x_test) > 0.5
# accuracy_score takes (y_true, y_pred).
score = metrics.accuracy_score(y_test, y_pred)
print(f"score: {score}")

# Example passengers, given in the same column order as the training features.
# A DataFrame with matching column names is used because the model was fitted
# on a DataFrame; a bare list triggers sklearn's feature-name warning.
example_passengers = pd.DataFrame(
    [[3, 0, 30, 0, 0, 10], [1, 1, 10, 3, 1, 40]],
    columns=feature_columns,
)
model.predict(example_passengers)

In [None]:
# train model with reduced set of features
#
# Same procedure as for the full model, but only class, female, age and
# family_members are used as inputs. Prints the fitted coefficients and the
# accuracy on the held-out split.

x = titanic.loc[:, ["class", "female", "age", "family_members"]]
y = titanic.loc[:, "survived"]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

model = LinearRegression()
model.fit(x_train, y_train)
print(model.coef_)

# Threshold the continuous regression output at 0.5 to get a boolean label.
# Rounding instead would produce labels such as 2.0 or -1.0 for outputs outside
# [-0.5, 1.5], and those are always scored as wrong. An output of exactly 0.5
# maps to "not survived", as it did with round().
y_pred = model.predict(x_test) > 0.5
# accuracy_score takes (y_true, y_pred).
score = metrics.accuracy_score(y_test, y_pred)
print(f"score: {score}")

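# results (coefficients read as additive changes in the predicted survival probability):
# a class number that's lower by 1 raises the predicted chance of survival by about 14 percentage points
# female passengers have a predicted chance of survival about 49 percentage points higher than males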