In [23]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [13]:
# Location of the semicolon-separated input file.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where this exact file exists.
csv_path = r"C:\Users\nico_\Desktop\fichiers_csv\traitement.csv"
data = pd.read_csv(csv_path, sep=";")

Sex (binary): 1 if male, 0 if female

age (int): age of the patient at the beginning of the study

obstruct (binary): 1 if the tumor obstructs the colon, 0 otherwise

outcome (binary): 1 if the patient died within 5 years, 0 otherwise

TRTTMT (boolean): True if the patient was treated, False otherwise

In [14]:
# Preview the first five rows to check that the columns were parsed as expected.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Sex,age,obstruct,outcome,TRTTMT
0,1,0,57,0,1,True
1,2,1,68,0,0,False
2,3,0,72,0,0,True
3,4,0,66,1,1,True
4,5,1,69,0,1,False


In [15]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# CSV export; it carries no information for the model).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
# Count the missing values in each column (isnull is an alias of isna).
data.isnull().sum()

Sex         0
age         0
obstruct    0
outcome     0
TRTTMT      0
dtype: int64

# Logistic Regression

In [17]:
# Feature matrix: every column except the target.
X = data.drop(columns='outcome')

# Target vector: the binary 'outcome' column.
y = data['outcome']

# Fit a logistic regression on the full dataset; its coefficients are
# interpreted as log-odds in the cells that follow.
# NOTE(review): scikit-learn's LogisticRegression is L2-regularized by default,
# which shrinks the coefficients relative to an unpenalized fit. Confirm this
# is acceptable for the odds-ratio interpretation.
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(X, y);

# Odds

$$\text{odds} = \frac{p}{1-p}$$

$p$ : probability that the event occurs (here, death)
$1-p$ : probability that the event does not occur (survival)


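$$\text{odds} = \frac{\sigma(\theta^T x^{(i)})}{1 - \sigma(\theta^T x^{(i)})}$$

$$\sigma(\theta^T x^{(i)}) = \frac{1}{1 + e^{-\theta^T x^{(i)}}}$$

$$\text{odds} = e^{\theta^T x^{(i)}}$$

$\theta$ : vector of coefficients ($\theta^T$ is its transpose)
$x^{(i)}$ : feature vector of the $i$-th observation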

# coefficient for each feature 

In [18]:
# Fitted coefficients of the model; the array has a single row holding one
# value per column of X, in the same order as X.columns.
thetas = classifier.coef_
thetas

array([[-0.21704833,  0.0460642 ,  0.37798496, -0.418984  ]])

In [19]:
# Pair each feature name with its coefficient (first and only row of thetas)
# and print them in aligned columns.
for feature_name, coefficient in zip(X.columns, thetas[0]):
    print("Feature {:<9s}: coefficient = {:<10f}".format(feature_name, coefficient))

Feature Sex      : coefficient = -0.217048 
Feature age      : coefficient = 0.046064  
Feature obstruct : coefficient = 0.377985  
Feature TRTTMT   : coefficient = -0.418984 


# Odds Ratio

For a binary variable, the odds ratio (OR) is defined as the odds when the variable is 1 divided by the odds when the variable is 0.

$$OR_{x_j} = e^{\theta_j}$$

In [20]:
# Exponentiate every coefficient to turn log-odds into odds ratios
# (element-wise, so the result has the same layout as thetas).
odds_ratios = np.exp(thetas)
odds_ratios

array([[0.80489107, 1.04714164, 1.45934099, 0.65771472]])

In [21]:
# Walk the feature names, coefficients and odds ratios in lockstep and print
# one aligned summary line per feature.
for feature_name, coefficient, odds_ratio in zip(X.columns, thetas[0], odds_ratios[0]):
    print("Feature {:<10s}: coefficient = {:<10f} // OR = {:.2f}".format(feature_name, coefficient, odds_ratio))

Feature Sex       : coefficient = -0.217048  // OR = 0.80
Feature age       : coefficient = 0.046064   // OR = 1.05
Feature obstruct  : coefficient = 0.377985   // OR = 1.46
Feature TRTTMT    : coefficient = -0.418984  // OR = 0.66


Sex : OR = 0.80 < 1 donc être un homme réduit le risque de décès 

age : OR = 1.05 > 1 donc chaque année d'âge supplémentaire multiplie la cote (odds) de décès par environ 1,05 ; être plus jeune réduit donc le risque de décès

obstruct : OR = 1.46 > 1 donc ne pas présenter de tumeur obstructive du côlon réduit le risque de décès

TRTTMT : OR = 0.66 < 1 donc prendre un traitement réduit le risque de décès

# Train/Test split

In [24]:
# Hold out 25% of the rows for evaluation.
# random_state is fixed so that the split, and therefore every prediction and
# score computed from it, is identical on each Restart & Run All. Without it
# the shuffle changes on every execution and the results cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
print(f"Number of observations for training: {y_train.size}")
print(f"Number of observations for testing: {y_test.size}")

Number of observations for training: 37
Number of observations for testing: 13


# Training and prediction

In [25]:
# Train a separate logistic regression on the training split only, leaving
# the test split untouched for evaluation.
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)

In [28]:
# Predicted class label for each row of the test split.
predictions = lr.predict(X_test)
# Predicted probability of each class for each row of the test split;
# columns follow the order of lr.classes_.
prediction_probs = lr.predict_proba(X_test)

In [29]:
# Class labels known to the model, in the column order used by predict_proba.
lr.classes_

array([0, 1], dtype=int64)

# Predictions for the first five elements of the test set

In [30]:
# Show the predicted label and both class probabilities for the first five
# rows of the test split.
for row_idx in range(5):
    print(f"Element number: {row_idx}")
    print(f"Predicted class: {predictions[row_idx]}")
    # Each row of prediction_probs holds (P(class 0), P(class 1)).
    proba_class_0, proba_class_1 = prediction_probs[row_idx]
    print(f"Probability of predicting class 0: {proba_class_0}")
    print(f"Probability of predicting class 1: {proba_class_1}\n")

Element number: 0
Predicted class: 0
Probability of predicting class 0: 0.904194043220079
Probability of predicting class 1: 0.09580595677992103

Element number: 1
Predicted class: 1
Probability of predicting class 0: 0.469949454202261
Probability of predicting class 1: 0.530050545797739

Element number: 2
Predicted class: 1
Probability of predicting class 0: 0.4169940839679571
Probability of predicting class 1: 0.5830059160320429

Element number: 3
Predicted class: 0
Probability of predicting class 0: 0.5591261226476416
Probability of predicting class 1: 0.44087387735235833

Element number: 4
Predicted class: 1
Probability of predicting class 0: 0.4344955139352179
Probability of predicting class 1: 0.5655044860647821



# Probability for label 1

In [31]:
# Second column of the probability matrix: predicted probability of class 1
# (outcome == 1) for every row of the test split.
prediction_probs[:, 1]

array([0.09580596, 0.53005055, 0.58300592, 0.44087388, 0.56550449,
       0.22445097, 0.30728025, 0.27817283, 0.50271464, 0.47640982,
       0.07874681, 0.1710679 , 0.3502622 ])

# Mean Accuracy

In [32]:
# Mean accuracy of the model on the held-out test split.
lr.score(X_test, y_test)

0.6923076923076923