In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import statsmodels.api as sm

In [18]:
# Load the raw dataset from the Excel workbook that sits next to the notebook.
df = pd.read_excel(io="Logistic_Regression_Database.xlsx")

In [19]:
# Predictor columns, grouped by the preprocessing each group receives later.
numerical_features = ['Income']
categorical_features = ['Income Type', 'Owns property']

# Feature matrix (numeric column first, then the categorical ones) and target.
X = df[numerical_features + categorical_features]
y = df['Bought']


In [20]:
# Column-wise preprocessing: standardize the numeric column and one-hot encode
# the categorical ones. drop='first' omits the first level of every
# categorical feature, so that level acts as the reference category.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(drop='first'), categorical_features),
])

In [21]:
# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied automatically at prediction time.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [22]:

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.3)

# Fit the whole pipeline (preprocessing + classifier) on the training rows only.
model.fit(X_train, y_train)

In [23]:
# Predict class labels for the held-out test rows with the fitted pipeline.
y_pred = model.predict(X_test)

In [24]:

# Evaluate the held-out predictions: confusion matrix (true labels passed
# first, predictions second), per-class precision/recall/F1, and accuracy.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Confusion Matriz:
 [[ 4  7]
 [ 1 18]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.36      0.50        11
           1       0.72      0.95      0.82        19

    accuracy                           0.73        30
   macro avg       0.76      0.66      0.66        30
weighted avg       0.75      0.73      0.70        30

Accuracy: 0.7333333333333333


In [25]:

# Pair every column produced by the fitted preprocessor with the coefficient
# the classifier learned for it.
fitted_steps = model.named_steps

# The numeric columns come first in the ColumnTransformer, followed by the
# one-hot columns, so the names are concatenated in that same order.
ohe = fitted_steps['preprocessor'].named_transformers_['cat']
feature_names = [*numerical_features, *ohe.get_feature_names_out(categorical_features)]

# coef_ holds one row of coefficients here; take that row.
coefs = fitted_steps['classifier'].coef_[0]
coef_df = pd.DataFrame(dict(Feature=feature_names, Coefficient=coefs))

print("\nImportance of Features:", coef_df, sep="\n")


Importance of Features:
           Feature  Coefficient
0           Income     1.541401
1    Income Type_2    -0.489254
2    Income Type_3     0.106230
3  Owns property_2    -0.107007


In [26]:

# Fit a statsmodels Logit on the preprocessed training data to obtain standard
# errors and p-values for each coefficient, then flag which variables are
# significant at the chosen level.

# Fit and transform in one call (equivalent to fit() followed by transform()).
X_train_processed = preprocessor.fit_transform(X_train)

# ColumnTransformer can return a SciPy sparse matrix when the one-hot block
# makes the stacked output sparse enough; pandas cannot build a labelled
# DataFrame from that, so densify when needed.
if hasattr(X_train_processed, "toarray"):
    X_train_processed = X_train_processed.toarray()

# Column names in the same order the ColumnTransformer emits them:
# numeric columns first, then the one-hot encoded categorical columns.
ohe = preprocessor.named_transformers_['cat']
feature_names = numerical_features + list(ohe.get_feature_names_out(categorical_features))

# Keep the original row index so the design matrix stays aligned with y_train.
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names, index=X_train.index)

# statsmodels does not add an intercept on its own; add the 'const' column.
X_train_df = sm.add_constant(X_train_df)

logit_model = sm.Logit(y_train, X_train_df)
result = logit_model.fit()

print(result.summary())

# NOTE(review): 0.005 is far stricter than the conventional 0.05 — confirm
# this threshold is intended and not a typo.
alpha = 0.005
print(f"\n📊 Hypothesis test for each variable (significance level = {alpha}):\n")
for feature, pval in result.pvalues.items():
    if pval < alpha:
        print(f"✅ {feature}: p-valor = {pval:.4f} → Rejects H₀ → Significant variable")
    else:
        print(f"❌ {feature}: p-valor = {pval:.4f} → Does not reject H₀ → Variable NOT significant")

Optimization terminated successfully.
         Current function value: 0.426845
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 Bought   No. Observations:                   70
Model:                          Logit   Df Residuals:                       65
Method:                           MLE   Df Model:                            4
Date:                Sun, 22 Jun 2025   Pseudo R-squ.:                  0.3451
Time:                        17:31:41   Log-Likelihood:                -29.879
converged:                       True   LL-Null:                       -45.623
Covariance Type:            nonrobust   LLR p-value:                 2.435e-06
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               1.0946      0.786      1.392      0.164      -0.446       2.635
Income        

In [29]:

# Score one hypothetical customer. The frame uses the same column names as the
# training features, so the fitted pipeline can preprocess it directly.
novo_cliente = pd.DataFrame([
    {'Income': 10000, 'Income Type': 1, 'Owns property': 1},
])

# Row 0 is the only customer; column 1 of predict_proba is presumably the
# probability of class 1 — verify against model.classes_.
prob = model.predict_proba(novo_cliente)[0, 1]
classe = model.predict(novo_cliente)[0]
print(f"Likelihood of purchase: {prob:.2f} → Class: {classe}")

Likelihood of purchase: 0.85 → Class: 1
