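Simple imputation: a new "Missing" category for categorical features and the mode for numerical features <br>
Numerical features binned into 2 classes <br>
Interaction term between the binned Social_event_attendance and Post_frequency features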

In [None]:
import pathlib
import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn import set_config
import joblib
from statsmodels.discrete.discrete_model import Logit

# Make every scikit-learn transformer return a pandas DataFrame (with feature
# names) instead of a NumPy array. Later cells rely on this: the second
# ColumnTransformer selects its inputs by generated column name.
set_config(transform_output="pandas")

In [26]:
# Locate the raw competition files: they live in a "raw" folder next to the
# notebook's working directory. Paths are kept as plain strings.
raw_dir = pathlib.Path.cwd().parent / "raw"
train_path = str(raw_dir / "train.csv")
test_path = str(raw_dir / "test.csv")
train_path

'c:\\Users\\bogus\\Desktop\\Projects\\Kaggle competitions\\Introverts\\raw\\train.csv'

In [27]:
# Load both competition files with identical options; the first CSV column
# (the row id) becomes the DataFrame index.
read_options = {"index_col": 0}
train_df = pd.read_csv(train_path, **read_options)
test_df = pd.read_csv(test_path, **read_options)
train_df.head()

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [None]:
# Name the target once, then split the remaining columns by dtype:
# numeric columns feed the numerical pipeline, object (string) columns the
# categorical one. The target is excluded from the categorical list.
y_var = "Personality"
numerical_vars = train_df.select_dtypes(include="number").columns.tolist()
categorical_vars = (
    train_df.drop(columns=y_var).select_dtypes(include="object").columns.tolist()
)
numerical_vars, categorical_vars, y_var

(['Time_spent_Alone',
  'Social_event_attendance',
  'Going_outside',
  'Friends_circle_size',
  'Post_frequency'],
 ['Stage_fear', 'Drained_after_socializing'],
 'Personality')

In [227]:
# --- Numerical features ------------------------------------------------------
# 1. fill missing values with the most frequent value (mode),
# 2. cut every feature into two bins (strategy="uniform"),
# 3. one-hot encode the bin id; drop="if_binary" keeps a single 0/1 column
#    per two-bin feature (the "<feature>_1.0" columns).
# NOTE: the middle step is named "minmax" although it holds a
# KBinsDiscretizer; the name is left as is so references to the step by name
# keep working.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("minmax", KBinsDiscretizer(n_bins=2, encode="ordinal", strategy="uniform")),
    ("onehot", OneHotEncoder(drop="if_binary", sparse_output=False)),
]
numerical_pipeline = Pipeline(numerical_steps)

# --- Categorical features ----------------------------------------------------
# Missing values become their own "Missing" category before one-hot encoding;
# drop="first" removes the first category of each feature.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("onehot", OneHotEncoder(drop="first", sparse_output=False)),
]
categorical_pipeline = Pipeline(categorical_steps)

# --- Column-wise preprocessing -----------------------------------------------
# Besides the two sub-pipelines, a MissingIndicator runs on the raw numerical
# columns so "was missing" flags are available as features.
column_transformer = ColumnTransformer(
    [
        ("numerical", numerical_pipeline, numerical_vars),
        ("categorical", categorical_pipeline, categorical_vars),
        ("missing_indicator", MissingIndicator(), numerical_vars),
    ]
)
preprocessing_pipeline = Pipeline([("column_transformer", column_transformer)])

# --- Interactions and column selection ---------------------------------------
# Binned features that are crossed with each other. With degree=2 and
# interaction_only=True, PolynomialFeatures emits a constant column ("1"),
# both inputs and their product.
interaction_inputs = [
    "numerical__Social_event_attendance_1.0",
    "numerical__Post_frequency_1.0",
]

# Missing-value flags that are removed from the final design matrix.
dropped_missing_flags = [
    "missing_indicator__missingindicator_Going_outside",
    "missing_indicator__missingindicator_Post_frequency",
    "missing_indicator__missingindicator_Social_event_attendance",
    "missing_indicator__missingindicator_Friends_circle_size",
]

# Every column not named above is passed through unchanged.
interaction_transformer = ColumnTransformer(
    transformers=[
        (
            "interactions",
            PolynomialFeatures(degree=2, interaction_only=True),
            interaction_inputs,
        ),
        ("drop_cols", "drop", dropped_missing_flags),
    ],
    remainder="passthrough",
)

# --- Full feature pipeline ---------------------------------------------------
pipeline = Pipeline(
    [
        ("preprocessing", preprocessing_pipeline),
        ("interactions", interaction_transformer),
    ]
)

In [228]:
# Fit all preprocessing and interaction steps on the training frame.
# The target column is part of train_df but is not listed in any transformer
# of the first ColumnTransformer, so it does not reach the output features.
pipeline.fit(train_df)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [229]:
# Build the training design matrix with the fitted pipeline and preview it.
cleaned_df = pipeline.transform(train_df)
cleaned_df.head()

Unnamed: 0_level_0,interactions__1,interactions__numerical__Social_event_attendance_1.0,interactions__numerical__Post_frequency_1.0,interactions__numerical__Social_event_attendance_1.0 numerical__Post_frequency_1.0,remainder__numerical__Time_spent_Alone_1.0,remainder__numerical__Going_outside_1.0,remainder__numerical__Friends_circle_size_1.0,remainder__categorical__Stage_fear_No,remainder__categorical__Stage_fear_Yes,remainder__categorical__Drained_after_socializing_No,remainder__categorical__Drained_after_socializing_Yes,remainder__missing_indicator__missingindicator_Time_spent_Alone
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,False
1,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,False
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,False
3,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,False
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,False


In [230]:
# Encode the target as 1 = Extrovert / 0 = Introvert and set up the (not yet
# fitted) statsmodels logit. The design matrix is cast to float because it
# contains a boolean missing-indicator column.
y_train = train_df[y_var].map({'Extrovert': 1, 'Introvert': 0})
X_train = cleaned_df.astype(float)
model = Logit(y_train, X_train)

In [231]:
# Estimate the coefficients with the L-BFGS optimizer. The result object is
# reused below for predictions, the summary table and serialization.
res = model.fit(method="lbfgs")

In [232]:
# Training-set accuracy of the fitted model.
# The cut-off is a strict `> 0.5`, the same rule that is used to label the
# test predictions for the submission, so the number reported here describes
# the decision rule that is actually submitted.
y_pred_probs = res.predict(cleaned_df.astype(float))
y_train_true = train_df[y_var].map({'Extrovert': 1, 'Introvert': 0})
# The mean of the boolean "prediction matches label" mask is the share of
# correct predictions; computed vectorized instead of summing element by
# element in Python. float() keeps the displayed value a plain Python float.
float(((y_pred_probs > 0.5).astype(int) == y_train_true).mean())

0.9689052040595983

In [233]:
# Display the coefficient table and overall fit statistics of the fitted model.
res.summary()

0,1,2,3
Dep. Variable:,Personality,No. Observations:,18524.0
Model:,Logit,Df Residuals:,18512.0
Method:,MLE,Df Model:,11.0
Date:,"Mon, 21 Jul 2025",Pseudo R-squ.:,0.7545
Time:,20:52:46,Log-Likelihood:,-2608.4
converged:,True,LL-Null:,-10624.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
interactions__1,-0.9401,0.157,-5.999,0.000,-1.247,-0.633
interactions__numerical__Social_event_attendance_1.0,1.5984,0.158,10.114,0.000,1.289,1.908
interactions__numerical__Post_frequency_1.0,1.4152,0.170,8.337,0.000,1.082,1.748
interactions__numerical__Social_event_attendance_1.0 numerical__Post_frequency_1.0,-1.7092,0.222,-7.685,0.000,-2.145,-1.273
remainder__numerical__Time_spent_Alone_1.0,-0.9015,0.115,-7.865,0.000,-1.126,-0.677
remainder__numerical__Going_outside_1.0,0.1800,0.107,1.688,0.091,-0.029,0.389
remainder__numerical__Friends_circle_size_1.0,0.4393,0.106,4.128,0.000,0.231,0.648
remainder__categorical__Stage_fear_No,1.3277,0.124,10.720,0.000,1.085,1.570
remainder__categorical__Stage_fear_Yes,-0.7395,0.130,-5.691,0.000,-0.994,-0.485


## Predict on test data and create submission file

In [239]:
# Push the test features through the already-fitted pipeline and score them
# with the fitted logit model.
test_transformed = pipeline.transform(test_df)
y_pred_probs = res.predict(test_transformed.astype(float))

# Probabilities strictly above 0.5 are labelled "Extrovert" (class 1),
# everything else "Introvert" (class 0).
class_labels = {0: "Introvert", 1: "Extrovert"}
is_extrovert = (y_pred_probs > 0.5).astype(int)
test_predict = is_extrovert.map(class_labels)

In [240]:
# Assemble the two-column submission table: row id + predicted label.
# NOTE: this rebinds `test_df`. From here on it holds the submission rather
# than the test features, so the prediction cell above cannot be re-run
# without reloading the test data first.
submission_columns = {"id": test_df.index, y_var: test_predict}
test_df = pd.DataFrame(submission_columns)

In [241]:
# Write the submission file to the current working directory. index=False
# leaves the DataFrame index out of the file; the ids are already present as
# the explicit "id" column.
test_df.to_csv("submission.csv", index=False)

## Save model

In [242]:
# Persist the fitted feature pipeline and the fitted statsmodels result as two
# separate files in the current working directory; both are needed to score
# new data.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(pipeline, "pipeline.pkl")
joblib.dump(res, "logistic_regression.pkl")

['logistic_regression.pkl']