In [168]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

from sklearn.svm import SVC

# Data Loading

In [178]:
df = pd.read_csv(
    "../Membangun_model/predict_the_introverts_from_the_extroverts_preprocessing/train_preprocessing.csv"
)
print(df.shape)
df.head()

(10189, 11)


Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,Stage_fear_No,Stage_fear_Yes,Drained_after_socializing_No,Drained_after_socializing_Yes,Personality
0,0,-1.001004,0.122633,-0.236003,1.595076,-0.184829,1.0,0.0,1.0,0.0,0
1,1,-0.620847,0.507691,-0.766592,0.367511,0.922299,1.0,0.0,1.0,0.0,0
2,3,0.139467,0.507691,-0.766592,0.613024,-0.184829,1.0,0.0,1.0,0.0,0
3,7,-0.24069,0.892749,-0.766592,-1.105568,-0.184829,1.0,0.0,1.0,0.0,0
4,9,-0.620847,0.892749,0.825176,1.349563,1.291341,1.0,0.0,1.0,0.0,0


In [179]:
# separate target and features
X = df.drop(columns=["id", "Personality"], axis=1)
y = df["Personality"]

In [180]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Modelling

## Logistic Regression

In [181]:
# logistic regression
logreg_model = Pipeline(
    steps=[("logreg", LogisticRegression(solver="liblinear", random_state=42))]
)

logreg_model.fit(X_train, y_train)

logreg_scores = cross_val_score(
    logreg_model, X_train, y_train, cv=5, scoring="accuracy"
)
for i, score in enumerate(logreg_scores):
    print(f"Cross val: {i + 1}, Accuracy: {score}")

Cross val: 1, Accuracy: 0.9583077866339669
Cross val: 2, Accuracy: 0.9736196319018405
Cross val: 3, Accuracy: 0.9650306748466257
Cross val: 4, Accuracy: 0.9613496932515337
Cross val: 5, Accuracy: 0.9705521472392638


In [182]:
# validation
y_pred_logreg = logreg_model.predict(X_test)
val_score_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy validation data: {val_score_logreg}")

Accuracy validation data: 0.9573110893032385


## XGBoost

In [174]:
# xgboost
xgb_model = Pipeline(
    steps=[
        (
            "xgboost",
            XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                random_state=42,
                n_jobs=-1,
            ),
        )
    ]
)

xgb_model.fit(X_train, y_train)

xgb_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring="accuracy")
for i, score in enumerate(xgb_scores):
    print(f"Cross val: {i + 1}, Accuracy: {score}")

Cross val: 1, Accuracy: 0.9564684242795831
Cross val: 2, Accuracy: 0.9693251533742331
Cross val: 3, Accuracy: 0.9668711656441717
Cross val: 4, Accuracy: 0.9631901840490797
Cross val: 5, Accuracy: 0.9674846625766871


In [175]:
# validation
y_pred_xgb = xgb_model.predict(X_test)
val_score_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy validation data: {val_score_xgb}")

Accuracy validation data: 0.9592737978410206


## Support Vector Machine

In [183]:
# svm
svm_model = Pipeline(
    steps=[("svm", SVC(kernel="rbf", C=1, gamma="scale", class_weight="balanced"))]
)

svm_model.fit(X_train, y_train)

xgb_scores = cross_val_score(svm_model, X_train, y_train, cv=5, scoring="accuracy")
for i, score in enumerate(xgb_scores):
    print(f"Cross val: {i + 1}, Accuracy: {score}")

Cross val: 1, Accuracy: 0.9576946658491723
Cross val: 2, Accuracy: 0.9730061349693252
Cross val: 3, Accuracy: 0.9656441717791411
Cross val: 4, Accuracy: 0.9607361963190184
Cross val: 5, Accuracy: 0.9705521472392638


In [184]:
# validation
y_pred_svm = svm_model.predict(X_test)
val_score_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy validation data: {val_score_svm}")

Accuracy validation data: 0.9573110893032385
