In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

In [3]:
df=pd.read_csv('personality_dataset.csv')

In [4]:
df.head(10)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
5,1.0,No,7.0,5.0,No,6.0,6.0,Extrovert
6,4.0,No,9.0,,No,7.0,7.0,Extrovert
7,2.0,No,8.0,4.0,No,7.0,8.0,Extrovert
8,10.0,Yes,1.0,3.0,Yes,0.0,3.0,Introvert
9,0.0,No,8.0,6.0,No,13.0,8.0,Extrovert


In [4]:
df.shape

(2900, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2837 non-null   float64
 1   Stage_fear                 2827 non-null   object 
 2   Social_event_attendance    2838 non-null   float64
 3   Going_outside              2834 non-null   float64
 4   Drained_after_socializing  2848 non-null   object 
 5   Friends_circle_size        2823 non-null   float64
 6   Post_frequency             2835 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB


In [6]:
df.isnull().sum()

Time_spent_Alone             63
Stage_fear                   73
Social_event_attendance      62
Going_outside                66
Drained_after_socializing    52
Friends_circle_size          77
Post_frequency               65
Personality                   0
dtype: int64

In [10]:
for col in df.select_dtypes(include='number').columns:
    df[col].fillna(df[col].mean(), inplace=True)

In [11]:
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [16]:
df.isnull().sum()
df['Stage_fear'] = df['Stage_fear'].map({'Yes': 1, 'No': 0})
df['Drained_after_socializing'] = df['Drained_after_socializing'].map({'Yes': 1, 'No': 0})


In [10]:
df.head(10)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,Extrovert
1,9.0,1,0.0,0.0,1,0.0,3.0,Introvert
2,9.0,1,1.0,2.0,1,5.0,2.0,Introvert
3,0.0,0,6.0,7.0,0,14.0,8.0,Extrovert
4,3.0,0,9.0,4.0,0,8.0,5.0,Extrovert
5,1.0,0,7.0,5.0,0,6.0,6.0,Extrovert
6,4.0,0,9.0,3.0,0,7.0,7.0,Extrovert
7,2.0,0,8.0,4.0,0,7.0,8.0,Extrovert
8,10.0,1,1.0,3.0,1,0.0,3.0,Introvert
9,0.0,0,8.0,6.0,0,13.0,8.0,Extrovert


In [7]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['Personality'], random_state=42 )

In [12]:
df_train.shape

(2320, 8)

In [13]:
df_test.shape

(580, 8)

In [14]:
X_train = df_train.drop(columns='Personality',axis=1)
X_test = df_test.drop(columns='Personality',axis=1)
y_test = df_test['Personality']
y_train = df_train['Personality']

In [15]:
model_dt = DecisionTreeClassifier(random_state=42)

In [16]:
model_dt.fit(X_train,y_train)

In [17]:
y_pred = model_dt.predict(X_test)

In [18]:
y_pred

array(['Introvert', 'Extrovert', 'Introvert', 'Extrovert', 'Extrovert',
       'Extrovert', 'Extrovert', 'Introvert', 'Extrovert', 'Introvert',
       'Extrovert', 'Extrovert', 'Extrovert', 'Extrovert', 'Introvert',
       'Extrovert', 'Extrovert', 'Extrovert', 'Introvert', 'Introvert',
       'Introvert', 'Extrovert', 'Extrovert', 'Introvert', 'Introvert',
       'Introvert', 'Introvert', 'Extrovert', 'Extrovert', 'Extrovert',
       'Extrovert', 'Extrovert', 'Introvert', 'Introvert', 'Extrovert',
       'Extrovert', 'Extrovert', 'Extrovert', 'Extrovert', 'Extrovert',
       'Introvert', 'Extrovert', 'Introvert', 'Introvert', 'Introvert',
       'Extrovert', 'Extrovert', 'Extrovert', 'Introvert', 'Introvert',
       'Introvert', 'Introvert', 'Extrovert', 'Introvert', 'Introvert',
       'Extrovert', 'Introvert', 'Extrovert', 'Extrovert', 'Introvert',
       'Introvert', 'Introvert', 'Introvert', 'Extrovert', 'Extrovert',
       'Introvert', 'Introvert', 'Introvert', 'Introvert', 'Extr

In [19]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

   Extrovert       0.87      0.85      0.86       298
   Introvert       0.84      0.87      0.86       282

    accuracy                           0.86       580
   macro avg       0.86      0.86      0.86       580
weighted avg       0.86      0.86      0.86       580



In [20]:
y_pred_train= model_dt.predict(X_train)

In [21]:
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

   Extrovert       0.98      0.99      0.99      1193
   Introvert       0.99      0.98      0.98      1127

    accuracy                           0.98      2320
   macro avg       0.99      0.98      0.98      2320
weighted avg       0.98      0.98      0.98      2320



In [46]:
clf = DecisionTreeClassifier(
    criterion='entropy',           # hoặc 'entropy' nếu bạn muốn thử mức độ phân chia tốt hơn
    max_depth=3,                # giới hạn độ sâu để tránh cây quá phức tạp
    min_samples_split=100,       # node phải có ít nhất 20 mẫu mới được chia tiếp
    min_samples_leaf=90,        # mỗi lá cuối cùng phải có ít nhất 10 mẫu
    max_features='sqrt',        # dùng căn bậc 2 số feature → giảm overfit
    random_state=42             # để kết quả tái lập được
)
clf.fit(X_train,y_train)


In [47]:
y_pred = model_dt.predict(X_test)
y_pred_train= model_dt.predict(X_train)

In [48]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

   Extrovert       0.87      0.85      0.86       298
   Introvert       0.84      0.87      0.86       282

    accuracy                           0.86       580
   macro avg       0.86      0.86      0.86       580
weighted avg       0.86      0.86      0.86       580



In [49]:
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

   Extrovert       0.98      0.99      0.99      1193
   Introvert       0.99      0.98      0.98      1127

    accuracy                           0.98      2320
   macro avg       0.99      0.98      0.98      2320
weighted avg       0.98      0.98      0.98      2320



In [18]:
df_train,df_test = train_test_split(df, test_size=0.2, stratify=df['Personality'])
X_train = df_train.drop(columns='Personality',axis=1)
y_train = df_train['Personality']
X_test = df_test.drop(columns='Personality',axis=1)
y_test = df_test['Personality']


In [30]:
model_knn = KNeighborsClassifier(n_neighbors = 5, weights='distance',metric ='euclidean')
model_knn.fit(X_train,y_train)
y_pred_knn = model_knn.predict(X_test)
print(classification_report(y_test,y_pred_knn))

              precision    recall  f1-score   support

   Extrovert       0.87      0.93      0.90       298
   Introvert       0.92      0.85      0.88       282

    accuracy                           0.89       580
   macro avg       0.89      0.89      0.89       580
weighted avg       0.89      0.89      0.89       580



In [40]:
model_lr = LogisticRegression(penalty='elasticnet',C=0.1,solver='saga',l1_ratio=0.8,max_iter=1000,random_state=42)
model_lr.fit(X_train,y_train)
y_pred_lr = model_lr.predict(X_test)
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

   Extrovert       0.92      0.92      0.92       298
   Introvert       0.91      0.91      0.91       282

    accuracy                           0.92       580
   macro avg       0.92      0.92      0.92       580
weighted avg       0.92      0.92      0.92       580



In [33]:
y_pred_lr = model_lr.predict(X_train)
print(classification_report(y_train,y_pred_lr))

              precision    recall  f1-score   support

   Extrovert       0.94      0.93      0.93      1193
   Introvert       0.92      0.94      0.93      1127

    accuracy                           0.93      2320
   macro avg       0.93      0.93      0.93      2320
weighted avg       0.93      0.93      0.93      2320



In [41]:
joblib.dump(model_lr,'model_lr.pkl')

['model_lr.pkl']