In [1]:
import warnings

import numpy as np
import optuna
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Competition files read from the Kaggle input directory: the labelled training
# set, the unlabelled test set, and the sample submission.
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
# NOTE: `submission` is rebound to a freshly built frame in the prediction cell,
# so this sample file is only available under this name until then.
submission = pd.read_csv('/kaggle/input/playground-series-s5e7/sample_submission.csv')
# Separate "original" personality dataset; it is left-merged onto train/test
# below to produce the `match_p` column.
original = pd.read_csv('/kaggle/input/extrovert-vs-introvert-behavior-data-backup/personality_datasert.csv')

In [3]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [4]:
# Rename the original dataset's label so it does not collide with train's
# `Personality` target and instead arrives as a separate `match_p` column.
original = original.rename(columns={'Personality': 'match_p'})
# Despite the name, nothing is dropped with `drop_cols`: it is the list of every
# column of `original` except `match_p`, used as the duplicate-detection subset.
drop_cols = [col for col in original.columns if col != 'match_p']
# Keep one row per distinct feature combination so the left merges below cannot
# multiply rows of train/test.
original = original.drop_duplicates(subset=drop_cols)

# Merge with train and test
# No `on=` is given, so pandas joins on the columns the two frames have in common.
# how='left' keeps every train/test row; `match_p` is NaN where nothing matched.
train = train.merge(original, how='left')
test = test.merge(original, how='left')

In [5]:
# Preview train again after the merge; the new `match_p` column is now present.
train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality,match_p
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert,
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert,
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert,
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert,
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert,


In [6]:
# Dtypes and non-null counts per column (shows which columns contain missing values).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
 9   match_p                    178 non-null    object 
dtypes: float64(5), int64(1), object(4)
memory usage: 1.4+ MB


In [7]:
# Summary statistics for the numeric columns.
train.describe()

Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,18524.0,17334.0,17344.0,17058.0,17470.0,17260.0
mean,9261.5,3.137764,5.265106,4.044319,7.996737,4.982097
std,5347.562529,3.003786,2.753359,2.06258,4.223484,2.879139
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4630.75,1.0,3.0,3.0,5.0,3.0
50%,9261.5,2.0,5.0,4.0,8.0,5.0
75%,13892.25,4.0,8.0,6.0,12.0,7.0
max,18523.0,11.0,10.0,7.0,15.0,10.0


In [8]:
# Separate the feature frame from the target column.
# X still carries `id` and `match_p`; `id` is removed inside the preprocessing functions.
X = train.drop(columns=['Personality'])
y = train['Personality']

In [9]:
# Encode the string target as integer class codes. The fitted encoder is kept so
# predictions can be mapped back to the original labels with inverse_transform.
target_encoder = LabelEncoder()
# Wrapped in a Series (fresh 0..n-1 index) so it can be sliced with the fold indices later.
y = pd.Series(target_encoder.fit_transform(y))
print("[INFO] Label encoding completed. Classes:", target_encoder.classes_)

[INFO] Label encoding completed. Classes: ['Extrovert' 'Introvert']


In [10]:
def preprocess_fold(X_train, X_val):
    """Clean and ordinal-encode the train/validation frames of one CV fold.

    Both frames are modified in place (pass copies to protect the caller's
    data) and are also returned. Each frame is cleaned independently:

    * column names are lower-cased and spaces become underscores;
    * the ``id`` column is dropped when present;
    * missing ``stage_fear`` / ``drained_after_socializing`` / ``match_p``
      values become the string ``'unknown'``;
    * ``match_p_is_null`` (0/1) records whether ``match_p`` was missing;
    * numeric NaNs are replaced by that frame's own column mean.

    The ordinal encoder is fitted on the training frame's object columns only
    and then applied to the validation frame; categories unseen during fitting
    are encoded as ``-1``.

    Returns:
        tuple: ``(X_train, X_val, encoder)`` with the fitted ``OrdinalEncoder``.
    """
    placeholder_cols = ('stage_fear', 'drained_after_socializing')

    def _clean(frame):
        # Normalise headers first so every later lookup uses snake_case names.
        frame.columns = frame.columns.str.lower().str.replace(' ', '_')
        frame.drop(columns=['id'], inplace=True, errors='ignore')
        for name in placeholder_cols:
            frame[name] = frame[name].fillna('unknown')
        # The missingness flag must be taken before match_p itself is filled.
        frame['match_p_is_null'] = frame['match_p'].isna().astype(int)
        frame['match_p'] = frame['match_p'].fillna('unknown')
        numeric_names = frame.select_dtypes(include='number').columns
        for name in numeric_names:
            frame[name] = frame[name].fillna(frame[name].mean())

    _clean(X_train)
    _clean(X_val)

    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    cat_cols = list(X_train.select_dtypes(include="object").columns)
    X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
    X_val[cat_cols] = encoder.transform(X_val[cat_cols])
    return X_train, X_val, encoder

In [11]:
# Random forest configuration used by the cross-validation loop.
# max_features=None lets every split consider all features, n_jobs=-1 uses all
# available cores, and random_state=42 makes the fit reproducible.
# NOTE(review): the specific values (344 trees, depth 11, min_samples_split 11)
# look like tuned hyperparameters, presumably from an Optuna search that is not
# part of the visible cells; confirm their origin.
rf = RandomForestClassifier(n_estimators=344, max_depth=11, max_features=None,
                            min_samples_split=11, min_samples_leaf=1,
                            random_state=42, n_jobs=-1)

In [12]:
# Stratified 5-fold cross-validation.
# Every fold trains its OWN estimator, cloned from the unfitted `rf` template.
# Fitting `rf` itself and appending it would store the same object five times
# (each refit overwriting the previous one), so `models` would hold five
# references to the last-fold model instead of five distinct fold models.
# As a consequence `rf` itself is left unfitted; use the entries of `models`.
models, scores = [], []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n[INFO] Fold {fold + 1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    # The split yields positional indices, so select positionally on y as well.
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Copies keep the shared feature frame X untouched by the in-place preprocessing.
    X_train_prep, X_val_prep, encoder = preprocess_fold(X_train.copy(), X_val.copy())

    # clone() copies the hyperparameters (including random_state) without any fitted state.
    fold_model = clone(rf)
    fold_model.fit(X_train_prep, y_train)
    acc = accuracy_score(y_val, fold_model.predict(X_val_prep))
    print(f"[INFO] Accuracy: {acc:.6f}")
    models.append(fold_model)
    scores.append(acc)

print("\n[INFO] Mean CV Accuracy:", np.mean(scores))


[INFO] Fold 1
[INFO] Accuracy: 0.971390

[INFO] Fold 2
[INFO] Accuracy: 0.968691

[INFO] Fold 3
[INFO] Accuracy: 0.967341

[INFO] Fold 4
[INFO] Accuracy: 0.970580

[INFO] Fold 5
[INFO] Accuracy: 0.972192

[INFO] Mean CV Accuracy: 0.9700389847354721


In [13]:
def preprocess_final_test(df, encoder):
    """Apply the fold-style cleaning to a test frame using a fitted encoder.

    ``df`` is modified in place (pass a copy to keep the original) and the
    same object is returned.

    Args:
        df: Frame containing ``stage_fear``, ``drained_after_socializing`` and
            ``match_p`` (matched after lower-casing the headers); an ``id``
            column is dropped when present.
        encoder: Already-fitted object whose ``transform`` maps the frame's
            object-dtype columns to numeric codes.

    Returns:
        The cleaned, fully numeric frame.
    """
    # Normalise headers so the lookups below use snake_case names.
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    df.drop(columns=['id'], inplace=True, errors='ignore')

    for name in ('stage_fear', 'drained_after_socializing'):
        df[name] = df[name].fillna('unknown')

    # Record missingness before match_p is overwritten with the placeholder.
    df['match_p_is_null'] = df['match_p'].isna().astype(int)
    df['match_p'] = df['match_p'].fillna('unknown')

    # Numeric gaps are filled with the mean of this frame's own column.
    numeric_names = df.select_dtypes(include='number').columns
    for name in numeric_names:
        column_mean = df[name].mean()
        df[name] = df[name].fillna(column_mean)

    cat_cols = df.select_dtypes(include='object').columns
    df[cat_cols] = encoder.transform(df[cat_cols])
    return df

In [14]:
# Preprocess a copy so `test` keeps its `id` column for the submission frame.
# `encoder` is the one left over from the final iteration of the CV loop.
X_test = preprocess_final_test(test.copy(), encoder)

# Prediction
# Accumulate the class-probability matrices of the stored models, then average.
proba_total = 0
for fold_model in models:
    proba_total = proba_total + fold_model.predict_proba(X_test)
probas = proba_total / len(models)

# Most probable class per row, mapped back to the original string labels.
predicted_codes = np.argmax(probas, axis=1)
preds = target_encoder.inverse_transform(predicted_codes)

# Submission
submission = pd.DataFrame({'id': test['id'], 'Personality': preds})
submission.to_csv('submission.csv', index=False)
print("[INFO] Submission saved to 'submission.csv'")

[INFO] Submission saved to 'submission.csv'


In [15]:
# Sanity-check the first rows of the generated submission.
submission.head()

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
