In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv("data.csv")

# Check the first few rows
df.head()

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
0,21.0,Female,1,5.89208,2.144395,7.32363,5.462224,Arts,ENTP
1,24.0,Female,1,2.48366,3.206188,8.06876,3.765012,Unknown,INTP
2,26.0,Female,1,7.0291,6.469302,4.16472,5.454442,Others,ESFP
3,30.0,Male,0,5.46525,4.179244,2.82487,5.080477,Sports,ENFJ
4,31.0,Female,0,3.59804,6.189259,5.31347,3.677984,Others,ISFP


In [13]:
print("Missing values:\n", df.isnull().sum())
print("\nNumber of duplicate rows:", df.duplicated().sum())
df = df.drop_duplicates()

Missing values:
 Age                   0
Gender                0
Education             0
Introversion Score    0
Sensing Score         0
Thinking Score        0
Judging Score         0
Interest              0
Personality           0
dtype: int64

Number of duplicate rows: 0


In [None]:
# split personality into 4 binary target variables 
df['IE'] = df['Personality'].apply(lambda x: 1 if x[0] == 'I' else 0)
df['SN'] = df['Personality'].apply(lambda x: 1 if x[1] == 'N' else 0)
df['TF'] = df['Personality'].apply(lambda x: 1 if x[2] == 'T' else 0)
df['JP'] = df['Personality'].apply(lambda x: 1 if x[3] == 'J' else 0)

In [15]:
from sklearn.preprocessing import StandardScaler
# encoding categorical variables (gender and interests)

# set unknown as the baseline category (it will be the one dropped)
df['Interest'] = pd.Categorical(df['Interest'], 
                                 categories=['Unknown', 'Arts', 'Others', 'Sports', 'Technology'],
                                 ordered=True)

#dummy variable encoding 
df_new = pd.get_dummies(df, columns=['Gender', 'Interest'], drop_first=True)

df_new.drop(columns=['Personality'], inplace=True)

# features we are going to train our model with 
feature_cols = ['Age', 'Education', 'Introversion Score', 'Sensing Score',
                'Thinking Score', 'Judging Score'] + \
               [col for col in df_new.columns if col.startswith('Gender_') or col.startswith('Interest_')]

# scale numeric features 
scaler = StandardScaler()
#overwrite feature columns with scaled versions
df_new[feature_cols] = scaler.fit_transform(df_new[feature_cols])



In [17]:
# test pre-processing 
'''
print("Missing values:\n", df_new.isnull().sum().sum())
print("Duplicate rows:", df_new.duplicated().sum())
print("\nScaled feature stats:")
print(df_new[feature_cols].describe().T[['mean', 'std']])
print("\nExample dummy encoding (first 5 rows):")
print(df_new.filter(like='Interest_').head())
'''

'\nprint("Missing values:\n", df_new.isnull().sum().sum())\nprint("Duplicate rows:", df_new.duplicated().sum())\nprint("\nScaled feature stats:")\nprint(df_new[feature_cols].describe().T[[\'mean\', \'std\']])\nprint("\nExample dummy encoding (first 5 rows):")\nprint(df_new.filter(like=\'Interest_\').head())\n'