In [35]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import KNNImputer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [68]:
data = pd.read_csv("train.csv")
data

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert
...,...,...,...,...,...,...,...,...,...
18519,18519,3.0,No,7.0,3.0,No,9.0,7.0,Extrovert
18520,18520,1.0,,6.0,7.0,No,6.0,5.0,Extrovert
18521,18521,7.0,Yes,1.0,1.0,Yes,1.0,,Introvert
18522,18522,,Yes,1.0,0.0,Yes,5.0,2.0,Introvert


In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [70]:
data.describe()


Unnamed: 0,id,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,18524.0,17334.0,17344.0,17058.0,17470.0,17260.0
mean,9261.5,3.137764,5.265106,4.044319,7.996737,4.982097
std,5347.562529,3.003786,2.753359,2.06258,4.223484,2.879139
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,4630.75,1.0,3.0,3.0,5.0,3.0
50%,9261.5,2.0,5.0,4.0,8.0,5.0
75%,13892.25,4.0,8.0,6.0,12.0,7.0
max,18523.0,11.0,10.0,7.0,15.0,10.0


In [38]:
data.isna().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [39]:
data['Stage_fear'] = (data['Stage_fear'] == "Yes").astype(int)
data

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,0,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,1,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,0,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,0,4.0,4.0,No,13.0,,Extrovert
...,...,...,...,...,...,...,...,...,...
18519,18519,3.0,0,7.0,3.0,No,9.0,7.0,Extrovert
18520,18520,1.0,0,6.0,7.0,No,6.0,5.0,Extrovert
18521,18521,7.0,1,1.0,1.0,Yes,1.0,,Introvert
18522,18522,,1,1.0,0.0,Yes,5.0,2.0,Introvert


In [None]:
data['Drained_after_socializing'] = data['Drained_after_socializing'].map({
    'Yes': 1,
    'No': 0
})
data

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0,6.0,4.0,0.0,15.0,5.0,Extrovert
1,1,1.0,0,7.0,3.0,0.0,10.0,8.0,Extrovert
2,2,6.0,1,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,0,7.0,3.0,0.0,11.0,5.0,Extrovert
4,4,1.0,0,4.0,4.0,0.0,13.0,,Extrovert
...,...,...,...,...,...,...,...,...,...
18519,18519,3.0,0,7.0,3.0,0.0,9.0,7.0,Extrovert
18520,18520,1.0,0,6.0,7.0,0.0,6.0,5.0,Extrovert
18521,18521,7.0,1,1.0,1.0,1.0,1.0,,Introvert
18522,18522,,1,1.0,0.0,1.0,5.0,2.0,Introvert


In [41]:
# Step 1: Separate numeric and non-numeric columns
numeric_cols = data.select_dtypes(include='number').columns
non_numeric_cols = data.select_dtypes(exclude='number').columns

# Step 2: Apply KNN Imputer on numeric data only
imputer = KNNImputer(n_neighbors=5)
numeric_data = pd.DataFrame(imputer.fit_transform(data[numeric_cols]), columns=numeric_cols)

# Step 3: Combine numeric and non-numeric data
data_imputed = pd.concat([numeric_data, data[non_numeric_cols].reset_index(drop=True)], axis=1)

In [8]:
data_imputed["Personality"] = data_imputed["Personality"].map({
    "Extrovert":1 ,
    "Introvert" :0 })  

In [42]:
data_imputed

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0.0,0.0,0.0,6.0,4.0,0.0,15.0,5.0,Extrovert
1,1.0,1.0,0.0,7.0,3.0,0.0,10.0,8.0,Extrovert
2,2.0,6.0,1.0,1.0,0.0,0.0,3.0,0.0,Introvert
3,3.0,3.0,0.0,7.0,3.0,0.0,11.0,5.0,Extrovert
4,4.0,1.0,0.0,4.0,4.0,0.0,13.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...,...
18519,18519.0,3.0,0.0,7.0,3.0,0.0,9.0,7.0,Extrovert
18520,18520.0,1.0,0.0,6.0,7.0,0.0,6.0,5.0,Extrovert
18521,18521.0,7.0,1.0,1.0,1.0,1.0,1.0,5.6,Introvert
18522,18522.0,3.4,1.0,1.0,0.0,1.0,5.0,2.0,Introvert


In [43]:
data_imputed = data_imputed.drop("id",axis=1)
data_imputed

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0.0,0.0,6.0,4.0,0.0,15.0,5.0,Extrovert
1,1.0,0.0,7.0,3.0,0.0,10.0,8.0,Extrovert
2,6.0,1.0,1.0,0.0,0.0,3.0,0.0,Introvert
3,3.0,0.0,7.0,3.0,0.0,11.0,5.0,Extrovert
4,1.0,0.0,4.0,4.0,0.0,13.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
18519,3.0,0.0,7.0,3.0,0.0,9.0,7.0,Extrovert
18520,1.0,0.0,6.0,7.0,0.0,6.0,5.0,Extrovert
18521,7.0,1.0,1.0,1.0,1.0,1.0,5.6,Introvert
18522,3.4,1.0,1.0,0.0,1.0,5.0,2.0,Introvert


In [47]:
print(data_imputed.columns.tolist())


['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality']


In [48]:
train=data_imputed.drop('Personality',axis=1)
test=data_imputed['Personality']
x_train,x_test,y_train,y_test=train_test_split(train,test,random_state=66,test_size=.2)
print(y_test)
print(x_test)

12517    Extrovert
6112     Extrovert
2005     Extrovert
7832     Extrovert
15667    Extrovert
           ...    
1372     Extrovert
13736    Extrovert
15604    Extrovert
13486    Introvert
15297    Introvert
Name: Personality, Length: 3705, dtype: object
       Time_spent_Alone  Stage_fear  Social_event_attendance  Going_outside  \
12517               4.0         1.0                      3.4            2.0   
6112                0.0         0.0                      9.0            7.0   
2005                0.8         0.0                      5.0            6.0   
7832                1.0         0.0                      7.0            5.0   
15667               3.0         0.0                      8.0            5.0   
...                 ...         ...                      ...            ...   
1372                2.0         0.0                      9.0            6.0   
13736               0.0         0.0                      8.0            7.0   
15604               0.0         0

In [26]:
param_grid = {
    'hidden_layer_sizes': [(64, 32), (128, 64), (64, 64, 32)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01]
}

grid = GridSearchCV(MLPRegressor(max_iter=500), param_grid, scoring='neg_mean_squared_error', cv=5)
grid.fit(x_train, y_train)

print("Best parameters:", grid.best_params_)

Best parameters: {'alpha': 0.01, 'hidden_layer_sizes': (64, 64, 32), 'learning_rate_init': 0.01}


In [28]:
regr = MLPRegressor(alpha=0.01,hidden_layer_sizes=(64, 64, 32), 
                    learning_rate_init=0.01)
regr.fit(x_train, y_train)
regr.predict(x_test)
regr.score(x_test,y_test)


0.8355824913058697

In [24]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2))
clf.fit(x_train, y_train)
#y_pred = clf.predict(x_test)
clf.score(x_test,y_test)

0.7384615384615385

In [None]:
param_grid = {
    'hidden_layer_sizes': [(64, 32), (128, 64), (64, 64, 32)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01]
}

grid = GridSearchCV(MLPClassifier(max_iter=500), param_grid, scoring='neg_mean_squared_error', cv=5)

In [50]:
model = LogisticRegression( max_iter=10000)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(x_train)
print(y_train)

Accuracy: 0.9668016194331984
       Time_spent_Alone  Stage_fear  Social_event_attendance  Going_outside  \
10186               2.0         0.0                      4.0            5.0   
16884               0.0         0.0                      3.0            3.0   
15185               2.0         0.0                      4.0            3.0   
12750               2.0         0.0                      5.0            7.0   
13199               9.0         1.0                      0.0            0.0   
...                 ...         ...                      ...            ...   
9165                7.0         1.0                      0.0            0.0   
5199                2.0         0.0                      7.0            3.0   
18125               0.0         0.0                      4.0            6.0   
8243                3.0         0.0                      8.0            4.0   
8823                3.0         0.0                      7.0            6.0   

       Drained_after_s

In [None]:
test_data = pd.read_csv('test.csv')
test_data


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,No,6.0,
1,18525,,Yes,0.0,0.0,Yes,5.0,1.0
2,18526,3.0,No,5.0,6.0,No,15.0,9.0
3,18527,3.0,No,4.0,4.0,No,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0
...,...,...,...,...,...,...,...,...
6170,24694,3.0,No,5.0,5.0,No,9.0,6.0
6171,24695,8.0,Yes,2.0,1.0,Yes,0.0,0.0
6172,24696,2.0,No,4.0,3.0,No,9.0,7.0
6173,24697,3.0,No,4.0,4.0,No,11.0,9.0


In [58]:
test_data['Drained_after_socializing'] = test_data['Drained_after_socializing'].map({
    'Yes': 1,
    'No': 0
})
test_data

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,0.0,6.0,
1,18525,,Yes,0.0,0.0,1.0,5.0,1.0
2,18526,3.0,No,5.0,6.0,0.0,15.0,9.0
3,18527,3.0,No,4.0,4.0,0.0,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
6170,24694,3.0,No,5.0,5.0,0.0,9.0,6.0
6171,24695,8.0,Yes,2.0,1.0,1.0,0.0,0.0
6172,24696,2.0,No,4.0,3.0,0.0,9.0,7.0
6173,24697,3.0,No,4.0,4.0,0.0,11.0,9.0


In [59]:
test_data['Stage_fear'] = test_data['Stage_fear'].map({
    'Yes': 1,
    'No': 0
})
test_data

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,0.0,7.0,4.0,0.0,6.0,
1,18525,,1.0,0.0,0.0,1.0,5.0,1.0
2,18526,3.0,0.0,5.0,6.0,0.0,15.0,9.0
3,18527,3.0,0.0,4.0,4.0,0.0,5.0,6.0
4,18528,9.0,1.0,1.0,2.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
6170,24694,3.0,0.0,5.0,5.0,0.0,9.0,6.0
6171,24695,8.0,1.0,2.0,1.0,1.0,0.0,0.0
6172,24696,2.0,0.0,4.0,3.0,0.0,9.0,7.0
6173,24697,3.0,0.0,4.0,4.0,0.0,11.0,9.0


In [60]:
test_data.isna().sum()

id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
dtype: int64

In [62]:
# Step 1: Separate numeric and non-numeric columns
numeric_cols = test_data.select_dtypes(include='number').columns
non_numeric_cols = test_data.select_dtypes(exclude='number').columns

# Step 2: Apply KNN Imputer on numeric data only
imputer = KNNImputer(n_neighbors=5)
numeric_data = pd.DataFrame(imputer.fit_transform(test_data[numeric_cols]), columns=numeric_cols)

# Step 3: Combine numeric and non-numeric data
data_imputed_test = pd.concat([numeric_data, test_data[non_numeric_cols].reset_index(drop=True)], axis=1)
data_imputed_test

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524.0,3.0,0.0,7.0,4.0,0.0,6.0,5.2
1,18525.0,5.6,1.0,0.0,0.0,1.0,5.0,1.0
2,18526.0,3.0,0.0,5.0,6.0,0.0,15.0,9.0
3,18527.0,3.0,0.0,4.0,4.0,0.0,5.0,6.0
4,18528.0,9.0,1.0,1.0,2.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
6170,24694.0,3.0,0.0,5.0,5.0,0.0,9.0,6.0
6171,24695.0,8.0,1.0,2.0,1.0,1.0,0.0,0.0
6172,24696.0,2.0,0.0,4.0,3.0,0.0,9.0,7.0
6173,24697.0,3.0,0.0,4.0,4.0,0.0,11.0,9.0


In [63]:
data_imputed_test.isna().sum()

id                           0
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
dtype: int64

In [None]:
feature_columns = [col for col in data_imputed_test.columns if col not in ["id"]]
X_test = data_imputed_test[feature_columns]
y_test_pred = model.predict(X_test) 
submission = pd.DataFrame({
    "id": data_imputed_test["id"].astype(int),
    "Personality": y_test_pred
})
submission

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
...,...,...
6170,24694,Extrovert
6171,24695,Introvert
6172,24696,Extrovert
6173,24697,Extrovert


In [67]:
submission.to_csv("submission.csv", index=False)