In [5]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from asgl import Regressor


# Loading Data

The training and test data are loaded and inspected below. Both sets contain missing values, and two feature columns (`Stage_fear` and `Drained_after_socializing`) are categorical (object dtype), as is the `Personality` target.

In [11]:
# Read the competition train and test splits from disk.
train = pd.read_csv('playground-series/train.csv')
test = pd.read_csv('playground-series/test.csv')

# Quick inspection: training shape, the first rows of both splits, and the
# training dtypes (object dtype marks the categorical columns).
print(train.shape)
display(train.head(), test.head())
print(train.dtypes)

(18524, 9)


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,No,6.0,
1,18525,,Yes,0.0,0.0,Yes,5.0,1.0
2,18526,3.0,No,5.0,6.0,No,15.0,9.0
3,18527,3.0,No,4.0,4.0,No,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0


id                             int64
Time_spent_Alone             float64
Stage_fear                    object
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing     object
Friends_circle_size          float64
Post_frequency               float64
Personality                   object
dtype: object


# Converting textual/object columns and Splitting Labels
`Stage_fear` and `Drained_after_socializing` (the categorical columns) will be converted to numerical representations using one-hot encoding. Numerical columns will be standardized. The locations of the NaN values, the original id column, and the training labels are also saved.

In [None]:
def ID_NAs(df):
    """Locate the missing entries of a pandas object.

    Returns a tuple of integer index arrays, one per axis (row positions first,
    then column positions for a DataFrame), listing every cell that `isna()`
    flags as missing.
    """
    missing_mask = df.isna().to_numpy()
    return missing_mask.nonzero()

def LabelSplitter(traindf):
    """Split a 2-D table into a feature matrix and a label vector.

    The input is copied into a NumPy array. Every column except the last is
    returned as the features, and the last column is returned as a 1-D array
    of labels.
    """
    table = np.array(traindf)
    features, target = table[:, :-1], table[:, -1]
    return features, target

def GetIDs(arr):
    """Peel the leading column off a 2-D array.

    Returns `(rest, ids)`: `rest` holds every column after the first, and
    `ids` is the first column kept two-dimensional with shape (n_rows, 1).
    """
    return arr[:, 1:], arr[:, :1]


# Row/column index arrays of every missing value, taken from the untouched
# DataFrames produced by the loading cell.
nanlocs_train = ID_NAs(train)
nanlocs_test = ID_NAs(test)

# Convert to arrays under NEW names instead of rebinding `train`/`test`.
# Rebinding made this cell non-idempotent: a second run would call `.isna()` on
# an ndarray and fail, and the raw frames could only be recovered by reloading.
train_noid, train_ids = GetIDs(np.array(train))
test_noid, test_ids = GetIDs(np.array(test))

# Training rows carry the target in their last column; test rows do not.
xtrain, labels_raw = LabelSplitter(train_noid)
xtest = test_noid
assert xtrain.shape[1] == xtest.shape[1], 'train/test feature columns do not line up'

# Column positions once `id` (and, for train, `Personality`) are removed:
#   0 Time_spent_Alone   1 Stage_fear   2 Social_event_attendance   3 Going_outside
#   4 Drained_after_socializing   5 Friends_circle_size   6 Post_frequency
cat_cols = [1, 4]
cont_cols = [0, 2, 3, 5, 6]

# One-hot encode the categorical columns (dropping the first level) and
# standardize the numeric ones. Output columns follow the transformer order:
# encoded categoricals first, then the scaled numeric columns.
# NOTE(review): missing values in the categorical columns are not imputed here;
# the printed matrix suggests the encoder gives them their own indicator
# column -- confirm against the installed scikit-learn version.
coltrans = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(sparse_output=False, drop='first'), cat_cols),
                  ('cont', StandardScaler(), cont_cols)],
    remainder='passthrough'
)
# Fit on the training features only, then reuse the fitted transform on test.
xtrain_oh = coltrans.fit_transform(xtrain)
xtest_oh = coltrans.transform(xtest)

# Encode the target: 'Extrovert' -> 0, every other value -> 1.
labels = np.where(labels_raw == 'Extrovert', 0, 1)

print(xtrain_oh)


[[ 0.          0.          0.         ... -0.02148798  1.65821945
   0.00621824]
 [ 0.          0.          0.         ... -0.50633192  0.4743288
   1.04822661]
 [ 1.          0.          0.         ... -1.96086373 -1.1831181
  -1.73046236]
 ...
 [ 1.          0.          1.         ... -1.47601979 -1.65667436
          nan]
 [ 1.          0.          1.         ... -1.96086373 -0.70956184
  -1.03579012]
 [ 0.          0.          0.         ...  0.94819989 -0.94633997
   0.70089048]]


# KNN Imputation and Logistic Regression with LASSO Penalization Pipeline

This is where the model pipeline is created. First, K-nearest-neighbors imputation handles the missing values: for each sample with a missing entry, it finds a chosen number of the closest samples and fills the gap from their values, here weighted by distance so that closer neighbors count more.

Then a logistic regression model using a LASSO penalty was used for classification. LASSO is used for variable selection in the logistic regression model. Predicted values (0 or 1) were then mapped back to their respective categories ('Extrovert' or 'Introvert') and saved as a CSV file.

For each step in the pipeline, parameters were optimized using cross validation with GridSearchCV.

In [13]:
# Candidate LASSO penalty strengths: 2, 4, ..., 10 times each power of ten
# from 1e-3 up to 1e1 (25 values, in increasing order).
lambda_grid = [
    round(mantissa * 10**exponent, 4)
    for exponent in range(-3, 2)
    for mantissa in range(2, 12, 2)
]

# Step 1 fills missing values with distance-weighted KNN imputation;
# step 2 fits a LASSO-penalized logistic model.
steps = [
    ('imputer', KNNImputer(weights='distance')),
    ('classifier', Regressor(model='logit', penalization='lasso')),
]
pipeli = Pipeline(steps)

# Hyper-parameters searched jointly, addressed as <step name>__<parameter>.
params = dict(
    imputer__n_neighbors=[3, 7, 11, 18, 25],
    classifier__lambda1=lambda_grid,
)

# Exhaustive cross-validated search, then predict the test set with the
# refit best pipeline.
grid = GridSearchCV(estimator=pipeli, param_grid=params)
grid.fit(xtrain_oh, labels)
labelspred = grid.predict(xtest_oh)

# TODO: Implement Group LASSO, XGBoost model, and explore other classifiers




In [14]:
# Map the integer class codes back to the original label strings.
labeldict = {0: 'Extrovert', 1: 'Introvert'}

# Store the mapped labels under a new name and leave `labelspred` (the raw 0/1
# predictions from the previous cell) untouched. Overwriting it made this cell
# fail with a KeyError on a second run, because the already-mapped strings are
# not keys of `labeldict`.
labelspred_named = np.array([labeldict[val] for val in labelspred])

# Pair each test id with its predicted label: one row per test sample.
submission = np.hstack((test_ids, labelspred_named.reshape((-1, 1))))

print(f'Predictions for test data set: \n {submission}')

# Write the two-column submission file without the DataFrame index.
submissiondf = pd.DataFrame(submission, columns=['id', 'Personality'])
submissiondf.to_csv('submit.csv', index=False)


Predictions for test data set: 
 [[18524 'Extrovert']
 [18525 'Introvert']
 [18526 'Extrovert']
 ...
 [24696 'Extrovert']
 [24697 'Extrovert']
 [24698 'Introvert']]
