In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from asgl import Regressor


# Loading Data

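The training and test data are loaded and inspected. Some values are missing throughout the dataset (shown as `NaN`). Two feature columns, `Stage_fear` and `Drained_after_socializing`, are categorical (dtype `object`), as is the target column `Personality`.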

In [None]:
# Read the train and test CSV files from the local data folder.
train = pd.read_csv('playground-series/train.csv')
test = pd.read_csv('playground-series/test.csv')

# First look at the data: size of the training frame, a preview of both
# frames, and the per-column dtypes of the training frame.
print(train.shape)
display(train.head(), test.head())
print(train.dtypes)

(18524, 9)


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,No,7.0,4.0,No,6.0,
1,18525,,Yes,0.0,0.0,Yes,5.0,1.0
2,18526,3.0,No,5.0,6.0,No,15.0,9.0
3,18527,3.0,No,4.0,4.0,No,5.0,6.0
4,18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0


id                             int64
Time_spent_Alone             float64
Stage_fear                    object
Social_event_attendance      float64
Going_outside                float64
Drained_after_socializing     object
Friends_circle_size          float64
Post_frequency               float64
Personality                   object
dtype: object


# Converting textual/object columns and Splitting Labels
`Stage_fear` and `Drained_after_socializing` (the categorical columns) are converted to numerical representations using one-hot encoding. The locations of the NaN values, the original id column, and the training labels are also saved. Missing values are then imputed using K-nearest-neighbor imputation. Note that the one-hot encoder gives missing categorical entries their own indicator column, so the KNN step only fills in the numeric columns.

In [560]:
def ID_NAs(df):
    """Locate every missing entry of ``df``.

    Parameters
    ----------
    df : object exposing ``isna()`` (e.g. a pandas DataFrame)
        Data to scan for missing values.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where`` applied to the missing-value mask. For a
        2-D frame this is ``(row_positions, column_positions)``, one pair per
        missing cell.
    """
    missing_mask = df.isna()
    return np.where(missing_mask)

def LabelSplitter(traindf):
    """Separate the label column (the last one) from the feature columns.

    Parameters
    ----------
    traindf : array-like, 2-D
        Training data whose final column holds the labels.

    Returns
    -------
    x : numpy.ndarray
        Every column except the last.
    y : numpy.ndarray
        The last column as a 1-D array.
    """
    data = np.array(traindf)
    features, target = data[:, :-1], data[:, -1]
    return features, target

def GetIDs(arr):
    """Split the leading id column off a 2-D array.

    Parameters
    ----------
    arr : numpy.ndarray, 2-D
        Array whose first column holds the ids.

    Returns
    -------
    arr : numpy.ndarray
        The input without its first column.
    ids : numpy.ndarray
        The first column, kept 2-D with shape ``(n_rows, 1)``.
    """
    id_column = arr[:, :1]
    remaining = arr[:, 1:]
    return remaining, id_column


# Record where values are missing before any conversion or imputation.
# Each result is the np.where output on the frame's missing-value mask.
nanlocs_train = ID_NAs(train)
nanlocs_test = ID_NAs(test)

# Convert to arrays under NEW names. Rebinding `train`/`test` here would
# replace the DataFrames with id-stripped arrays, so re-running this cell
# would fail in ID_NAs (ndarrays have no .isna()) and the raw frames would be
# lost. Keeping the frames untouched makes the cell safe to re-run.
train_arr, train_ids = GetIDs(np.array(train))
test_arr, test_ids = GetIDs(np.array(test))

# The training array still ends with the label column; the test array has no
# label column, so it is already the feature matrix.
xtrain, labels = LabelSplitter(train_arr)
xtest = test_arr

# With the id column removed, positions 1 and 4 are Stage_fear and
# Drained_after_socializing (the two object-dtype feature columns).
# NOTE(review): as the printed sample shows, each of these columns yields two
# indicator columns after drop='first' -- the encoder treats NaN as its own
# category -- so missing categorical entries are flagged rather than imputed
# by the KNN step below. Confirm this is the intended treatment.
coltrans = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(sparse_output=False, drop='first'), [1, 4])],
    remainder='passthrough',
)
xtrain_oh = coltrans.fit_transform(xtrain)
xtest_oh = coltrans.transform(xtest)

# Binary target: 'Extrovert' -> 0, every other label -> 1.
labels = np.where(labels == 'Extrovert', 0, 1)

print(f'Before imputation (sample): \n {xtrain_oh[:15]}')

# Fit the imputer on the training features only and reuse it for the test
# features. Imputed entries are distance-weighted averages over 5 neighbours,
# so they are rounded back to whole numbers.
knnimp = KNNImputer(n_neighbors=5, weights='distance')
xtrain_oh = np.round(knnimp.fit_transform(xtrain_oh))
xtest_oh = np.round(knnimp.transform(xtest_oh))

print(f'After imputation (sample): \n {xtrain_oh[:15]}')


Before imputation (sample): 
 [[0.0 0.0 0.0 0.0 0.0 6.0 4.0 15.0 5.0]
 [0.0 0.0 0.0 0.0 1.0 7.0 3.0 10.0 8.0]
 [1.0 0.0 0.0 1.0 6.0 1.0 0.0 3.0 0.0]
 [0.0 0.0 0.0 0.0 3.0 7.0 3.0 11.0 5.0]
 [0.0 0.0 0.0 0.0 1.0 4.0 4.0 13.0 nan]
 [0.0 0.0 0.0 0.0 2.0 8.0 5.0 nan 3.0]
 [0.0 0.0 0.0 0.0 1.0 8.0 nan nan 4.0]
 [0.0 0.0 0.0 0.0 2.0 8.0 3.0 4.0 5.0]
 [1.0 0.0 0.0 1.0 4.0 2.0 1.0 0.0 2.0]
 [0.0 0.0 0.0 0.0 1.0 8.0 6.0 14.0 9.0]
 [0.0 0.0 0.0 0.0 3.0 7.0 4.0 5.0 10.0]
 [0.0 0.0 0.0 0.0 2.0 6.0 3.0 4.0 8.0]
 [0.0 0.0 0.0 0.0 3.0 5.0 4.0 9.0 6.0]
 [0.0 1.0 0.0 0.0 3.0 nan 5.0 12.0 5.0]
 [0.0 0.0 0.0 0.0 3.0 6.0 4.0 9.0 nan]]
After imputation (sample): 
 [[ 0.  0.  0.  0.  0.  6.  4. 15.  5.]
 [ 0.  0.  0.  0.  1.  7.  3. 10.  8.]
 [ 1.  0.  0.  1.  6.  1.  0.  3.  0.]
 [ 0.  0.  0.  0.  3.  7.  3. 11.  5.]
 [ 0.  0.  0.  0.  1.  4.  4. 13.  5.]
 [ 0.  0.  0.  0.  2.  8.  5. 12.  3.]
 [ 0.  0.  0.  0.  1.  8.  4. 11.  4.]
 [ 0.  0.  0.  0.  2.  8.  3.  4.  5.]
 [ 1.  0.  0.  1.  4.  2.  1.  0.  2

# Logistic Regression with LASSO Penalization

Here, a logistic regression model with a LASSO (L1) penalty is used for classification; the penalty also performs variable selection within the logistic regression model. The penalty strength λ (`lambda1`) is chosen by 5-fold cross-validation over a grid of candidate values. Predicted values (0 or 1) are then mapped back to their respective categories ('Extrovert' or 'Introvert') and saved as a CSV file.

In [None]:
# Candidate penalty strengths: {2, 4, 6, 8, 10} * 10**i for i = -3..1,
# i.e. 25 values ranging from 0.002 to 100.
lambdas = [round(b*10**i, 4) for i in range(-3, 2) for b in range(2, 12, 2)]
logreg = Regressor(model='logit', penalization='lasso')

# Select lambda1 by classification accuracy. Without an explicit `scoring`,
# GridSearchCV falls back to the estimator's own `score` method, which is not
# guaranteed to be a classification metric for this estimator.
cv = GridSearchCV(
    estimator=logreg,
    param_grid={'lambda1': lambdas},
    scoring='accuracy',
    cv=5,
)
cv.fit(xtrain_oh, labels)

# GridSearchCV refits the best configuration on all training data by default,
# so predict() uses the selected lambda1.
labelspred = cv.predict(xtest_oh)

# TODO: Implement Group LASSO, XGBoost model, and test other classifiers




In [None]:
# Inverse of the label encoding used for training ('Extrovert' -> 0, other -> 1).
labeldict = {0: 'Extrovert', 1: 'Introvert'}

# Store the decoded names under a new variable. Overwriting `labelspred` with
# strings would make this cell fail with a KeyError when re-run, because the
# dictionary lookup expects the 0/1 codes.
personality_pred = np.array([labeldict[val] for val in labelspred])

# Pair every test id with its predicted personality: one row per test sample.
submission = np.hstack((test_ids, personality_pred.reshape((-1, 1))))

print(submission)


[[18524 'Extrovert']
 [18525 'Introvert']
 [18526 'Extrovert']
 ...
 [24696 'Extrovert']
 [24697 'Extrovert']
 [24698 'Introvert']]


In [None]:

# Turn the (id, predicted label) rows into a two-column table and write it to
# disk without the pandas row index.
submissiondf = pd.DataFrame(data=submission, columns=['id', 'Personality'])
submissiondf.to_csv('submit.csv', index=False)