## Imports

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")
df_train.columns = df_train.columns.str.replace(' ','_')
df_test.columns = df_test.columns.str.replace(' ','_')

In [None]:
df_train_X = df_train.drop(columns=['Personality','id'])
df_train_y = df_train['Personality']


In [14]:
import numpy as np

class Int64Converter(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Konwersja do pandas DataFrame jeśli to numpy array
        if isinstance(X, np.ndarray):
            return X.astype(np.int64)
        else:
            return X.astype('Int64')

time_spent_alone_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=2)),
    ('scaler', StandardScaler()),
    ('int64_converter', Int64Converter())
])


stage_fear_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


social_event_attendance_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=3.5)),
    ('scaler', StandardScaler())
])


going_outside_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=2.5)),
    ('scaler', StandardScaler())
])


drained_after_socializing_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


friends_circle_size_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=5)),
    ('scaler', StandardScaler())
])


post_frequency_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=2.5)),
    ('scaler', StandardScaler())
])

In [15]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('col1', time_spent_alone_pipeline, ['Time_spent_Alone']),
    ('col2', stage_fear_pipeline,['Stage_fear']),
    ('col3', social_event_attendance_pipeline,['Social_event_attendance']),
    ('col4', going_outside_pipeline,['Going_outside']),
    ('col5', drained_after_socializing_pipeline,['Drained_after_socializing']),
    ('col6', friends_circle_size_pipeline,['Friends_circle_size']),
    ('col7', post_frequency_pipeline,['Post_frequency']),
])

In [10]:
df_train_X.head(10)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,0.0,No,6.0,4.0,No,15.0,5.0
1,1.0,No,7.0,3.0,No,10.0,8.0
2,6.0,Yes,1.0,0.0,,3.0,0.0
3,3.0,No,7.0,3.0,No,11.0,5.0
4,1.0,No,4.0,4.0,No,13.0,
5,2.0,No,8.0,5.0,No,,3.0
6,1.0,No,8.0,,No,,4.0
7,2.0,No,8.0,3.0,No,4.0,5.0
8,4.0,Yes,2.0,1.0,,0.0,2.0
9,1.0,No,8.0,6.0,No,14.0,9.0


In [18]:
preprocessor.fit(df_train_X)

In [19]:
df_train_transformed_X = preprocessor.transform(df_train_X)
df_test_transformed_X = preprocessor.transform(df_test)

## Basic Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [25]:
basic_lr = LogisticRegression()
basic_lr.fit(df_train_transformed_X, df_train_y)

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df_train_transformed_X, df_train_y, test_size=0.2, random_state=55)

split_basic_lr = LogisticRegression()
split_basic_lr.fit(X_train,y_train)

ypred = split_basic_lr.predict(X_val)
print(accuracy_score(y_val,ypred))

0.9665317139001349


In [28]:
ids = df_test['id']
X_test = df_test.drop(['id'],axis=1)
test_transformed = preprocessor.transform(X_test)
test_ypred = basic_lr.predict(test_transformed)
submission = pd.DataFrame({
    'id':ids,
    'Personality': test_ypred
})
submission.to_csv('basic_lr.csv', index=False)