In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
       
train_df = pd.read_csv("/kaggle/input/ml-1-spring-2023-challenge-2/training.csv");
test_df = pd.read_csv("/kaggle/input/ml-1-spring-2023-challenge-2/testing.csv");

In [3]:
# Split the training data into features and target variable
X_train = train_df.drop(['TARGET_LABEL'], axis=1)
y_train = train_df['TARGET_LABEL']

In [4]:

# Identify categorical and numerical columns
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

In [5]:
# Define preprocessing pipelines for categorical and numerical columns
cat_pipe = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [6]:
num_pipe = Pipeline([
    ('scaler', StandardScaler())
])

In [7]:
# Combine the pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, cat_cols),
        ('num', num_pipe, num_cols)
    ])

In [8]:
# Train a logistic regression model
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [9]:
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['row ID', 'DATETIME', 'CREDIT_ANALYSIS_PROCESS', 'Gender',
       'MaritalStatus', 'Flag_TelResidence', 'TypeOfResidence',
       'FLAG_MotherName', 'Flag_FatherName', 'Flag_JobSameCity',
       'Flag_OtherCard'],
      dtype='object')),
                                                 ('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ID_CLIENT', 'BranchID', 'Age', 'NumberOfDependents', 'BillDueDate',
       'MonthsInCurrentResidence', 'MonthsInCurrentJob',
       'NumberOfBankAcc

In [10]:
# Make predictions on the testing data
X_test = test_df
y_pred_proba = clf.predict_proba(X_test)[:, 1]

In [11]:
# Output the predicted probabilities in a CSV file
output_df = pd.DataFrame({'row ID': test_df['row ID'], 'TARGET_LABEL': y_pred_proba})
output_df.to_csv('/kaggle/working/predictions.csv', index=False)

In [12]:
print(output_df)

         row ID  TARGET_LABEL
0          Row2      0.077356
1          Row6      0.166802
2          Row7      0.055259
3         Row10      0.126088
4         Row12      0.255178
...         ...           ...
29995  Row99993      0.216939
29996  Row99994      0.182274
29997  Row99995      0.249943
29998  Row99998      0.257785
29999  Row99999      0.324428

[30000 rows x 2 columns]
