In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the labelled training set and the unlabelled test set (in that order)
train_data, test_data = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Exploratory Data Analysis (EDA)
# ... (perform EDA as needed)

# Pre-processing
# 'Loan_Status' is the prediction target; 'Loan_ID' is removed from the
# feature matrix along with it.
y_train = train_data['Loan_Status']
X_train = train_data.drop(columns=['Loan_Status', 'Loan_ID'])

# Test features: every test column except 'Loan_ID'
X_test = test_data.drop(columns='Loan_ID')

# Route every feature column to exactly one transformer, based on its dtype.
# Selecting by the generic 'number' kind (instead of listing int64/float64 and
# comparing against 'object') keeps columns with other dtypes -- e.g. int32,
# float32, pandas string or category -- from matching neither list.
# ColumnTransformer drops unlisted columns by default (remainder='drop'), so a
# miss here would silently remove that feature from the model.
numeric_columns = X_train.select_dtypes(include='number').columns.tolist()
categorical_columns = X_train.select_dtypes(exclude='number').columns.tolist()

# Numeric features: fill missing values with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each transformer to its own group of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied at prediction time
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(random_state=42))])

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible. X_train / y_train are rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

# Fit the preprocessing + classifier pipeline on the training portion only
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out validation rows
val_predictions = model.predict(X_val)
accuracy = accuracy_score(y_val, val_predictions)
print(f'Model Accuracy on Validation Set: {accuracy}')

# Predict a label for every row of the test set
# NOTE(review): the model used here was fitted on the 80% training portion,
# not on the full labelled data -- confirm that is intended for the submission.
test_predictions = model.predict(X_test)

# Store the predictions on the test frame as its 'Loan_Status' column
test_data['Loan_Status'] = test_predictions

# Write only the identifier and the predicted label to the submission file
test_data.to_csv(
    'submission_results.csv',
    columns=['Loan_ID', 'Loan_Status'],
    index=False,
)


Model Accuracy on Validation Set: 0.7886178861788617
