In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [2]:
# Data Retrieval
# Read the published Google Sheets CSV export into a DataFrame and preview it.
DATA_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vR2kOtee9P4bRefkQteex-wI8h8Ci1qLIw5BV7InwDaFf6ha7Lqv1WiY9JDohleBQ/pub?gid=1091767314&single=true&output=csv"
application_df = pd.read_csv(DATA_URL)
application_df.head(n=5)

Unnamed: 0,homeownership,income,age,total
0,owner,Less than $10000,Under 25 years old,42
1,owner,Less than $10000,25 to 29 years old,50
2,owner,Less than $10000,30 to 34 years old,109
3,owner,Less than $10000,35 to 44 years old,353
4,owner,Less than $10000,45 to 54 years old,527


In [3]:
# 2. Data Preprocessing
# Remove, in place, every row of application_df that has a missing value in any column.
# This runs on the raw labels, before any column is converted to numbers.
application_df.dropna(axis=0, how='any', inplace=True)

In [4]:
# Encode the bracketed 'age' and 'income' labels as one representative number each.
# Closed brackets use (roughly) their midpoint; the open-ended top brackets
# ('75 years old and over', '$200000 or more') use their lower bound.
age_mapping = {'Under 25 years old': 20, '25 to 29 years old': 27, '30 to 34 years old': 32,
               '35 to 44 years old': 40, '45 to 54 years old': 50, '55 to 64 years old': 60,
               '65 to 74 years old': 70, '75 years old and over': 75}

income_mapping = {'Less than $10000': 5000, '$10000 - $24999': 17500, '$25000 - $49999': 37500,
                  '$50000 - $74999': 62500, '$75000 - $99999': 87500, '$100000 - $149999': 125000,
                  '$150000 - $199999': 175000, '$200000 or more': 200000}

# Both columns go through Series.map so they behave the same way: a label that is
# missing from its mapping becomes NaN (the mean imputer applied after the split
# fills it) instead of staying a string that StandardScaler cannot convert.
for column, mapping in (('age', age_mapping), ('income', income_mapping)):
    # Surface uncovered labels instead of letting them turn into NaN unnoticed.
    unmapped_labels = set(application_df[column].dropna().unique()) - set(mapping)
    if unmapped_labels:
        print(f"Warning: unmapped '{column}' labels become NaN: {sorted(unmapped_labels, key=str)}")
    application_df[column] = application_df[column].map(mapping)

# Now, standardize the data
# NOTE(review): this scaler is fitted on every row, before the train/test split,
# so its mean/std also reflect rows that later land in the test set.
scaler = StandardScaler()
application_df[['income', 'age']] = scaler.fit_transform(application_df[['income', 'age']])


In [5]:
# 3. Model Training and Evaluation
# Features (X): the encoded 'income' and 'age' columns.
# Target (y): the 'homeownership' label.
feature_columns = ['income', 'age']
target_column = 'homeownership'
X = application_df[feature_columns]
y = application_df[target_column]

In [6]:
# Hold out part of the rows for evaluation.
# X and y are the feature frame and target series selected in the previous cell;
# they are not re-selected here to avoid keeping two copies of the same code.
TEST_FRACTION = 0.2   # share of rows reserved for the test set
SPLIT_SEED = 42       # fixed seed so the shuffle (and the split) is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [7]:
# Fill missing feature values with the per-column mean learned from the training
# rows only (NaNs appear e.g. when an 'income' label is missing from income_mapping).
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
# The test rows reuse the training means; nothing is fitted on them.
X_test_imputed = imputer.transform(X_test)

# Standardize with mean/std learned from the training rows only, so the test rows
# have no influence on preprocessing. Standardization is unaffected by an earlier
# affine rescale of the same columns, so the result equals scaling the raw encoded
# values with train-only statistics, even if the data was already scaled as a whole.
split_scaler = StandardScaler()
X_train_imputed = split_scaler.fit_transform(X_train_imputed)
X_test_imputed = split_scaler.transform(X_test_imputed)

In [8]:
# Fit a logistic regression classifier (default settings) on the imputed training features.
# NOTE(review): the 'total' column is not used by any visible cell; if it holds
# group counts (presumably - confirm), it may belong in fit() as sample_weight.
model = LogisticRegression().fit(X_train_imputed, y_train)
model

In [9]:
# Predict a homeownership label for each held-out (test) row.
y_pred = model.predict(X=X_test_imputed)

In [10]:
# Accuracy = share of test rows whose predicted label matches the true label.
# Scores the predictions already stored in y_pred instead of calling model.score,
# which would run the same prediction a second time.
# NOTE(review): the recorded run printed 1.0; a perfect score deserves a sanity
# check (e.g. against the majority-class rate of y_test) before it is trusted.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0
