In [4]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Build a reproducible synthetic insurance table and write it to CSV.
#
# All draws come from NumPy's global RNG, seeded once below. Because the
# draws share one stream, their ORDER is part of the result: reordering any
# of the np.random.* calls changes the generated data.
# ---------------------------------------------------------------------------
np.random.seed(42)

# Number of synthetic policyholders.
rows = 5000

# Feature columns (integer upper bounds are exclusive in randint).
ages = np.random.randint(18, 65, size=rows)        # 18..64
sexes = np.random.choice(['male', 'female'], size=rows)
bmis = np.random.uniform(15, 40, size=rows).round(1)
children = np.random.randint(0, 5, size=rows)      # 0..4
smokers = np.random.choice(['yes', 'no'], size=rows)
regions = np.random.choice(
    ['southwest', 'southeast', 'northwest', 'northeast'],
    size=rows,
)

# Target column: a linear function of the features plus Gaussian noise.
# The noise is drawn last, after every feature, to keep the RNG stream in
# the same order as the features above.
smoker_surcharge = np.where(smokers == 'yes', 10000, 0)
noise = np.random.normal(0, 1000, size=rows)
charges = (
    2000
    + ages * 50
    + bmis * 200
    + children * 500
    + smoker_surcharge
    + noise
).round(2)

# Assemble the table; dict insertion order fixes the CSV column order.
columns = {
    "age": ages,
    "sex": sexes,
    "bmi": bmis,
    "children": children,
    "smoker": smokers,
    "region": regions,
    "charges": charges,
}
df = pd.DataFrame(columns)

# Persist without the positional index and confirm to the reader.
df.to_csv("insurance_5000_rows.csv", index=False)
print("✅ 'insurance_5000_rows.csv' created successfully!")


✅ 'insurance_5000_rows.csv' created successfully!


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

# ---------------------------------------------------------------------------
# Lasso-based feature selection.
#
# Reads a CSV, fits an L1-regularised linear model that predicts the target
# column from all remaining columns, and writes the features with a non-zero
# coefficient to a second CSV.
#
# Raises:
#     ValueError: if the target column is absent from the CSV, or if no rows
#         remain after dropping rows with missing values.
# ---------------------------------------------------------------------------

# NOTE(review): this URL points at the Telco churn dataset and is not used
# anywhere in this cell. It is kept only in case another cell refers to it.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Tunable inputs, collected in one place.
DATA_PATH = "Credit_Risk_Scoring_Important_Features.csv"
TARGET_COLUMN = "Balance"
OUTPUT_PATH = "Important_Credit_Risk_Factors.csv"

# Load the dataset and drop every row that has at least one missing value.
df = pd.read_csv(DATA_PATH).dropna()

# Fail early with an actionable message: listing the columns that ARE present
# lets the reader see at once whether the target is misspelled or the wrong
# file was loaded.
if TARGET_COLUMN not in df.columns:
    raise ValueError(
        f"⚠️ '{TARGET_COLUMN}' column not found in the dataset! "
        f"Available columns: {list(df.columns)}"
    )

# Without this guard an all-NaN-affected file reaches train_test_split and
# fails there with an error that does not mention the real cause.
if df.empty:
    raise ValueError(
        f"No rows left in '{DATA_PATH}' after dropping rows with missing values."
    )

# Prepare features and target.
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# One-hot encode categorical columns; drop_first avoids a redundant dummy
# column per category.
X = pd.get_dummies(X, drop_first=True)

# Train/test split (fixed random_state for reproducibility). Only the
# training part is used for the fit below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Lasso model.
# NOTE(review): features are not standardised before fitting, so the L1
# penalty acts unevenly on columns with different scales. Consider scaling;
# left unchanged here to preserve existing results.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

# Keep only features whose coefficient was not shrunk to exactly zero.
# Sorting is by signed value, so large negative coefficients appear last.
selected_features = pd.Series(lasso.coef_, index=X.columns)
important_features = selected_features[selected_features != 0].sort_values(ascending=False)

# Save to CSV as a two-column table.
important_features_df = important_features.reset_index()
important_features_df.columns = ["Feature", "Coefficient"]
important_features_df.to_csv(OUTPUT_PATH, index=False)

print("✅ File 'Important_Credit_Risk_Factors.csv' has been created successfully!")



ValueError: ⚠️ 'Balance' column not found in the dataset!