# In [None]:

"""
Breast Cancer Campaign

After the success of the second part of the campaign, the medical insurance company has become increasingly interested in awareness-raising campaigns. However, the board needs confirmation of the company’s expenses to be sure of its financial status.
Before you can look at the financial reports, you are required to perform regression modeling on an insurance dataset to predict the expenses based on some information about patients.
You are required to:
Perform EDA and visualization on the dataset.
Implement different regression models.
Get an acceptable R-squared score.

"""
# from networkx import display
import sys
from sklearn.calibration import LabelEncoder
from sklearn.svm import SVR


def _show_eda_plots(df, categorical_cols, numeric_cols):
    """Display the exploratory plots for the given frame.

    Shows, in order: one count plot per column in ``categorical_cols``, a
    pair plot and an annotated correlation heat map over ``numeric_cols``,
    and one box plot per column in ``numeric_cols``. Every figure is
    displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Count plots for categorical features
    for col in categorical_cols:
        plt.figure()
        sns.countplot(x=df[col])
        plt.title(f"Count Plot of {col}")
        plt.tight_layout()
        plt.show()

    # pairplot builds its own figure, so no plt.figure() is opened first
    # (doing so would only pop up an extra empty window).
    sns.pairplot(df[numeric_cols])
    plt.show()

    # Correlation matrix heat map (numeric only)
    corr = df[numeric_cols].corr()
    plt.figure(figsize=(6, 4))
    sns.heatmap(corr, annot=True, fmt=".2f")
    plt.title("Correlation Heatmap (Numeric Features)")
    plt.tight_layout()
    plt.show()

    # Box plots for numeric features
    for col in numeric_cols:
        plt.figure()
        sns.boxplot(x=df[col])
        plt.title(f"Boxplot of {col}")
        plt.tight_layout()
        plt.show()


def _print_final_evaluation(title, model, X_test, y_test):
    """Print ``title`` followed by R2, MSE and MAE of ``model`` on the test split."""
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    y_pred = model.predict(X_test)
    print(title)
    print("R2:", model.score(X_test, y_test))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("MAE:", mean_absolute_error(y_test, y_pred))


def main():
    """Run the insurance-charges regression workflow end to end.

    Reads ``insurance.csv`` from the working directory, prints summary and
    missing-value information, drops rows with missing values, shows the EDA
    plots, one-hot encodes the categorical columns (writing the result to
    ``encoded_insurance.csv``), splits the rows 80/10/10 into
    train/validation/test, standardizes the features, and fits decision-tree,
    random-forest and SVR regressors that predict ``charges``.

    Hyperparameters are selected on the validation split; the test split is
    only used to report the metrics of the selected models, so the reported
    test scores are not biased by the search.
    """
    from itertools import product

    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor

    categorical_cols = ["sex", "smoker", "region"]
    numeric_cols = ["age", "bmi", "children", "charges"]

    # STEP 1: Load the dataset
    df = pd.read_csv('insurance.csv')
    print("DATASET", df)
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be passed to print().
    print("DATASET INFO")
    df.info()
    print("DATASET DESCR", df.describe())
    print("DATASET DATA TYPES", df.dtypes)

    # STEP 2: Inspect and drop missing values
    # Step 2a: Boolean frame that is True wherever a value is missing
    missing_values_dataframe = df.isna()
    print("BOOLEAN MISSING VALUE MASK:")
    print(missing_values_dataframe.head())

    # Step 2b: Missing values per column and in the whole dataset
    missing_counts = missing_values_dataframe.sum()
    print("\nCOUNT OF MISSING VALUES PER COLUMN:")
    print(missing_counts)
    total_missing = missing_values_dataframe.values.sum()
    print("\nTOTAL MISSING VALUES IN DATASET:", total_missing)

    # Drop rows that contain any missing value
    df = df.dropna()
    print("\nMissing values after dropping:")
    print(df.isnull().sum())
    print("DATASET", df)
    print("DATASET HEAD", df.head())

    # ----------------------------
    # EXPLORATORY DATA ANALYSIS (EDA)
    # ----------------------------
    print("\nDESCRIPTIVE STATS:\n", df.describe())
    _show_eda_plots(df, categorical_cols, numeric_cols)

    # One-hot encode the categorical columns; drop_first drops one level per
    # column so the dummies are not perfectly collinear.
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)
    print("encoded_df:\n", df.head())
    print("\nFINAL ENCODED DF:\n", df.head())

    # Export the encoded CSV
    df.to_csv("encoded_insurance.csv", index=False)

    # Target & features
    X = df.drop("charges", axis=1)
    y = df["charges"]
    print("X...", X)
    print("y...", y)

    # Splitting the Data: 80% training, 10% validation, 10% test.
    # First split: 80% Train, 20% Temp
    X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
        X, y, test_size=0.2, random_state=1
    )
    # Second split: the 20% Temp is halved into 10% Val and 10% Test
    X_val_raw, X_test_raw, y_val, y_test = train_test_split(
        X_temp_raw, y_temp, test_size=0.5, random_state=1
    )

    # Scaling: fit on the training rows only, then apply the same transform
    # to validation and test, so no held-out statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_val = scaler.transform(X_val_raw)
    X_test = scaler.transform(X_test_raw)
    print("TYPE", type(X_train))
    X_scaled_df = pd.DataFrame(X_train, columns=X.columns)
    print("X_scaled_df", X_scaled_df)

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    # ----------------------------
    # Decision tree: fixed-parameter baseline
    # ----------------------------
    modelTree = DecisionTreeRegressor(
        max_leaf_nodes=10, max_depth=6, random_state=1
    ).fit(X_train, y_train)
    y_pred = modelTree.predict(X_test)
    scoreDT = modelTree.score(X_test, y_test)
    print("scoreDT", scoreDT)
    print("MSE", mean_squared_error(y_test, y_pred))
    print("MAE", mean_absolute_error(y_test, y_pred))

    # ----------------------------
    # Decision tree: hyperparameter search
    # ----------------------------
    # Tree complexity is controlled through max_depth, min_samples_split and
    # min_samples_leaf. Every combination is fitted on the training split and
    # scored (R2) on the validation split; the best one is kept.
    max_depth_list = [3, 5, 7, 10, None]
    min_samples_split_list = [2, 5, 10]
    min_samples_leaf_list = [1, 2, 4]

    print("\n>>> Decision Tree Parameter Optimization <<<\n")

    # -inf rather than a magic number: R2 has no lower bound.
    best_score = float("-inf")
    best_params = {}
    best_dt = None
    for depth, min_split, min_leaf in product(
        max_depth_list, min_samples_split_list, min_samples_leaf_list
    ):
        candidate = DecisionTreeRegressor(
            max_depth=depth,
            min_samples_split=min_split,
            min_samples_leaf=min_leaf,
            random_state=1,
        ).fit(X_train, y_train)
        scoreDT = candidate.score(X_val, y_val)

        print(
            f"max_depth={depth}, min_samples_split={min_split}, "
            f"min_samples_leaf={min_leaf} → R2={scoreDT:.3f}"
        )

        if scoreDT > best_score:
            best_score = scoreDT
            best_dt = candidate
            best_params = {
                "max_depth": depth,
                "min_samples_split": min_split,
                "min_samples_leaf": min_leaf,
            }

    print("\nBest Decision Tree Score:", round(best_score, 4))
    print("Best Parameters:", best_params)

    # The selected tree is reported on the untouched test split.
    _print_final_evaluation(
        "\nFinal Decision Tree Evaluation:", best_dt, X_test, y_test
    )

    # ----------------------------
    # Random forest: sweep over the number of trees
    # ----------------------------
    best_sweep_rf = None
    best_sweep_score = float("-inf")
    for estimators in range(1, 301, 25):
        candidate = RandomForestRegressor(
            n_estimators=estimators,
            random_state=0,
            max_leaf_nodes=10,
        ).fit(X_train, y_train)
        sweep_score = candidate.score(X_val, y_val)
        print(estimators, 'estimators gives a score of:', sweep_score)

        if sweep_score > best_sweep_score:
            best_sweep_score = sweep_score
            best_sweep_rf = candidate

    scoreRF = round(best_sweep_score, 3)
    print('Random Forest Maximum Score is:', scoreRF)

    # Errors are reported for the best forest of the sweep (not the last one
    # fitted, and not the decision tree) on the test split.
    y_pred = best_sweep_rf.predict(X_test)
    mse_randforest = mean_squared_error(y_test, y_pred)
    print("MSE RANDOM FOREST", mse_randforest)
    mae_randforest = mean_absolute_error(y_test, y_pred)
    print("MAE RANDOM FOREST", mae_randforest)

    # ----------------------------
    # Random forest: full hyperparameter grid
    # ----------------------------
    # On top of the tree parameters a forest adds n_estimators and
    # max_features. Selection again uses the validation split.
    best_score_RF = float("-inf")
    best_params_RF = {}
    best_rf = None
    for estimators, max_feat, depth, min_split, min_leaf in product(
        range(1, 301, 25),        # n_estimators
        ["sqrt", "log2", None],   # max_features
        [None, 5, 10, 20],        # max_depth
        [2, 5, 10],               # min_samples_split
        [1, 2, 4],                # min_samples_leaf
    ):
        candidate = RandomForestRegressor(
            n_estimators=estimators,
            max_features=max_feat,
            max_depth=depth,
            min_samples_split=min_split,
            min_samples_leaf=min_leaf,
            random_state=0,
            n_jobs=-1,
        ).fit(X_train, y_train)
        grid_score = candidate.score(X_val, y_val)
        score = round(grid_score, 4)

        print(
            f"Estimators={estimators}, max_features={max_feat}, "
            f"max_depth={depth}, min_split={min_split}, "
            f"min_leaf={min_leaf} → R2={score}"
        )

        # Track best model
        if grid_score > best_score_RF:
            best_score_RF = grid_score
            best_rf = candidate
            best_params_RF = {
                "n_estimators": estimators,
                "max_features": max_feat,
                "max_depth": depth,
                "min_samples_split": min_split,
                "min_samples_leaf": min_leaf,
            }

    print("\nBEST RANDOM FOREST R2:", round(best_score_RF, 4))
    print("BEST RANDOM FOREST PARAMETERS:", best_params_RF)

    _print_final_evaluation(
        "\nFinal Random Forest Evaluation:", best_rf, X_test, y_test
    )

    # ----------------------------
    # SVR with a linear kernel
    # ----------------------------
    modelSVR = SVR(kernel='linear', verbose=True).fit(X_train, y_train)
    scoreSVR = modelSVR.score(X_test, y_test)
    print('Linear SVR Model gives a score of: ', scoreSVR)

    # verbose=True makes the solver write progress output; flush so the
    # following messages appear in order.
    sys.stdout.flush()
    print(">>> Moving on to SVR Polynomial SVR Model section...")

    # ----------------------------
    # SVR with polynomial kernels of degree 1..10
    # ----------------------------
    best_poly = None
    bestSVRPoly = None
    scoreSVRpoly = float("-inf")
    for degree in range(1, 11):
        modelSVRpoly = SVR(
            kernel='poly', degree=degree, gamma='auto', tol=0.001, max_iter=100000
        ).fit(X_train, y_train)

        score = modelSVRpoly.score(X_val, y_val)
        print(degree, 'gives a score of: ', score)

        if score > scoreSVRpoly:
            scoreSVRpoly = score
            bestSVRPoly = degree
            best_poly = modelSVRpoly

    # Summary is printed once, after every degree has been tried.
    print('SVR Polynomial Regression gives a best score of:', scoreSVRpoly, "with a degree of", bestSVRPoly)
    print('SVR Polynomial Regression test score:', best_poly.score(X_test, y_test))

# Run the workflow only when this file is executed as a script; importing the
# module must not trigger file I/O, blocking plot windows and model training.
if __name__ == "__main__":
    main()



"""











"""