# ML_Regression_Problems

In [107]:
# import libraries
from sklearn.datasets import fetch_california_housing, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

In [109]:
# Prepare regression datasets
reg_datasets = {
    "California_Housing": fetch_california_housing(),
    "Diabetes": load_diabetes(),
    "Synthetic_1": pd.DataFrame({
        "X": np.arange(0, 100),
        "y": 2 * np.arange(0, 100) + np.random.randn(100) * 10
    }),
    "Synthetic_2": pd.DataFrame({
        "X": np.linspace(0, 10, 100),
        "y": 5 * np.sin(np.linspace(0, 10, 100)) + np.random.randn(100)
    }),
    "Synthetic_3": pd.DataFrame({
        "X": np.random.rand(100),
        "y": np.random.rand(100) * 50
    })
}


In [111]:
# Regression models
reg_models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(n_estimators=100),
    "SVR": SVR()
}


In [113]:
# Results dictionary
regression_results = []

In [115]:
# Loop through regression datasets
for name, dataset in reg_datasets.items():
    if isinstance(dataset, dict):  # sklearn datasets
        X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
        y = pd.Series(dataset.target, name='target')
    else:
        X = dataset[["X"]]
        y = dataset["y"]

    # Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Apply models
    for model_name, model in reg_models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        mse = mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        regression_results.append({
            "Dataset": name,
            "Model": model_name,
            "MSE": mse,
            "R2_Score": r2
        })

# Convert to DataFrame
regression_summary_df = pd.DataFrame(regression_results)
regression_summary_df.sort_values(by=["Dataset", "R2_Score"], ascending=[True, False], inplace=True)
regression_summary_df.reset_index(drop=True, inplace=True)
regression_summary_df


Unnamed: 0,Dataset,Model,MSE,R2_Score
0,California_Housing,RandomForest,0.255268,0.805199
1,California_Housing,SVR,0.355208,0.728933
2,California_Housing,DecisionTree,0.494902,0.62233
3,California_Housing,LinearRegression,0.555892,0.575788
4,Diabetes,LinearRegression,2900.193628,0.452603
5,Diabetes,RandomForest,3079.511402,0.418757
6,Diabetes,SVR,4333.285955,0.182114
7,Diabetes,DecisionTree,4691.438202,0.114514
8,Synthetic_1,LinearRegression,41.033746,0.988112
9,Synthetic_1,RandomForest,109.688353,0.968222


# Automation EDA For housing calfornia dataset

# Dtale

In [118]:
calfornia = fetch_california_housing()

In [119]:
calfornia

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [120]:
cal_df = pd.DataFrame(calfornia.data, columns = calfornia.feature_names)
cal_df['target'] = calfornia.target

In [121]:
import dtale
dtale.show(cal_df)



# y-data profiling

In [123]:
from ydata_profiling import ProfileReport
profile2 = ProfileReport(cal_df, explorative=True)
profile2.to_file('Calfornia.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|████████████████████████████████████████████| 9/9 [00:00<00:00, 204.54it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Sweet viz

In [125]:
import sweetviz as sv
re = sv.analyze(cal_df)
re.show_html("Calfornia_sweetviz.html")

                                             |      | [  0%]   00:00 -> (? left)

Report Calfornia_sweetviz.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Autoviz

In [127]:
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()

# DO NOT include `filename`
dftc = AV.AutoViz(
    filename='',
    dfte=cal_df,             # Only pass your DataFrame
    depVar="MedInc",    # Make sure this column exists
    sep=",",                 # This is optional when using dfte
    header=0,                # Usually 0 if df has headers; can be None
    verbose=1,
    lowess=False,
    chart_format="png",
    max_rows_analyzed=300000,
    max_cols_analyzed=30
)


Shape of your Data Set loaded: (20640, 9)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  8
    Number of Integer-Categorical Columns =  0
    Number of String-Categorical Columns =  0
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    8 Predictors classified...
        No variables removed since no ID or low-information variables found in data set

################ Regression problem #####################
To fix these data quality issues in

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
HouseAge,float64,0.0,,1.0,52.0,No issue
AveRooms,float64,0.0,,0.846154,141.909091,Column has 511 outliers greater than upper bound (8.47) or lower than lower bound(2.02). Cap them or remove them.
AveBedrms,float64,0.0,,0.333333,34.066667,"Column has 1424 outliers greater than upper bound (1.24) or lower than lower bound(0.87). Cap them or remove them., Column has a high correlation with ['AveRooms']. Consider dropping one of them."
Population,float64,0.0,,3.0,35682.0,Column has 1196 outliers greater than upper bound (3132.00) or lower than lower bound(-620.00). Cap them or remove them.
AveOccup,float64,0.0,,0.692308,1243.333333,Column has 711 outliers greater than upper bound (4.56) or lower than lower bound(1.15). Cap them or remove them.
Latitude,float64,0.0,,32.54,41.95,No issue
Longitude,float64,0.0,,-124.35,-114.31,Column has a high correlation with ['Latitude']. Consider dropping one of them.
target,float64,0.0,,0.14999,5.00001,Column has 1071 outliers greater than upper bound (4.82) or lower than lower bound(-0.98). Cap them or remove them.
MedInc,float64,0.0,62.0,0.4999,15.0001,Target column


Number of All Scatter Plots = 36
All Plots done
Time to run AutoViz = 2 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


# Automation EDA for Diabetes Dataset

# dtale

In [130]:
Diabetes = load_diabetes()

In [131]:
dia_df = pd.DataFrame(Diabetes.data, columns = Diabetes.feature_names)
dia_df['target'] = Diabetes.target

In [132]:
dia_df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041176,-0.096346,97.0
6,-0.045472,0.05068,-0.047163,-0.015999,-0.040096,-0.0248,0.000779,-0.039493,-0.062917,-0.038357,138.0
7,0.063504,0.05068,-0.001895,0.066629,0.09062,0.108914,0.022869,0.017703,-0.035816,0.003064,63.0
8,0.041708,0.05068,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.01496,0.011349,110.0
9,-0.0709,-0.044642,0.039062,-0.033213,-0.012577,-0.034508,-0.024993,-0.002592,0.067737,-0.013504,310.0


In [133]:
import dtale

In [134]:
dtale.show(dia_df)



# y_dataProfiling

In [149]:
from ydata_profiling import ProfileReport
profile2 = ProfileReport(dia_df, explorative=True)
profile2.to_file('Diabetes.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████| 11/11 [00:00<00:00, 322638.77it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# SweetViz

In [152]:
import sweetviz as sv
r = sv.analyze(dia_df)
r.show_html("Diabetes_sweetviz.html")

                                             |      | [  0%]   00:00 -> (? left)

Report Diabetes_sweetviz.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Autoviz

In [159]:
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()

# DO NOT include `filename`
dftc = AV.AutoViz(
    filename='',
    dfte=dia_df,             # Only pass your DataFrame
    depVar="target",    # Make sure this column exists
    sep=",",                 # This is optional when using dfte
    header=0,                # Usually 0 if df has headers; can be None
    verbose=1,
    lowess=False,
    chart_format="png",
    max_rows_analyzed=300000,
    max_cols_analyzed=30
)


Shape of your Data Set loaded: (442, 11)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  9
    Number of Integer-Categorical Columns =  0
    Number of String-Categorical Columns =  0
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  1
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    10 Predictors classified...
        No variables removed since no ID or low-information variables found in data set

################ Regression problem #####################
To fix these data quality issues in

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
age,float64,0.0,,-0.107226,0.110727,No issue
sex,float64,0.0,0.0,-0.044642,0.05068,No issue
bmi,float64,0.0,,-0.090275,0.170555,Column has 3 outliers greater than upper bound (0.13) or lower than lower bound(-0.13). Cap them or remove them.
bp,float64,0.0,,-0.112399,0.132044,No issue
s1,float64,0.0,,-0.126781,0.153914,Column has 8 outliers greater than upper bound (0.12) or lower than lower bound(-0.13). Cap them or remove them.
s2,float64,0.0,,-0.115613,0.198788,"Column has 7 outliers greater than upper bound (0.12) or lower than lower bound(-0.12). Cap them or remove them., Column has a high correlation with ['s1']. Consider dropping one of them."
s3,float64,0.0,,-0.102307,0.181179,Column has 7 outliers greater than upper bound (0.13) or lower than lower bound(-0.13). Cap them or remove them.
s4,float64,0.0,,-0.076395,0.185234,Column has 2 outliers greater than upper bound (0.15) or lower than lower bound(-0.15). Cap them or remove them.
s5,float64,0.0,,-0.126097,0.133597,Column has 4 outliers greater than upper bound (0.13) or lower than lower bound(-0.13). Cap them or remove them.
s6,float64,0.0,,-0.137767,0.135612,Column has 9 outliers greater than upper bound (0.12) or lower than lower bound(-0.12). Cap them or remove them.


Number of All Scatter Plots = 45
No categorical or numeric vars in data set. Hence no bar charts.
All Plots done
Time to run AutoViz = 2 seconds 

 ###################### AUTO VISUALIZATION Completed ########################
