## 1. Import Libraries

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder



## 1.2 Import Dataset <a id=1.4></a>

In [2]:
df = pd.read_csv(r"E:\Projects\Medical project\data.csv")
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# 2. Data Exploration <a id=2></a>

## 2.1 Info Aboute Vars<a id=2.1></a>

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [69]:
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

## 2.2 Check Data Types <a id=2.2></a>

In [7]:
df.select_dtypes(include='object').columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [8]:
df['sex'].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [9]:
df['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [10]:
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [11]:
df.select_dtypes(include='number').columns

Index(['age', 'bmi', 'children', 'charges'], dtype='object')

## 2.3 Statistical Summary <a id=2.3></a>

In [None]:
df.describe().T##====>>Statistic data 'T ==>> Transpose'

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


## 2.4 Cleaning


In [13]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# 3. Exploratory Data Analysis

## 3.1 Grouping DataFrame By Categorical Vars <a id=3.1></a>

In [70]:
df.groupby('sex')[['age',  'bmi', 'children', 'charges']].mean()

Unnamed: 0_level_0,age,bmi,children,charges
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,39.503021,30.377749,1.074018,12569.578844
male,38.91716,30.943129,1.115385,13956.751178


In [71]:
df.groupby('region')[['age',  'bmi', 'children', 'charges']].mean()

Unnamed: 0_level_0,age,bmi,children,charges
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
northeast,39.268519,29.173503,1.046296,13406.384516
northwest,39.196923,29.199785,1.147692,12417.575374
southeast,38.93956,33.355989,1.049451,14735.411438
southwest,39.455385,30.596615,1.141538,12346.937377


In [72]:
df.groupby('smoker')[['age',  'bmi', 'children', 'charges']].mean()

Unnamed: 0_level_0,age,bmi,children,charges
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,39.385338,30.651795,1.090226,8434.268298
yes,38.514599,30.708449,1.113139,32050.231832


## 4. Visualization <a id=3.2></a>

In [20]:
fig = px.histogram(df, x='sex', color='sex',
                   title='Sex Count',
                   color_discrete_sequence=px.colors.sequential.Reds_r,
                   width=600, height=400)

fig.update_layout(
    xaxis_title='Sex',
    yaxis_title='Count',
    title_font_size=16,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12
)

fig.show()

In [23]:
fig = px.histogram(df, x='smoker', color='smoker',
                   title='Smoker Count',
                   color_discrete_sequence=px.colors.sequential.Reds_r,
                   width=600, height=400)

fig.update_layout(
    xaxis_title='Smoker',
    yaxis_title='Count',
    title_font_size=16,
    xaxis_tickfont_size=12,
    yaxis_tickfont_size=12
)

fig.show()

In [26]:


fig = px.histogram(df, x="region",
                   color="region",
                   title="Region Count",
                   width=600, height=400,
                   color_discrete_sequence=px.colors.sequential.Reds_r)

fig.update_layout(
    xaxis_title="Region",
    yaxis_title="Count",
    showlegend=False,
    title_font_size=16
)

fig.show()


In [None]:
def dist(col):
    fig = px.histogram(df, x=col,
                       nbins=50, 
                       marginal="box", 
                       opacity=0.75,
                       title=f"{col} Distribution",
                       color_discrete_sequence=['darkblue'],
                       width=600, height=400)

    fig.update_layout(
        xaxis_title=col,
        yaxis_title='Count',
        title_font_size=16
    )

    fig.show()

In [27]:
dist('age')

In [28]:
dist('bmi')

In [29]:
dist('children')

In [30]:
dist('charges')

# 5. Data Preprocessing <a id=4></a>

## 5.1 Encoding Categorical Data ==>> Convert to Numerical<a id=4.1></a>

In [38]:
encoder = OneHotEncoder(drop='first', sparse_output=False)

encoded_cols = encoder.fit_transform(df[['sex', 'smoker', 'region']])
encoded_cols = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(['sex', 'smoker', 'region']))
encoded_cols

Unnamed: 0,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
1333,1.0,0.0,1.0,0.0,0.0
1334,0.0,0.0,0.0,0.0,0.0
1335,0.0,0.0,0.0,1.0,0.0
1336,0.0,0.0,0.0,0.0,1.0


In [41]:
df_encoded = pd.concat([df, encoded_cols], axis=1)
df_encoded.drop(['sex', 'smoker','region'], axis=1, inplace=True)
df_encoded.head(2)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0.0,1.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,1.0,0.0,0.0,1.0,0.0


In [42]:
df_encoded.shape

(1338, 9)

## 5.2 Check Outliers <a id=4.2></a>

In [44]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=4, subplot_titles=("Age", "BMI", "Children", "Charges"))

fig.add_trace(go.Box(y=df_encoded["age"], name="Age", boxmean=True), row=1, col=1)

fig.add_trace(go.Box(y=df_encoded["bmi"], name="BMI", boxmean=True), row=1, col=2)

fig.add_trace(go.Box(y=df_encoded["children"], name="Children", boxmean=True), row=1, col=3)

fig.add_trace(go.Box(y=df_encoded["charges"], name="Charges", boxmean=True), row=1, col=4)

fig.update_layout(
    height=500, width=1000,
    title_text="Boxplots for Numerical Features",
    showlegend=False
)

fig.show()


## 5.3 Log Tranformation On Target Variable <a id=4.3></a>

In [80]:
df_encoded['charges'] = np.log1p(df_encoded['charges'])
df_encoded['bmi'] = np.log1p(df_encoded['bmi'])

## 5.4 Correlation Bar <a id=5.1></a>

In [83]:

correlation = df_encoded.corrwith(df_encoded['charges']).sort_values(ascending=False).reset_index()
correlation.columns = ['Feature', 'Correlation']

fig = px.bar(
    correlation,
    x='Feature',
    y='Correlation',
    title='Correlation with Charges',
    text='Correlation',
    color='Correlation',
    color_continuous_scale='Blues',
    height=500
)

fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(
    xaxis_title='Feature',
    yaxis_title='Corr. coefficient',
    title_x=0.5,
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    plot_bgcolor='white'
)

fig.show()


## 5.5 Correlation Heatmap <a id=5.2></a>

In [85]:
import plotly.figure_factory as ff

df_corr = df_encoded.corr().round(2)

fig = ff.create_annotated_heatmap(
    z=df_corr.values,
    x=df_corr.columns.tolist(),
    y=df_corr.columns.tolist(),
    annotation_text=df_corr.values,
    colorscale='Viridis', 
    showscale=True
)

fig.update_layout(
    title_text='Correlation Heatmap (Interactive)',
    title_x=0.5,
    width=900,
    height=700,
    margin=dict(l=100, r=100, t=100, b=100)
)

fig.show()


# 6. Preparing For Modelling <a id=6></a>

## 6.1 Splitting Dependent/Independent Features <a id=6.1></a>

In [61]:
X = df_encoded.drop('charges', axis=1).values
y = df_encoded['charges'].values

print("X Shape: ", X.shape)
print("y Shape: ", y.shape)

X Shape:  (1338, 8)
y Shape:  (1338,)


In [158]:
df_encoded.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,9.734236,0.0,1.0,0.0,0.0,1.0
1,18,33.77,1,7.453882,1.0,0.0,0.0,1.0,0.0
2,28,33.0,3,8.400763,1.0,0.0,0.0,1.0,0.0
3,33,22.705,0,9.998137,1.0,0.0,1.0,0.0,0.0
4,32,28.88,0,8.260455,1.0,0.0,1.0,0.0,0.0


## 6.2 Splitting Train/Test Sets <a id=6.2></a>
# Modeling

In [None]:


X = df_encoded.drop('charges', axis=1)  
y = df_encoded['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor_lr = LinearRegression().fit(X_train, y_train)

y_pred = regressor_lr.predict(X_test)



### 6.3 Linear Regression Accuracy

In [99]:
from sklearn.metrics import r2_score, mean_squared_error

print("Multiple Linear Regression Accuracy: ", r2_score(y_test, y_pred))
print("Multiple Linear Regression MSE: ", mean_squared_error(y_test, y_pred))

Multiple Linear Regression Accuracy:  0.7697367301388875
Multiple Linear Regression MSE:  6.608458605615583e-07


## 6.4 RandomForest Regression <a id=7.2></a>

In [100]:
from sklearn.ensemble import RandomForestRegressor

regressor_rf = RandomForestRegressor().fit(X_train, y_train)

y_pred = regressor_rf.predict(X_test)

### 6.5 RandomForest Regression Accuracy <a id=7.2.1></a>

In [102]:
print("RandomForest Regression Accuracy: ", r2_score(y_test, y_pred))
print("RandomForest Regression MSE: ", mean_squared_error(y_test, y_pred))

RandomForest Regression Accuracy:  0.8753500375717796
RandomForest Regression MSE:  3.5774012824333135e-07
