# Explore here

In [None]:
# Your code here
# Do the train test split before substituting missing values with median and outliers, and before doing scaling
# Better to use pipelines even when using grid

Basic EDA

In [7]:
import pandas as pd

total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv")
total_data.head() # to show first 5 rows of the dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Drop duplicates:


In [8]:
total_data = total_data.drop_duplicates().reset_index(drop = True) 
#reset_index: default is False, when used with False 
#a new index column is added to the dataframe columns,
# we have to drop that (drop: True) so we do not have double row indexes shown
total_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Min-Max scaler: shrink data usually between 0 and 1 (range)

Creating a Scaler:
First, we create a scaler object named scaler. Think of this scaler as a tool that helps us adjust the scale of our data.
Scaling the Features:
We use the scaler to transform the numerical variables in our dataset (total_data) to a specific scale. Scaling means adjusting the range of values in each column.
The fit_transform() method of the scaler adjusts the data based on the scaling method it applies. In this case, MinMaxScaler will transform the data so that all features are between 0 and 1.
Creating a New DataFrame:
We create a new DataFrame named total_data_scal to hold the scaled features.
The pd.DataFrame() function constructs a new DataFrame using the scaled features.
We provide the scaled features (scal_features) as the data, and we also specify the index and columns for the new DataFrame. The index is taken from the original total_data DataFrame, and the columns are named the same as the numerical variables.
Displaying the Head:
Finally, we display the first few rows of the total_data_scal DataFrame using the head() function. This helps us see how our scaled data looks like.
So, in simple terms, this code takes numerical variables from our dataset, adjusts their scale to be between 0 and 1, and then stores the scaled data in a new DataFrame called total_data_scal, which we can use for further analysis or modeling.

In [12]:
from sklearn.preprocessing import MinMaxScaler

total_data["sex_n"] = pd.factorize(total_data["sex"])[0] #turning categorical values into numerical ones
total_data["smoker_n"] = pd.factorize(total_data["smoker"])[0] 
#[0] because we want the first item in the tuple (generated by .factorize()), it contains the array of numerical lables
# Create numerical lables/assign unique number to the feature: sequence of numerical values given to each category in the array
# Male, female columns: male is assigned e.g. to label 0 and female to 1 (male and femal are contained in the array in the factorize.sex tuple)
# Each feature is in a specific category with a unique numerical lable
total_data["region_n"] = pd.factorize(total_data["region"])[0]
num_variables = ["age", "bmi", "children", "sex_n", "smoker_n", "region_n", "charges"]#list of numerical features/variables, old and new
#charges: target variable

scaler = MinMaxScaler() #helps us adjust the scale of the data
# numerical values of dataset are converted to a specific scale:
# numbers in each column are converted to numbers between 0 and 1
scal_features = scaler.fit_transform(total_data[num_variables])
total_data_scal = pd.DataFrame(scal_features, index = total_data.index, columns = num_variables)
#index = total_data.index: keeping the columns above their values, keeping them in right format/shape
# Good practice!
total_data_scal.head()#create a new datafram with scaled features

Unnamed: 0,age,bmi,children,sex_n,smoker_n,region_n,charges
0,0.021739,0.321227,0.0,0.0,0.0,0.0,0.251611
1,0.0,0.47915,0.2,1.0,1.0,0.333333,0.009636
2,0.217391,0.458434,0.6,1.0,1.0,0.333333,0.053115
3,0.326087,0.181464,0.0,1.0,1.0,0.666667,0.33301
4,0.304348,0.347592,0.0,1.0,1.0,0.666667,0.043816


Feature selection

In [15]:
from sklearn.feature_selection import SelectKBest, f_regression #function
# f_regression: statistical technique, measures correlation between features and target variable
# for example: smoker feature has more of an impact (is strongly correlated to the charges)
from sklearn.model_selection import train_test_split

X = total_data_scal.drop("charges", axis = 1) #drop target variable, keep features to use to make predictions
# X is the input features
y = total_data_scal["charges"] #output, what we want to obtain/predict, we need to see the original values of charges to the compare and see if the model was trained well

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42) #test_size = 20% of total data, 80% will be the training data
# random_state: each time the code is executed, it generates the same randomization - useful if someone else wants to use your code
# separate training data into X_train and y_train, test data also separated and then used to see how well the model works
total_data_correlated = SelectKBest(f_regression, k = 4) # k = 4: we only want to keep the top 4 features that are strongly correlated to the target variable
total_data_correlated.fit(X_train, y_train) #fit the model

selected_features = X_train.columns[total_data_correlated.get_support()]
# X_train contains the features used to train the model. .get_support() returns an array to show which features
# were selected by the SelectKBest from before, that now are in X_train
X_train_correlated = pd.DataFrame(total_data_correlated.transform(X_train), columns = selected_features)
# pd.DataFrame: turn the array from .get_support() into a dataframe
# selecting features: include only selected features. .transform() putting correlated features into the dataframe
X_test_correlated = pd.DataFrame(total_data_correlated.transform(X_test), columns = selected_features)

X_train_correlated.head()

Unnamed: 0,age,bmi,children,smoker_n
0,0.108696,0.230024,0.0,1.0
1,0.065217,0.26325,0.4,1.0
2,0.73913,0.580172,0.4,1.0
3,0.978261,0.686306,0.0,1.0
4,0.630435,0.286252,0.4,1.0
