# Modelling medical insurance data using Linear regression

In [0]:
#%fs mkdirs /tmp/reproducible_ml_uofl

In [0]:
# Snapshot the source table to a Delta location so the rest of the notebook reads a fixed copy
write_path = 'dbfs:/tmp/reproducible_ml_uofl/medinsurance_linearreg_class1_7_csv.delta'
medinsurance_linearreg_class1_7_csv = spark.read.table("medinsurance_linearreg_class1_7_csv")
(
    medinsurance_linearreg_class1_7_csv.write
    .format('delta')
    .mode('overwrite')  # replace any snapshot left by a previous run
    .save(write_path)
)

In [0]:
# Data processing
from pyspark.sql.functions import log, col, exp

# Modeling
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [0]:
# Read the Delta snapshot back in; every later cell works from `medins_chrgs`
write_path = 'dbfs:/tmp/reproducible_ml_uofl/medinsurance_linearreg_class1_7_csv.delta'
medins_chrgs = (
    spark.read
    .format('delta')
    .load(write_path)
)

# Basic summary statistics for each column (count, mean, stddev, min, quartiles, max)
display(medins_chrgs.summary())

summary,age,bmi,children,region,charges,gender_cd,smoker_cd
count,1338.0,1338.0,1338.0,1338,1338.0,1338.0,1338.0
mean,39.20702541106129,30.663396860986538,1.0949177877429,,13270.422265141257,0.4947683109118087,0.2047832585949177
stddev,14.049960379216149,6.098186911679012,1.205492739781914,,12110.011236693992,0.5001595692843768,0.4036940375456171
min,18.0,15.96,0.0,northeast,1121.8739,0.0,0.0
25%,27.0,26.29,0.0,,4738.2682,0.0,0.0
50%,39.0,30.4,1.0,,9377.9047,0.0,0.0
75%,51.0,34.7,2.0,,16657.71745,1.0,0.0
max,64.0,53.13,5.0,southwest,63770.42801,1.0,1.0


In [0]:
# Number of records per region in the full dataset
(
    medins_chrgs
    .groupBy('region')
    .count()
    .show()
)

In [0]:
# Seeded random split: roughly 65% of rows for training, 35% for testing
split_weights = [.65, .35]
trainDF, testDF = medins_chrgs.randomSplit(split_weights, seed=123)

# Mark both splits for caching (lazy); the counts below materialise them
trainDF.cache()
testDF.cache()

# Report the size of each split
print(f'There are {trainDF.count()} records in the training dataset.')
print(f'There are {testDF.count()} records in the testing dataset.')

In [0]:
# Number of records per region in the training split only
(
    trainDF
    .groupBy('region')
    .count()
    .show()
)

## Now we need to convert the categorical variable `region` into a one-hot-encoded version
For this we can either use the `StringIndexer` and `OneHotEncoder` separately, or combine them in a pipeline and do everything in one step.
 
Some machine learning algorithms, such as linear and logistic regression, require numeric features.  

The following code block illustrates how to use `StringIndexer` and `OneHotEncoder` to convert categorical variables into a set of numeric variables that only take on values 0 and 1. 

- `StringIndexer` converts a column of string values to a column of label indexes. For example, it might convert the values "red", "blue", and "green" to 0, 1, and 2. 
- `OneHotEncoder` maps a column of category indices to a column of binary vectors, with at most one "1" in each row that indicates the category index for that row.

One-hot encoding in Spark is a two-step process. You first use the StringIndexer, followed by the OneHotEncoder. The code block below defines the StringIndexer and OneHotEncoder, chains them in a Pipeline, fits that pipeline on the training data, and applies it to the training data.

For more information:   
[StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer)   
[OneHotEncoder](https://spark.apache.org/docs/latest/ml-features.html#onehotencoder)

## Note: Transformers, Estimators, and Pipelines

Three important concepts in MLlib machine learning, used in this notebook and in most others, are **Transformers**, **Estimators**, and **Pipelines**. 

- **Transformer**: Takes a DataFrame as input, and returns a new DataFrame. Transformers do not learn any parameters from the data and simply apply rule-based transformations to either prepare data for model training or generate predictions using a trained MLlib model. You call a transformer with a `.transform()` method.

- **Estimator**: Learns (or "fits") parameters from your DataFrame via a `.fit()` method and returns a Model, which is a transformer.

- **Pipeline**: Combines multiple steps into a single workflow that can be easily run. Creating a machine learning model typically involves setting up many different steps and iterating over them. Pipelines help you automate this process.

For more information:
[ML Pipelines](https://spark.apache.org/docs/latest/ml-pipeline.html#ml-pipelines)

In [0]:
# Build a pipeline that indexes and one-hot encodes every categorical column,
# then fit and apply it in one step.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = ["region"]
stages = []  # stages in Pipeline, filled in the loop below
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer: string value -> numeric label index
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # OneHotEncoder turns the label index into a binary SparseVector.
    # It is created inside the loop so that each categorical column gets its
    # own indexer/encoder pair, not just the last column in the list.
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

# Define the pipeline from the stages collected above.
pipeline = Pipeline(stages=stages)

# Fit on the training data only, so the learned category mapping can be
# reused unchanged on the test data later.
transform_mdl = pipeline.fit(trainDF)
trainDF21 = transform_mdl.transform(trainDF)
trainDF21.show()

### Combine all feature columns into a single feature vector

Most MLlib algorithms require a single features column as input. Each row in this column contains a vector of data points corresponding to the set of features used for prediction. 

MLlib provides the `VectorAssembler` transformer to create a single vector column from a list of columns.

The following code block illustrates how to use VectorAssembler.

For more information: [VectorAssembler](https://spark.apache.org/docs/latest/ml-features.html#vectorassembler)

In [0]:
# Linear regression expects all predictors packed into a single vector column
feature_cols = ['age', 'bmi', 'children', 'gender_cd', 'smoker_cd', 'regionclassVec']
vecAssembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Append the assembled `features` column to the encoded training data
vecTrainDF = vecAssembler.transform(trainDF21)

In [0]:
# Take a look at the assembled training data: the original columns plus
# regionIndex, regionclassVec and the combined `features` vector
display(vecTrainDF)

age,bmi,children,region,charges,gender_cd,smoker_cd,regionIndex,regionclassVec,features
18,15.96,0,northeast,1694.7964,0,0,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1), List(18.0, 15.96))"
18,17.29,2,northeast,12829.4551,0,1,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1, 2, 4), List(18.0, 17.29, 2.0, 1.0))"
18,21.47,0,northeast,1702.4553,0,0,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1), List(18.0, 21.47))"
18,21.565,0,northeast,13747.87235,0,1,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1, 4), List(18.0, 21.565, 1.0))"
18,21.66,0,northeast,14283.4594,1,1,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1, 3, 4), List(18.0, 21.66, 1.0, 1.0))"
18,22.99,0,northeast,1704.5681,0,0,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1), List(18.0, 22.99))"
18,23.085,0,northeast,1704.70015,0,0,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1), List(18.0, 23.085))"
18,23.32,1,southeast,1711.0268,0,0,0.0,"List(0, 3, List(0), List(1.0))","List(0, 8, List(0, 1, 2, 5), List(18.0, 23.32, 1.0, 1.0))"
18,23.75,0,northeast,1705.6245,0,0,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1), List(18.0, 23.75))"
18,25.175,0,northeast,15518.18025,0,1,3.0,"List(0, 3, List(), List())","List(0, 8, List(0, 1, 4), List(18.0, 25.175, 1.0))"


In [0]:
# Set up the estimator: predict `charges` from the assembled `features` vector
lr = LinearRegression(
    labelCol="charges",
    featuresCol="features",
)

# Fit the linear regression model on the training data
lrModel = lr.fit(vecTrainDF)

# Score the training data as well, so train metrics can be compared with test metrics
predict_train = lrModel.transform(vecTrainDF)

In [0]:
# Make predictions on the testing dataset, reusing everything fitted on the training data
# 1) encode `region` with the pipeline model fitted on the training split
testDF21 = transform_mdl.transform(testDF)
# 2) assemble the feature vector with the same VectorAssembler
vecTestDF = vecAssembler.transform(testDF21)
# 3) score with the trained linear regression model
predict_test = lrModel.transform(vecTestDF)

# Take a look at the features, the actual charges and the predicted charges
output_cols = ("features", "charges", "prediction")
display(predict_test.select(*output_cols))

features,charges,prediction
"List(0, 8, List(0, 1, 3, 5), List(18.0, 20.79, 1.0, 1.0))",1607.5101,-1667.1004174430054
"List(0, 8, List(0, 1, 2, 5), List(18.0, 21.78, 2.0, 1.0))",11884.04858,-42.51582574053464
"List(0, 8, List(0, 1, 5), List(18.0, 23.21, 1.0))",1121.8739,-921.4320723196925
"List(1, 8, List(), List(18.0, 24.09, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0))",2201.0971,-0.7026523818949499
"List(0, 8, List(0, 1, 3), List(18.0, 25.08, 1.0))",2196.4732,434.6089365796943
"List(0, 8, List(0, 1, 3), List(18.0, 26.315, 1.0))",2198.18985,812.0112921432938
"List(0, 8, List(0, 1, 3), List(18.0, 28.215, 1.0))",2200.83085,1392.630300702673
"List(0, 8, List(0, 1, 2), List(18.0, 28.31, 1.0))",11272.33139,2085.7585565994577
"List(0, 8, List(0, 1, 3), List(18.0, 30.115, 1.0))",21344.8467,1973.2493092620532
"List(0, 8, List(0, 1, 5), List(18.0, 30.14, 1.0))",1131.5066,1196.29936416268


In [0]:
# Create one regression evaluator comparing `prediction` against `charges`.
# It is constructed with RMSE as its metric and is never reconfigured below.
regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="charges", metricName="rmse")


def evaluate_metric(predictions, metric_name):
    """Compute one regression metric on a DataFrame of predictions.

    Args:
        predictions: DataFrame holding the `prediction` and `charges` columns.
        metric_name: Metric name passed to the evaluator ("rmse", "mse", "r2" or "mae").

    Returns:
        The metric value returned by the evaluator.

    The metric is supplied as a per-call param override instead of calling
    `setMetricName`, so the shared `regressionEvaluator` is not mutated and
    re-running cells in any order gives the same numbers.
    """
    return regressionEvaluator.evaluate(predictions, {regressionEvaluator.metricName: metric_name})


# (metric name understood by the evaluator, label used in the printed report)
metric_labels = [("rmse", "RMSE"), ("mse", "MSE"), ("r2", "R2"), ("mae", "MAE")]

test_metrics = {}
train_metrics = {}
for metric_name, label in metric_labels:
    # Test first, then train, for each metric
    test_metrics[metric_name] = evaluate_metric(predict_test, metric_name)
    print(f"The Test {label} for the linear regression model is {test_metrics[metric_name]:0.2f}")
    train_metrics[metric_name] = evaluate_metric(predict_train, metric_name)
    print(f"The Train {label} for the linear regression model is {train_metrics[metric_name]:0.2f}")

# Keep the individual result variables available for any later cells.
rmse, mse, r2, mae = (test_metrics[name] for name, _ in metric_labels)
rmsetr, msetr, trainr2, maetr = (train_metrics[name] for name, _ in metric_labels)

# Visualize the data
#display(predict_test.select("charges", "prediction"))

In [0]:
# Fitted model parameters: one coefficient per entry of the `features` vector, plus the intercept
print("Coefficients: \n", lrModel.coefficients, sep="")
print("Intercept: ", lrModel.intercept, sep="")