In [0]:
# Load the countries CSV from DBFS; the first row supplies the column names.
# No schema inference is requested, so the numeric columns arrive as strings.
# NOTE(review): the path points into one user's upload folder — consider a
# shared, configurable location instead.
paises = spark.read.csv(
    "dbfs:/FileStore/shared_uploads/frodriguez@insulet.com/pai_ses-2.csv",
    header=True,
)

In [0]:
from pyspark.sql.functions import col

# These columns come out of the CSV as strings; cast each one to double
# so it can be treated as a number downstream.
for numeric_col in (
    "total_cases_per_million",
    "total_deaths_per_million",
    "diabetes_prevalence",
    "hdi",
):
    paises = paises.withColumn(numeric_col, col(numeric_col).cast("double"))

paises

In [0]:
# Discard every row that contains a null in any column.
paises = paises.dropna()

## Train/Test Split

![](https://files.training.databricks.com/images/301/TrainTestSplit.png)

**Question**: Why is it necessary to set a seed? What happens if I change my cluster configuration?

In [0]:
# Split roughly 80/20 into training and test sets, with a fixed seed.
trainDF, testDF = paises.randomSplit(weights=[0.8, 0.2], seed=42)

# Mark the training set for caching, then print its row count.
trainDF.cache()
print(trainDF.count())

Let's change the number of partitions (to simulate a different cluster configuration) and see whether we get the same number of data points in our training set.

In [0]:
# Same 80/20 split and the same seed, but on the data redistributed into
# 24 partitions to mimic a differently configured cluster.
repartitionedDF = paises.repartition(24)
trainRepartitionDF, testRepartitionDF = repartitionedDF.randomSplit([0.8, 0.2], seed=42)

print(trainRepartitionDF.count())

# Observation recorded by the author: even with seed=42, this split produced
# 158 training rows, versus 165 from the split in the previous cell.

In [0]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for the
# label column and the single feature column of the training set.
label_and_feature = trainDF.select("total_cases_per_million", "hdi")
display(label_and_feature.summary())

summary,total_cases_per_million,hdi
count,147.0,147.0
mean,62302.86707482991,0.7251632653061223
stddev,63405.76922256815,0.1522297309006577
min,68.961,0.394
25%,7099.542,0.594
50%,48589.891,0.75
75%,99247.528,0.851
max,253504.891,0.957


In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
from pyspark.ml.feature import VectorAssembler

# VectorAssembler packs the listed input columns into one vector-valued
# column; here the only input is "hdi" and the result is named "features".
vecAssembler = VectorAssembler(
    outputCol="features",
    inputCols=["hdi"],
)

# transform() takes a DataFrame and returns it with the assembled
# "features" column appended.
vecTrainDF = vecAssembler.transform(trainDF)

In [0]:
# Linear regression that predicts total_cases_per_million from the
# assembled "features" vector.
lr = LinearRegression(
    labelCol="total_cases_per_million",
    featuresCol="features",
)

# Fit on the vectorized training set to obtain the trained model.
lrModel = lr.fit(vecTrainDF)

## Inspect the model

In [0]:
# The model was fit on a single feature, so the line is described by one
# intercept and the first (only) coefficient.
b = lrModel.intercept
m = lrModel.coefficients[0]

print(f"The formula for the linear regression line is y = {m:.2f}x + {b:.2f}")

# Here x is the country's HDI value.
# Figures noted by the author from their run:
#   a country with HDI 0.5 -> we would expect -1,271.55
#   a country with HDI 0.9 -> we would expect 111,667.68

## Apply model to test set

In [0]:
# Build the same "features" column for the held-out test set.
vecTestDF = vecAssembler.transform(testDF)

# Apply the fitted model to the vectorized test set.
predDF = lrModel.transform(vecTestDF)

# Show the input, its vector form, the true label, and the prediction
# side by side.
(predDF
 .select("hdi", "features", "total_cases_per_million", "prediction")
 .show())

## Evaluate Model

Let's see how our linear regression model with just one variable does. Does it beat our baseline model?

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator configured to compute RMSE between the "prediction" column and
# the true label on whatever DataFrame it is given.
regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="total_cases_per_million", metricName="rmse")

rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse}")

# Override the metric for this one call only. Calling setMetricName("r2")
# would permanently switch the evaluator, so re-running the RMSE line above
# later in the session would silently report r2 under the "RMSE" label.
r2 = regressionEvaluator.evaluate(predDF, {regressionEvaluator.metricName: "r2"})
print(f"r2 is {r2}")

El error es muy alto, pero la r2 mejoró.

-sandbox
&copy; 2020 Databricks, Inc. All rights reserved.<br/>
Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="http://www.apache.org/">Apache Software Foundation</a>.<br/>
<br/>
<a href="https://databricks.com/privacy-policy">Privacy Policy</a> | <a href="https://databricks.com/terms-of-use">Terms of Use</a> | <a href="http://help.databricks.com/">Support</a>