In [0]:
%python
from pyspark.sql.functions import spark_partition_id, asc, desc

# Show first two rows, as well as id’s and row counts for all partitions in df. 
def showIdAndRowCount(df, title):
  print("\n*** " + title)
  display(df.head(2))
  df.rdd.toDF()\
      .withColumn("partitionId", spark_partition_id())\
      .groupBy("partitionId")\
      .count()\
      .orderBy(asc("count"))\
      .show()

# Load csv file into a Spark DataFrame. 
admissionsDf1 = spark.read.csv("/FileStore/tables/admissions.csv", \
inferSchema=True, header=True, multiLine=True, escape='"')
showIdAndRowCount(admissionsDf1, "admissionsDf1")

# Load JSON file into a Spark DataFrame. 
admissionsDf2 = spark.read.json("/FileStore/tables/admissions.json")
showIdAndRowCount(admissionsDf2, "admissionsDf2")

# Join both DataFrames into a single Spark DataFrame. Preserve column order.
unionDf = admissionsDf1.union(admissionsDf2.select(admissionsDf1.columns))
showIdAndRowCount(unionDf, "unionDf")



*** admissionsDf1


gmat,gpa,work_experience,admitted
780,4.0,3,1
750,3.9,4,1


+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   20|
+-----------+-----+


*** admissionsDf2


admitted,gmat,gpa,work_experience
0.0,620.0,3.3,2.0
0.0,600.0,2.0,1.0


+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   20|
+-----------+-----+


*** unionDf


gmat,gpa,work_experience,admitted
780.0,4.0,3.0,1.0
750.0,3.9,4.0,1.0


+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|   20|
|          1|   20|
+-----------+-----+



In [0]:
%python
from pyspark.ml.classification import LogisticRegression, LogisticRegressionSummary
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Use assembler to define feature columns.
assembler = VectorAssembler(inputCols=['gmat', 'gpa', 'work_experience'], outputCol="features")

# Define logistic regression model with target column.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features', labelCol='admitted', maxIter=10)

# Build pipeline with feature vector and logistic regression model.
pipeline    = Pipeline(stages=[assembler, lr])

# Split the Spark DataFrame.
train, test = unionDf.randomSplit([0.75, 0.25])

# Fit the model with train and predict with test.
model       = pipeline.fit(train)
predictions = model.transform(test)
print("\nShowing prediction DataFrame")
display(predictions)



Showing prediction DataFrame


gmat,gpa,work_experience,admitted,features,rawPrediction,probability,prediction
610.0,3.0,1.0,0.0,"Map(vectorType -> dense, length -> 3, values -> List(610.0, 3.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(7.418689510298023, -7.418689510298023))","Map(vectorType -> dense, length -> 2, values -> List(0.9994004248670022, 5.995751329977983E-4))",0.0
680.0,3.3,4.0,1.0,"Map(vectorType -> dense, length -> 3, values -> List(680.0, 3.3, 4.0))","Map(vectorType -> dense, length -> 2, values -> List(-5.511582048398353, 5.511582048398353))","Map(vectorType -> dense, length -> 2, values -> List(0.004023457698746624, 0.9959765423012534))",1.0
680.0,3.9,4.0,1.0,"Map(vectorType -> dense, length -> 3, values -> List(680.0, 3.9, 4.0))","Map(vectorType -> dense, length -> 2, values -> List(-11.208905764956484, 11.208905764956484))","Map(vectorType -> dense, length -> 2, values -> List(1.355277387154791E-5, 0.9999864472261285))",1.0
690.0,2.3,1.0,0.0,"Map(vectorType -> dense, length -> 3, values -> List(690.0, 2.3, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(8.902319174433273, -8.902319174433273))","Map(vectorType -> dense, length -> 2, values -> List(0.9998639455300616, 1.3605446993836257E-4))",0.0
710.0,3.7,5.0,1.0,"Map(vectorType -> dense, length -> 3, values -> List(710.0, 3.7, 5.0))","Map(vectorType -> dense, length -> 2, values -> List(-13.100605093341429, 13.100605093341429))","Map(vectorType -> dense, length -> 2, values -> List(2.0439892655330147E-6, 0.9999979560107345))",1.0
770.0,3.3,3.0,1.0,"Map(vectorType -> dense, length -> 3, values -> List(770.0, 3.3, 3.0))","Map(vectorType -> dense, length -> 2, values -> List(-9.465646822267885, 9.465646822267885))","Map(vectorType -> dense, length -> 2, values -> List(7.746190528413678E-5, 0.9999225380947159))",1.0
580.0,3.3,1.0,0.0,"Map(vectorType -> dense, length -> 3, values -> List(580.0, 3.3, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(6.506245653962424, -6.506245653962424))","Map(vectorType -> dense, length -> 2, values -> List(0.9985081504460395, 0.0014918495539605336))",0.0
590.0,1.7,4.0,0.0,"Map(vectorType -> dense, length -> 3, values -> List(590.0, 1.7, 4.0))","Map(vectorType -> dense, length -> 2, values -> List(15.489935201587059, -15.489935201587059))","Map(vectorType -> dense, length -> 2, values -> List(0.9999998125840557, 1.8741594431936903E-7))",0.0
620.0,2.7,2.0,0.0,"Map(vectorType -> dense, length -> 3, values -> List(620.0, 2.7, 2.0))","Map(vectorType -> dense, length -> 2, values -> List(7.767356135968413, -7.767356135968413))","Map(vectorType -> dense, length -> 2, values -> List(0.9995768481150992, 4.231518849008298E-4))",0.0
620.0,3.3,2.0,0.0,"Map(vectorType -> dense, length -> 3, values -> List(620.0, 3.3, 2.0))","Map(vectorType -> dense, length -> 2, values -> List(2.070032419410296, -2.070032419410296))","Map(vectorType -> dense, length -> 2, values -> List(0.8879561868906117, 0.11204381310938827))",0.0


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

predictions = (predictions.withColumn("admitted", predictions["admitted"].cast("double")))

#evaluate the model
predictionsAndLabels = predictions.select("prediction", "admitted").rdd

metrics = MulticlassMetrics(predictionsAndLabels)

confusionMatrix = metrics.confusionMatrix()
accuracy        = metrics.accuracy
precision       = metrics.precision(1.0)
recall          = metrics.recall(1.0)
print("Confusion matrix: ")
print(confusionMatrix)

print("Accuracy: " + str(accuracy))
print("Precision: " + str(accuracy))
print("Recall:    " + str(recall))


Confusion matrix: 
DenseMatrix([[8., 0.],
             [0., 7.]])
Accuracy: 1.0
Precision: 1.0
Recall:    1.0


In [0]:
%python

#Example

# Load social network data from FileStore.
socialNetworkDf = spark.read.csv("/FileStore/tables/Social_Network_Ads.csv", 
                                 header="true", inferSchema="true")

# Split the data into test and training sets. 
splits = socialNetworkDf.randomSplit([0.8, 0.2], seed=1234)
train  = splits[0]
test   = splits[1]

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression

# 1. Create gender index column. Converts Male and Female to 0 and 1.
genderIndexer       =  StringIndexer().setInputCol("Gender").setOutputCol("GenderIndex")

# Define feature and target variable names. 
features        = ["GenderIndex", "Age", "EstimatedSalary"]
targetVariable  = "Purchased"

# 2. Set up feature vector.
vectorAssembler = VectorAssembler().setInputCols(features).setOutputCol("features")

# 3. Set up feature scaler.
scaler          = StandardScaler().setInputCol("features").setOutputCol("scaledFeatures")

# 4. Set classifier with features and labels. 
logisticRegression = LogisticRegression().setFeaturesCol("scaledFeatures")\
  .setLabelCol(targetVariable)

# Define pipeline stages.
# 1) Index gender. 2) Build feature set. 3) Scale features. 4) Prep model.
stages   = [genderIndexer, vectorAssembler, scaler, logisticRegression]
pipeline = Pipeline().setStages(stages)

# Get model object.
model    = pipeline.fit(train)

# Generate predictions
results  = model.transform(test)
display(results)


from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

# Convert 'Purchased' column to double.
predictions = (results.withColumn("Purchased", results["Purchased"].cast("double")))

# Compare predicted and actual value with MulticlassMetrics class.
predictionsAndLabels = predictions.select("prediction", "Purchased").rdd
metrics = MulticlassMetrics(predictionsAndLabels)

# Obtain statistical measures.
confusionMatrix = metrics.confusionMatrix()
accuracy        = metrics.accuracy
precision       = metrics.precision(1.0)
recall          = metrics.recall(1.0)
print("Confusion matrix: ")
print(confusionMatrix)

print("Accuracy:  " + str(accuracy))
print("Precision: " + str(accuracy))
print("Recall:    " + str(recall))


User ID,Gender,Age,EstimatedSalary,Purchased,GenderIndex,features,scaledFeatures,rawPrediction,probability,prediction
15569641,Female,58,95000,1,0.0,"Map(vectorType -> dense, length -> 3, values -> List(0.0, 58.0, 95000.0))","Map(vectorType -> dense, length -> 3, values -> List(0.0, 5.374366311477167, 2.7908277974402735))","Map(vectorType -> dense, length -> 2, values -> List(-4.362677848156325, 4.362677848156325))","Map(vectorType -> dense, length -> 2, values -> List(0.012583843636093576, 0.9874161563639064))",1.0
15571059,Female,33,41000,0,0.0,"Map(vectorType -> dense, length -> 3, values -> List(0.0, 33.0, 41000.0))","Map(vectorType -> dense, length -> 3, values -> List(0.0, 3.05782910825425, 1.2044625231058022))","Map(vectorType -> dense, length -> 2, values -> List(3.573146607331344, -3.573146607331344))","Map(vectorType -> dense, length -> 2, values -> List(0.9726988740483518, 0.027301125951648242))",0.0
15581282,Male,37,74000,0,1.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 37.0, 74000.0))","Map(vectorType -> dense, length -> 3, values -> List(1.997177391336967, 3.428475060769917, 2.1739079685324234))","Map(vectorType -> dense, length -> 2, values -> List(1.0807948748509144, -1.0807948748509144))","Map(vectorType -> dense, length -> 2, values -> List(0.7466443765489799, 0.25335562345102014))",0.0
15582492,Male,28,123000,1,1.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 28.0, 123000.0))","Map(vectorType -> dense, length -> 3, values -> List(1.997177391336967, 2.5945216676096665, 3.6133875693174065))","Map(vectorType -> dense, length -> 2, values -> List(1.5060954031458547, -1.5060954031458547))","Map(vectorType -> dense, length -> 2, values -> List(0.8184818247425623, 0.18151817525743774))",0.0
15583137,Male,40,61000,0,1.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 40.0, 61000.0))","Map(vectorType -> dense, length -> 3, values -> List(1.997177391336967, 3.7064595251566668, 1.7920052173037546))","Map(vectorType -> dense, length -> 2, values -> List(0.8205984453101429, -0.8205984453101429))","Map(vectorType -> dense, length -> 2, values -> List(0.6943633586686281, 0.3056366413313719))",0.0
15583681,Male,32,120000,1,1.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 32.0, 120000.0))","Map(vectorType -> dense, length -> 3, values -> List(1.997177391336967, 2.9651676201253334, 3.525256165187714))","Map(vectorType -> dense, length -> 2, values -> List(0.6499196072889024, -0.6499196072889024))","Map(vectorType -> dense, length -> 2, values -> List(0.6569923461311152, 0.34300765386888477))",0.0
15584114,Male,24,23000,0,1.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 24.0, 23000.0))","Map(vectorType -> dense, length -> 3, values -> List(1.997177391336967, 2.223875715094, 0.6756740983276451))","Map(vectorType -> dense, length -> 2, values -> List(6.02174543037487, -6.02174543037487))","Map(vectorType -> dense, length -> 2, values -> List(0.9975804360142784, 0.002419563985721629))",0.0
15589449,Male,39,106000,1,1.0,"Map(vectorType -> dense, length -> 3, values -> List(1.0, 39.0, 106000.0))","Map(vectorType -> dense, length -> 3, values -> List(1.997177391336967, 3.6137980370277503, 3.113976279249147))","Map(vectorType -> dense, length -> 2, values -> List(-0.5375103701742994, 0.5375103701742994))","Map(vectorType -> dense, length -> 2, values -> List(0.36876692343312967, 0.6312330765668703))",1.0
15594041,Female,49,36000,1,0.0,"Map(vectorType -> dense, length -> 3, values -> List(0.0, 49.0, 36000.0))","Map(vectorType -> dense, length -> 3, values -> List(0.0, 4.540412918316917, 1.057576849556314))","Map(vectorType -> dense, length -> 2, values -> List(-0.10025870832562767, 0.10025870832562767))","Map(vectorType -> dense, length -> 2, values -> List(0.4749562972805283, 0.5250437027194716))",1.0
15594762,Female,39,75000,1,0.0,"Map(vectorType -> dense, length -> 3, values -> List(0.0, 39.0, 75000.0))","Map(vectorType -> dense, length -> 3, values -> List(0.0, 3.6137980370277503, 2.203285103242321))","Map(vectorType -> dense, length -> 2, values -> List(0.9210211862854845, -0.9210211862854845))","Map(vectorType -> dense, length -> 2, values -> List(0.7152501337448346, 0.2847498662551654))",0.0


Confusion matrix: 
DenseMatrix([[51.,  4.],
             [14., 17.]])
Accuracy:  0.7906976744186046
Precision: 0.7906976744186046
Recall:    0.5483870967741935


In [0]:
%python

#Exercise 1

# Load social network data from FileStore.
irisDf = spark.read.csv("/FileStore/tables/iris.csv", 
                                 header="true", inferSchema="true")

# Split the data into test and training sets. 
splits = irisDf.randomSplit([0.8, 0.2], seed=1234)
train  = splits[0]
test   = splits[1]


In [0]:
%python

#Exercise 1

# Load social network data from FileStore.
irisDf = spark.read.csv("/FileStore/tables/iris.csv", 
                                 header="true", inferSchema="true")

# Split the data into test and training sets. 
splits = irisDf.randomSplit([0.8, 0.2], seed=1234)
train  = splits[0]
test   = splits[1]

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression

# 1. Create flower type index column. Converts to 0 and 1 and 2.
flowerTypeIndexer = StringIndexer().setInputCol("Flower_Type").setOutputCol("Flower_Type_Index")

# Define feature and target variable names. 
features        = ["Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width"]
targetVariable  = "Flower_Type_Index"

# 2. Set up feature vector.
vectorAssembler = VectorAssembler().setInputCols(features).setOutputCol("features")

# 3. Set up feature scaler.
scaler          = StandardScaler().setInputCol("features").setOutputCol("scaledFeatures")

# 4. Set classifier with features and labels. 
logisticRegression = LogisticRegression().setFeaturesCol("scaledFeatures")\
  .setLabelCol(targetVariable)

# Define pipeline stages.
# 1) Index flower type. 2) Build feature set. 3) Scale features. 4) Prep model.
stages   = [flowerTypeIndexer, vectorAssembler, scaler, logisticRegression]
pipeline = Pipeline().setStages(stages)

# Get model object.
model    = pipeline.fit(train)

# Generate predictions
results  = model.transform(test)
display(results)


# Get model object.
model    = pipeline.fit(train)

# Generate predictions
results  = model.transform(test)
display(results)


from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

# Convert 'Flower_Type_Index' column to double.
predictions = (results.withColumn("Flower_Type_index", results["Flower_Type_index"].cast("double")))

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Compare predicted and actual value with MulticlassMetrics class.
predictionsAndLabels = predictions.select("prediction", "Flower_Type_Index").rdd
metrics = MulticlassMetrics(predictionsAndLabels)

# Obtain statistical measures.
confusionMatrix = metrics.confusionMatrix()
accuracy        = metrics.accuracy
precision       = metrics.precision(1.0)
recall          = metrics.recall(1.0)
print("Confusion matrix: ")
print(confusionMatrix)

print("Accuracy:  " + str(accuracy))
print("Precision: " + str(accuracy))
print("Recall:    " + str(recall))


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Flower_Type,Flower_Type_Index,features,scaledFeatures,rawPrediction,probability,prediction
4.4,2.9,1.4,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(4.4, 2.9, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.0010171654063305, 6.968190455239197, 0.7920150679648652, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-79.47072894873867, -33.6393926737438, 113.11012162248248))","Map(vectorType -> dense, length -> 3, values -> List(2.3078058860519703E-84, 1.851380645087324E-64, 1.0))",2.0
4.5,2.3,1.3,0.3,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(4.5, 2.3, 1.3, 0.3))","Map(vectorType -> dense, length -> 4, values -> List(5.114676646438292, 5.526495878293156, 0.735442563110232, 0.391012614531182))","Map(vectorType -> dense, length -> 3, values -> List(-47.04386253042741, -5.127727392070199, 52.1715899224976))","Map(vectorType -> dense, length -> 3, values -> List(8.15223105453467E-44, 1.3038351154249712E-25, 1.0))",2.0
4.9,3.1,1.5,0.1,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.1, 1.5, 0.1))","Map(vectorType -> dense, length -> 4, values -> List(5.56931457056614, 7.448755314221212, 0.8485875728194985, 0.1303375381770607))","Map(vectorType -> dense, length -> 3, values -> List(-87.52121972004588, -38.83267163699415, 126.35389135704004))","Map(vectorType -> dense, length -> 3, values -> List(1.303825416331986E-93, 1.8213239861593713E-72, 1.0))",2.0
5.0,3.0,1.6,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.0, 1.6, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 7.208472884730204, 0.9051600776741318, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-76.99038590691146, -30.92206184282783, 107.91244774973931))","Map(vectorType -> dense, length -> 3, values -> List(4.985625519420681E-81, 5.069190195385229E-61, 1.0))",2.0
5.0,3.2,1.2,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.2, 1.2, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 7.689037743712219, 0.6788700582555988, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-93.00873357864847, -42.656330433691906, 135.66506401234045))","Map(vectorType -> dense, length -> 3, values -> List(4.877884922615119E-100, 3.597513122710428E-78, 1.0))",2.0
5.0,3.3,1.4,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.3, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 7.929320173203225, 0.7920150679648652, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-94.96796188979508, -45.6119260659063, 140.57988795570145))","Map(vectorType -> dense, length -> 3, values -> List(5.045086164717633E-103, 1.373797098691763E-81, 1.0))",2.0
5.0,3.4,1.5,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.4, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 8.169602602694232, 0.8485875728194985, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-98.43967658212219, -49.29540636392511, 147.73508294604733))","Map(vectorType -> dense, length -> 3, values -> List(1.2236588429913587E-107, 2.6961604018502634E-86, 1.0))",2.0
5.1,3.5,1.4,0.3,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.1, 3.5, 1.4, 0.3))","Map(vectorType -> dense, length -> 4, values -> List(5.7966335326300635, 8.409885032185239, 0.7920150679648652, 0.391012614531182))","Map(vectorType -> dense, length -> 3, values -> List(-100.90221767836385, -51.66329582901406, 152.56551350737794))","Map(vectorType -> dense, length -> 3, values -> List(8.324592923536577E-111, 2.0162995794324653E-89, 1.0))",2.0
5.3,3.7,1.5,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.3, 3.7, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(6.0239524946939875, 8.890449891167252, 0.8485875728194985, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-111.17249412615797, -59.6930380238296, 170.8655321499876))","Map(vectorType -> dense, length -> 3, values -> List(3.2541794738694936E-123, 7.407731251802606E-101, 1.0))",2.0
5.4,3.4,1.5,0.4,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.4, 1.5, 0.4))","Map(vectorType -> dense, length -> 4, values -> List(6.13761197572595, 8.169602602694232, 0.8485875728194985, 0.5213501527082428))","Map(vectorType -> dense, length -> 3, values -> List(-88.8915267475715, -41.861710547137605, 130.7532372947091))","Map(vectorType -> dense, length -> 3, values -> List(4.069030990143796E-96, 1.0821330425483868E-75, 1.0))",2.0


Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Flower_Type,Flower_Type_Index,features,scaledFeatures,rawPrediction,probability,prediction
4.4,2.9,1.4,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(4.4, 2.9, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.0010171654063305, 6.968190455239197, 0.7920150679648652, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-79.47072894873867, -33.6393926737438, 113.11012162248248))","Map(vectorType -> dense, length -> 3, values -> List(2.3078058860519703E-84, 1.851380645087324E-64, 1.0))",2.0
4.5,2.3,1.3,0.3,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(4.5, 2.3, 1.3, 0.3))","Map(vectorType -> dense, length -> 4, values -> List(5.114676646438292, 5.526495878293156, 0.735442563110232, 0.391012614531182))","Map(vectorType -> dense, length -> 3, values -> List(-47.04386253042741, -5.127727392070199, 52.1715899224976))","Map(vectorType -> dense, length -> 3, values -> List(8.15223105453467E-44, 1.3038351154249712E-25, 1.0))",2.0
4.9,3.1,1.5,0.1,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.1, 1.5, 0.1))","Map(vectorType -> dense, length -> 4, values -> List(5.56931457056614, 7.448755314221212, 0.8485875728194985, 0.1303375381770607))","Map(vectorType -> dense, length -> 3, values -> List(-87.52121972004588, -38.83267163699415, 126.35389135704004))","Map(vectorType -> dense, length -> 3, values -> List(1.303825416331986E-93, 1.8213239861593713E-72, 1.0))",2.0
5.0,3.0,1.6,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.0, 1.6, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 7.208472884730204, 0.9051600776741318, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-76.99038590691146, -30.92206184282783, 107.91244774973931))","Map(vectorType -> dense, length -> 3, values -> List(4.985625519420681E-81, 5.069190195385229E-61, 1.0))",2.0
5.0,3.2,1.2,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.2, 1.2, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 7.689037743712219, 0.6788700582555988, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-93.00873357864847, -42.656330433691906, 135.66506401234045))","Map(vectorType -> dense, length -> 3, values -> List(4.877884922615119E-100, 3.597513122710428E-78, 1.0))",2.0
5.0,3.3,1.4,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.3, 1.4, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 7.929320173203225, 0.7920150679648652, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-94.96796188979508, -45.6119260659063, 140.57988795570145))","Map(vectorType -> dense, length -> 3, values -> List(5.045086164717633E-103, 1.373797098691763E-81, 1.0))",2.0
5.0,3.4,1.5,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.4, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(5.6829740515981015, 8.169602602694232, 0.8485875728194985, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-98.43967658212219, -49.29540636392511, 147.73508294604733))","Map(vectorType -> dense, length -> 3, values -> List(1.2236588429913587E-107, 2.6961604018502634E-86, 1.0))",2.0
5.1,3.5,1.4,0.3,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.1, 3.5, 1.4, 0.3))","Map(vectorType -> dense, length -> 4, values -> List(5.7966335326300635, 8.409885032185239, 0.7920150679648652, 0.391012614531182))","Map(vectorType -> dense, length -> 3, values -> List(-100.90221767836385, -51.66329582901406, 152.56551350737794))","Map(vectorType -> dense, length -> 3, values -> List(8.324592923536577E-111, 2.0162995794324653E-89, 1.0))",2.0
5.3,3.7,1.5,0.2,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.3, 3.7, 1.5, 0.2))","Map(vectorType -> dense, length -> 4, values -> List(6.0239524946939875, 8.890449891167252, 0.8485875728194985, 0.2606750763541214))","Map(vectorType -> dense, length -> 3, values -> List(-111.17249412615797, -59.6930380238296, 170.8655321499876))","Map(vectorType -> dense, length -> 3, values -> List(3.2541794738694936E-123, 7.407731251802606E-101, 1.0))",2.0
5.4,3.4,1.5,0.4,Iris-setosa,2.0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.4, 1.5, 0.4))","Map(vectorType -> dense, length -> 4, values -> List(6.13761197572595, 8.169602602694232, 0.8485875728194985, 0.5213501527082428))","Map(vectorType -> dense, length -> 3, values -> List(-88.8915267475715, -41.861710547137605, 130.7532372947091))","Map(vectorType -> dense, length -> 3, values -> List(4.069030990143796E-96, 1.0821330425483868E-75, 1.0))",2.0


Confusion matrix: 
DenseMatrix([[11.,  0.,  0.],
             [ 0., 12.,  0.],
             [ 0.,  0., 14.]])
Accuracy:  1.0
Precision: 1.0
Recall:    1.0


In [0]:
%sql
SELECT * FROM truck_table_victor


List price,Best Price
12.39999962,11.19999981
14.30000019,12.5
14.5,12.69999981
14.89999962,13.10000038
16.10000038,14.10000038
16.89999962,14.80000019
16.5,14.39999962
15.39999962,13.39999962
17.0,14.89999962
17.89999962,15.60000038


In [0]:
%sql
--DROP TABLE Book;

CREATE TABLE Book (
   book   int,
   title VARCHAR(35)
);
INSERT INTO Book VALUES(1,'Harry Potter');
SELECT * FROM Book; 


book,title
1,Harry Potter


In [0]:
%python
# CELL A
# Load wine quality dataset into Spark dataframe. 
wineSparkDf = spark.read.csv("/FileStore/tables/winequality.csv", 
                             header="true", inferSchema="true")
print("Original dataframe length: " + str(wineSparkDf.count()))

# Get packages for performing LinearRegression without the ML pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model    import LinearRegression
from sklearn                 import metrics
import statsmodels.api       as sm
import numpy                 as np

# Show all columns.
pd.set_option('display.max_columns', None)

# Increase number of columns that display on one line.
pd.set_option('display.width', 1000)

# Convert PySpark dataframe to Pandas df
pandasDf = wineSparkDf.toPandas()

print(pandasDf.head(3))

X = pandasDf[['volatile acidity', 'chlorides', 'sulphates', 'alcohol']]

# Adding an intercept *** This is requried ***. Don't forget this step.
# The intercept centers the error residuals around zero 
# which helps to avoid over-fitting.
X = sm.add_constant(X)

y = pandasDf['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = sm.OLS(y_train, X_train).fit()
predictions = model.predict(X_test) # make the predictions by the model

print(model.summary())

print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))



Original dataframe length: 1599




   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol  quality
0            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5
1            7.8              0.88         0.00             2.6      0.098                 25.0                  67.0   0.9968  3.20       0.68      9.8        5
2            7.8              0.76         0.04             2.3      0.092                 15.0                  54.0   0.9970  3.26       0.65      9.8        5
                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.340
Model:                            OLS   Adj. R-squared:                  0.338
Method:                 Least Squares   F-statistic:                     164.2
Date:                Mon, 27 Mar 202