In [1]:
%output --no-stdout

In [2]:
@file:Repository("https://binrepo.target.com/artifactory/gradle")
@file:Repository("https://binrepo.target.com/artifactory/maven-central")
@file:Repository("https://binrepo.target.com/artifactory/jcenter")
@file:Repository("https://binrepo.target.com/artifactory/jitpack-maven")
@file:Repository("https://binrepo.target.com/artifactory/kotlin-maven")
@file:Repository("https://binrepo.target.com/artifactory/apache-maven")
@file:Repository("https://binrepo.target.com/artifactory/jitpack")
%use spark

In [3]:
%output --reset-to-defaults
@file:DependsOn("org.jetbrains.kotlinx.spark:kotlin-spark-api-3.0.0_2.12:1.0.0-preview1")

In [4]:
import org.jetbrains.kotlinx.spark.api.*
import org.apache.spark.sql.functions.*
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.recommendation.ALSModel

In [5]:
// Relative locations of the four CSV files under data/ml-latest-small.
// None of them is referenced by the cells visible in this part of the notebook.
val ratingFile: String = "data/ml-latest-small/ratings.csv"
val movieFile: String = "data/ml-latest-small/movies.csv"
val linkFile: String = "data/ml-latest-small/links.csv"
val tagFile: String = "data/ml-latest-small/tags.csv"

In [6]:
// Obtain (or reuse) a Spark session named "Recommender" that runs against the
// "local[*]" master.
val spark = SparkSession.builder()
    .appName("Recommender")
    .master("local[*]")
    .getOrCreate()

In [7]:
// Evaluate the session as the cell result so the notebook displays it.
spark

org.apache.spark.sql.SparkSession@6954913f

In [8]:
// Directory of kitten images that is loaded through the "image" data source below.
// NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
val path: String = "/Users/z001hqv/spark-3.0.1-bin-hadoop2.7/data/mllib/images/origin/kittens"

In [9]:
// Read every image under `path` with the "image" data source and keep only the
// metadata fields (origin URI, height, width, channel count, mode).
val df = run {
    // "dropInvalid" = true is passed so unreadable files are left out of the result.
    val images = spark.read()
        .format("image")
        .option("dropInvalid", true)
        .load(path)
    images.select("image.origin", "image.height", "image.width", "image.nChannels", "image.mode")
}

In [10]:
// Print the image metadata; passing false disables truncation so full origin paths are shown.
df.show(false)

+--------------------------------------------------------------------------------------------------------+------+-----+---------+----+
|origin                                                                                                  |height|width|nChannels|mode|
+--------------------------------------------------------------------------------------------------------+------+-----+---------+----+
|file:///Users/z001hqv/spark-3.0.1-bin-hadoop2.7/data/mllib/images/origin/kittens/54893.jpg              |311   |300  |3        |16  |
|file:///Users/z001hqv/spark-3.0.1-bin-hadoop2.7/data/mllib/images/origin/kittens/DP802813.jpg           |313   |199  |3        |16  |
|file:///Users/z001hqv/spark-3.0.1-bin-hadoop2.7/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg|200   |300  |3        |16  |
|file:///Users/z001hqv/spark-3.0.1-bin-hadoop2.7/data/mllib/images/origin/kittens/DP153539.jpg           |296   |300  |3        |16  |
+------------------------------------------------------

In [11]:
// Location of the sample LIBSVM-format data file read in the next cell.
// NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
val PATH: String = "/Users/z001hqv/spark-3.0.1-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"

In [12]:
// Load the sample file through the "libsvm" source, declaring 700 feature
// dimensions; the result has a "label" column and a sparse "features" vector column.
val svm = spark.read()
    .format("libsvm")
    .option("numFeatures", 700)
    .load(PATH)

In [13]:
// Preview the loaded label/features rows (show() prints the first 20 rows).
svm.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(700,[127,128,129...|
|  1.0|(700,[158,159,160...|
|  1.0|(700,[124,125,126...|
|  1.0|(700,[152,153,154...|
|  1.0|(700,[151,152,153...|
|  0.0|(700,[129,130,131...|
|  1.0|(700,[158,159,160...|
|  1.0|(700,[99,100,101,...|
|  0.0|(700,[154,155,156...|
|  0.0|(700,[127,128,129...|
|  1.0|(700,[154,155,156...|
|  0.0|(700,[153,154,155...|
|  0.0|(700,[151,152,153...|
|  1.0|(700,[129,130,131...|
|  0.0|(700,[154,155,156...|
|  1.0|(700,[150,151,152...|
|  0.0|(700,[124,125,126...|
|  0.0|(700,[152,153,154...|
|  1.0|(700,[97,98,99,12...|
|  1.0|(700,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [7]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

In [15]:
// Stage 1: label indexer mapping "label" to "indexedLabel".
// It is fitted on the full data set (svm), not only on the training split.
val labelIndexer = StringIndexer()
    .setOutputCol("indexedLabel")
    .setInputCol("label")
    .fit(svm)

In [16]:
// Stage 2: feature indexer mapping "features" to "indexedFeatures", fitted on the
// full data set (svm). With maxCategories = 4, features having more than four
// distinct values are treated as continuous (per the VectorIndexer documentation).
val featureIndexer = VectorIndexer()
    .setMaxCategories(4)
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .fit(svm)

In [17]:
// Stage 3: decision-tree classifier reading the indexed label and feature columns.
// Despite the "Model" suffix this is the unfitted estimator; fitting happens in pipeline.fit.
val decisionTreeClassifierModel = DecisionTreeClassifier()
    .setFeaturesCol("indexedFeatures")
    .setLabelCol("indexedLabel")

In [18]:
// Chain the three stages in execution order: label indexing, feature indexing, classifier.
val pipeline = Pipeline().setStages(
    arrayOf(
        labelIndexer,
        featureIndexer,
        decisionTreeClassifierModel
    )
)

In [19]:
// Split the data roughly 70% / 30% into training and test sets.
// A fixed seed is supplied so that the split — and therefore every metric computed
// from it — is the same on each run; without it the split changes on every execution.
val (trainingData, testData) = svm.randomSplit(doubleArrayOf(0.7, 0.3), 42L)

In [20]:
// Fit the whole pipeline on the training split.
val model = pipeline.fit(trainingData)

In [21]:
// Run the fitted pipeline over the held-out test split to obtain predictions.
val predictions = model.transform(testData)

In [22]:
// Show the first five predictions next to the indexed true label and the raw features.
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         1.0|(700,[122,123,148...|
|       1.0|         1.0|(700,[123,124,125...|
|       1.0|         1.0|(700,[124,125,126...|
|       1.0|         1.0|(700,[125,126,127...|
|       1.0|         1.0|(700,[126,127,128...|
+----------+------------+--------------------+
only showing top 5 rows



In [23]:
// Evaluator computing the "accuracy" metric by comparing the "prediction" column
// against the "indexedLabel" column.
val evaluator = MulticlassClassificationEvaluator()
    .setMetricName("accuracy")
    .setPredictionCol("prediction")
    .setLabelCol("indexedLabel")

In [24]:
// Compute the accuracy of the test-set predictions with the evaluator defined above.
val accuracy = evaluator.evaluate(predictions)

In [25]:
// Report the test error, i.e. the complement of the accuracy.
println("Test Error = ${1.0 - accuracy}")

Test Error = 0.0


In [8]:
// Load the player CSV: the first line is a header and column types are inferred.
// Both reader options are supplied together as a single map.
val mlb = spark.read()
    .options(mapOf("header" to "true", "inferSchema" to "true"))
    .csv("data/players_v2.csv")

In [196]:
// Load the hitting-statistics CSV with a header row and inferred column types.
val mlb_hitting = run {
    val csvReader = spark.read()
        .option("inferSchema", "true")
        .option("header", "true")
    csvReader.csv("data/hitting.csv")
}

In [197]:
// Preview the hitting statistics (show() prints the first 20 rows).
mlb_hitting.show()

+---------+---+-------------------+-------------------+-------------------+---+---+---+---+---+---+-----+---+---+---+---+-------------------+
|player_id| bb|                avg|                slg|                ops| cs| sb| hr|rbi|lob|xbh|  obp|  h|  r| ab| so|              babip|
+---------+---+-------------------+-------------------+-------------------+---+---+---+---+---+---+-----+---+---+---+---+-------------------+
|   605113| 52|              0.254|              0.437|              0.753|  2|  8| 19| 82|239| 58|0.316|141| 79|556|113|               0.28|
|   594777| 70|              0.232|              0.467|              0.792|  1|  4| 33| 74|245| 63|0.325|128| 92|552|162|              0.265|
|   500871| 50|              0.269|              0.511|              0.831|  1|  5| 35|118|265| 74| 0.32|171| 94|636|130|              0.283|
|   608348| 48|              0.245|              0.478|              0.826|  0|  0| 18| 47|141| 37|0.348| 77| 46|314| 79|              0.271|
|   64

In [198]:
// Join player details with their hitting statistics on the shared "player_id" column
// (this join overload performs an inner equi-join and keeps a single "player_id" column).
val combined = mlb.join(mlb_hitting, "player_id")

In [199]:
// Number of rows that survived the join.
combined.count()

381

In [233]:
// Build the modelling table: the label column "position" plus six feature columns.
val labelAndFeatures = run {
    // Collapse the three outfield positions (CF, RF, LF) into the single label "OF";
    // every other position keeps its original text.
    val position = `when`(col("position_txt").isInCollection(listOf("CF", "RF", "LF")), "OF")
        .otherwise(col("position_txt"))

    // Batting side as a number: "R" -> 0, "L" -> 1, anything else -> 2.
    val batsMod = `when`(col("bats").equalTo("R"), 0)
        .`when`(col("bats").equalTo("L"), 1)
        .otherwise(2)

    // Throwing hand as a number: "R" -> 0, anything else -> 1.
    val throwsMod = `when`(col("throws").equalTo("R"), 0).otherwise(1)

    // Keep only the six listed positions and rows whose "ab" value exceeds 250.
    val eligible = col("position")
        .isInCollection(listOf("1B", "2B", "3B", "SS", "C", "OF"))
        .and(col("ab").gt(250))

    combined
        .withColumn("position", position)
        .withColumn("bats_mod", batsMod)
        .withColumn("throws_mod", throwsMod)
        .filter(eligible)
        .select("position", "hr", "slg", "bats_mod", "throws_mod", "height_in_inches", "weight")
}

In [234]:
// Inspect up to 50 rows labelled "OF" to check the position merge and the derived columns.
labelAndFeatures.filter(col("position").equalTo("OF")).show(50)

+--------+---+-----+--------+----------+----------------+------+
|position| hr|  slg|bats_mod|throws_mod|height_in_inches|weight|
+--------+---+-----+--------+----------+----------------+------+
|      OF| 33|0.467|       1|         1|              70|   210|
|      OF| 12|0.461|       1|         1|              73|   220|
|      OF| 41|0.518|       0|         0|              72|   205|
|      OF| 29|0.472|       0|         0|              73|   225|
|      OF| 35|0.535|       0|         0|              76|   230|
|      OF| 20|0.476|       2|         0|              74|   225|
|      OF| 36|0.557|       0|         0|              75|   230|
|      OF| 33|0.489|       0|         0|              73|   230|
|      OF| 12|0.475|       1|         1|              72|   192|
|      OF| 21|0.429|       1|         1|              77|   240|
|      OF| 36|0.538|       1|         1|              73|   220|
|      OF| 15|0.428|       1|         1|              69|   176|
|      OF|  8|0.378|     

In [235]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

In [236]:
// Stage 1: label indexer mapping the "position" text to the numeric "indexedLabel".
// It is left unfitted here; the pipeline fits it on the training data.
val labelIndexer = StringIndexer()
    .setOutputCol("indexedLabel")
    .setInputCol("position")

In [237]:
// Stage 2: assemble the six numeric input columns into a single "features" vector.
val output = VectorAssembler()
    .setOutputCol("features")
    .setInputCols(
        arrayOf(
            "hr",
            "slg",
            "bats_mod",
            "throws_mod",
            "height_in_inches",
            "weight"
        )
    )

In [238]:
// Stage 3: feature indexer mapping "features" to "indexedFeatures".
// With maxCategories = 4, features having more than four distinct values are
// treated as continuous (per the VectorIndexer documentation).
val featureIndexer = VectorIndexer()
    .setMaxCategories(4)
    .setInputCol("features")
    .setOutputCol("indexedFeatures")

In [239]:
// Stage 4: decision-tree classifier reading the indexed label and feature columns.
// Despite the "Model" suffix this is the unfitted estimator; fitting happens in pipeline.fit.
val decisionTreeClassifierModel = DecisionTreeClassifier()
    .setFeaturesCol("indexedFeatures")
    .setLabelCol("indexedLabel")

In [240]:
// Chain the four stages in execution order:
// label indexing -> vector assembly -> feature indexing -> classifier.
val pipeline = Pipeline().setStages(
    arrayOf(
        labelIndexer,
        output,
        featureIndexer,
        decisionTreeClassifierModel
    )
)

In [241]:
// Drop rows containing nulls, then split roughly 70% / 30% into training and test sets.
// A fixed seed is supplied so that the split — and the accuracy computed from it —
// is the same on each run; without it the split changes on every execution.
val (trainingData, testData) = labelAndFeatures.na().drop().randomSplit(doubleArrayOf(0.7, 0.3), 42L)

In [242]:
// Number of rows in the held-out test split.
testData.count()

64

In [243]:
// Fit all four pipeline stages on the training split.
val model = pipeline.fit(trainingData)

In [244]:
// Run the fitted pipeline over the held-out test split to obtain position predictions.
val predictions = model.transform(testData)

In [245]:
// Show the first ten predictions next to the indexed true label and the assembled features.
predictions.select("prediction", "indexedLabel", "features").show(10)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         4.0|[12.0,0.399,0.0,0...|
|       0.0|         4.0|[15.0,0.411,1.0,0...|
|       0.0|         4.0|[21.0,0.449,1.0,1...|
|       0.0|         4.0|[23.0,0.43,0.0,0....|
|       0.0|         4.0|[29.0,0.476,0.0,0...|
|       0.0|         4.0|[33.0,0.503,0.0,0...|
|       0.0|         4.0|[37.0,0.569,2.0,0...|
|       0.0|         4.0|[53.0,0.583,0.0,0...|
|       2.0|         3.0|[2.0,0.321,2.0,0....|
|       0.0|         3.0|[9.0,0.487,0.0,0....|
+----------+------------+--------------------+
only showing top 10 rows



In [246]:
// Evaluator computing the "accuracy" metric by comparing the "prediction" column
// against the "indexedLabel" column.
val evaluator = MulticlassClassificationEvaluator()
    .setMetricName("accuracy")
    .setPredictionCol("prediction")
    .setLabelCol("indexedLabel")

In [247]:
// Compute the accuracy of the position predictions on the test split.
val accuracy = evaluator.evaluate(predictions)

In [248]:
// Report the test error, i.e. the complement of the accuracy.
println("Test Error = ${1.0 - accuracy}")

Test Error = 0.5625


In [249]:
// Display the raw accuracy value as the cell result.
accuracy

0.4375

In [192]:
// Count the test rows whose predicted class equals the indexed true label.
// The former select("*") projection was a no-op and has been removed.
// NOTE(review): the recorded output for this cell comes from an earlier execution
// (In [192]) and does not correspond to the 64-row test split shown above.
predictions.filter(col("prediction").equalTo(col("indexedLabel"))).count()

14

In [193]:
// Total number of scored test rows.
// NOTE(review): the recorded output (45) comes from an earlier execution (In [193])
// and differs from the 64 rows reported by testData.count() above.
predictions.count()

45