In [2]:
// Load the header-less training file as an RDD of text lines.
val rawData = sc.textFile("stumbleupon/train_noheader.tsv")
// Break every line into its tab-separated columns.
val records = rawData.map(_.split("\t"))


In [5]:
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// Turn each split record into a LabeledPoint:
//   - double quotes are stripped from every column,
//   - the last column is parsed as the integer label,
//   - columns from index 4 up to (but excluding) the last become the features,
//     with the "?" placeholder mapped to 0.0.
val data = records.map { fields =>
    val cleaned = fields.map(_.replaceAll("\"", ""))
    val labelIdx = fields.size - 1
    val label = cleaned(labelIdx).toInt
    val features = cleaned.slice(4, labelIdx).map {
        case "?"   => 0.0
        case value => value.toDouble
    }
    LabeledPoint(label, Vectors.dense(features))
}
// Keep the parsed points around; they are reused by the cells that follow.
data.cache()
val numData = data.count()
println(numData)

7395


In [6]:
// Naive Bayes input: parsed exactly like `data`, except that every negative
// feature value is replaced by 0.0.
val nbData = records.map { fields =>
    val cleaned = fields.map(_.replaceAll("\"", ""))
    val labelIdx = fields.size - 1
    val label = cleaned(labelIdx).toInt
    val features = cleaned.slice(4, labelIdx).map { raw =>
        // "?" stands in for a missing value and is read as 0.0.
        val value = if (raw == "?") 0.0 else raw.toDouble
        if (value < 0) 0.0 else value
    }
    LabeledPoint(label, Vectors.dense(features))
}

In [7]:
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy

// Passed as the iteration count to the two SGD-based trainers.
val numIterations: Int = 10
// Passed as the depth limit to the decision tree trainer.
val maxTreeDepth: Int = 5

In [8]:
// Train the logistic regression model on `data`, passing `numIterations`.
val lrModel = LogisticRegressionWithSGD.train(
    data,
    numIterations
)

In [9]:
// Train the support vector machine model on `data`, passing `numIterations`.
val svmModel = SVMWithSGD.train(
    data,
    numIterations
)

In [17]:
// Train the Naive Bayes model. It is fed `nbData` (the variant with negative
// feature values replaced by 0.0) rather than `data`.
val nbModel = NaiveBayes.train(
    nbData
)

In [11]:
// Train the decision tree on `data` as a classifier, using Entropy as the
// impurity setting and `maxTreeDepth` as the depth argument.
val dtModel = DecisionTree.train(
    data,
    Algo.Classification,
    Entropy,
    maxTreeDepth
)

In [12]:
// Score a single example: take the first point of `data`, ask the logistic
// regression model for its prediction and show it next to the true label.
val dataPoint = data.first()
val prediction = lrModel.predict(dataPoint.features)
println(s"Prediction: $prediction")
val trueLabel = dataPoint.label
println(s"True Label: $trueLabel")

Prediction: 1.0
True Label: 0.0


In [13]:
// Score all of `data` in one call by handing the model an RDD of feature
// vectors, then print the first five results as a comma-separated list.
val predictions = lrModel.predict(data.map(_.features))
println(predictions.take(5).mkString(","))

1.0,1.0,1.0,1.0,1.0


In [14]:
// Logistic regression accuracy on `data`: emit 1 for each point whose
// prediction equals its label, 0 otherwise, and add the results up.
val lrTotalCorrect = data.map { example =>
    val hit = lrModel.predict(example.features) == example.label
    if (hit) 1 else 0
}.sum

// Fraction of the `numData` points that were predicted correctly.
val lrAccuracy = lrTotalCorrect / numData
println(s"Logistic Regression Accuracy: $lrAccuracy")

Logistic Regression Accuracy: 0.5146720757268425


In [19]:
// Correct-prediction counts for the remaining models, computed the same way as
// for logistic regression: 1 per matching prediction, 0 otherwise, summed.
val svmTotalCorrect = data.map { example =>
    val hit = svmModel.predict(example.features) == example.label
    if (hit) 1 else 0
}.sum

// Naive Bayes is evaluated on `nbData`, the dataset it was trained on.
val nbTotalCorrect = nbData.map { example =>
    val hit = nbModel.predict(example.features) == example.label
    if (hit) 1 else 0
}.sum

val dtTotalCorrect = data.map { example =>
    // The tree's output is cut at 0.5 to get a 0/1 label before comparing.
    val predictedLabel = if (dtModel.predict(example.features) > 0.5) 1 else 0
    if (predictedLabel == example.label) 1 else 0
}.sum

// Accuracy = correct predictions divided by the number of points.
val svmAccuracy = svmTotalCorrect / numData
println(s"SVM Accuracy: $svmAccuracy")
val nbAccuracy = nbTotalCorrect / numData
println(s"Naive Bayes Accuracy: $nbAccuracy")
val dtAccuracy = dtTotalCorrect / numData
println(s"Decision Tree Accuracy: $dtAccuracy")

SVM Accuracy: 0.5146720757268425
Naive Bayes Accuracy: 0.5803921568627451
Decision Tree Accuracy: 0.6482758620689655


In [21]:
//Metrics
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
/**
 * Builds the summary triple for one model from its (score, label) pairs.
 *
 * Runs on the driver only (it is never called from inside an RDD closure).
 *
 * @param modelName      label used for the model in the printed report
 * @param scoreAndLabels pairs of (model output, true label)
 * @return (modelName, area under the PR curve, area under the ROC curve)
 */
def summarizeMetrics(
    modelName: String,
    scoreAndLabels: org.apache.spark.rdd.RDD[(Double, Double)]
): (String, Double, Double) = {
    val curves = new BinaryClassificationMetrics(scoreAndLabels)
    (modelName, curves.areaUnderPR, curves.areaUnderROC)
}

// Linear models: the value returned by predict is used directly as the score.
// NOTE(review): the bulk-prediction output above suggests predict returns hard
// 0.0/1.0 labels here, so these curves would be built from a single operating
// point - confirm whether raw scores were intended instead.
val metrics = Seq(lrModel, svmModel).map { model =>
    val scoreAndLabels = data.map { point =>
        (model.predict(point.features), point.label)
    }
    summarizeMetrics(model.getClass.getSimpleName, scoreAndLabels)
}

// Naive Bayes: scored on nbData, with the prediction cut at 0.5 into 1.0 / 0.0.
val nbMetrics = Seq(nbModel).map { model =>
    val scoreAndLabels = nbData.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
    }
    summarizeMetrics(model.getClass.getSimpleName, scoreAndLabels)
}

// Decision tree: scored on data, with the same 0.5 cut applied.
val dtMetrics = Seq(dtModel).map { model =>
    val scoreAndLabels = data.map { point =>
        val score = model.predict(point.features)
        (if (score > 0.5) 1.0 else 0.0, point.label)
    }
    summarizeMetrics(model.getClass.getSimpleName, scoreAndLabels)
}

// Report every model's areas as percentages with four decimals.
val allMetrics = metrics ++ nbMetrics ++ dtMetrics
allMetrics.foreach{ case (m, pr, roc) =>
    println(f"$m, Area under PR: ${pr * 100.0}%2.4f%%, Area under ROC: ${roc * 100.0}%2.4f%%")
}

LogisticRegressionModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
SVMModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
NaiveBayesModel, Area under PR: 68.0851%, Area under ROC: 58.3559%
DecisionTreeModel, Area under PR: 74.3081%, Area under ROC: 64.8837%
