In [1]:
@file:DependsOn("com.londogard:nlp:1.2.0-BETA2")
%use dataframe

# Classifying IMDB Reviews

This is a simple end-2-end example of text classification

In [91]:
val df = DataFrame.readCSV("imdb_small.csv")
df.head()

Let's validate that we have evenly distributed data.

In [92]:
df.valueCounts("sentiment")

Building a simple classifier directly on this, comparing Na√Øve Bayes with Logisitc Regression.

1. Tokenize Data
2. Encode Data
  - BagOfWords
  - TF-IDF
  - BM-25
  - ...
3. Classify Data

In [93]:
import com.londogard.nlp.tokenizer.SimpleTokenizer

val tokenizer = SimpleTokenizer()
val dfWithTokens = df.add("tokens") { tokenizer.split(review) }
dfWithTokens.head()

In [95]:
val dfToNaiveBayes = dfWithTokens.add("category") { if (sentiment == "positive") 1 else 0 }
dfToNaiveBayes.head()

In [88]:
data class SplittedData<T, I>(val xTrain: List<T>, val xValid: List<T>, val yTrain: List<I>, val yValid: List<I>)

fun <T, I> splitToTrainTest(x: List<T>, y: List<I>, split: Float = 0.7f): SplittedData<T, I> {
    require(x.size == y.size) { "x and y must be equal length! Is ${x.size} != ${y.size}" }
    val (xSize, ySize) = (x.size * split).toInt() to (y.size * split).toInt()

    return SplittedData(x.slice(0 until xSize), x.slice(xSize until x.size), y.slice(0 until ySize), y.slice(ySize until y.size))
}

In [96]:
import com.londogard.nlp.meachinelearning.vectorizer.count.CountVectorizer
import org.jetbrains.kotlinx.multik.api.mk
import org.jetbrains.kotlinx.multik.api.ndarray

val vectorizer = CountVectorizer<Int>()
val x = dfToNaiveBayes.get{ tokens } .toList()
val yArray = dfToNaiveBayes.get { category }.toIntArray()

val splitData = splitToTrainTest(x, y)

In [97]:
import com.londogard.nlp.meachinelearning.predictors.classifiers.NaiveBayes
import com.londogard.nlp.meachinelearning.predictors.classifiers.LogisticRegression
import com.londogard.nlp.meachinelearning.predictors.asAutoOneHotClassifier


val xTrain = vectorizer.fitTransform(splitData.xTrain)
val xValid = vectorizer.transform(splitData.xValid)

In [109]:
import org.jetbrains.kotlinx.multik.api.mk.get

val classifier = LogisticRegression().asAutoOneHotClassifier()
classifier.fit(xTrain, mk.ndarray(splitData.yTrain))

In [110]:
val yPred = classifier.predictSimple(xValid)
yPred[0..10]

Line_3619.jupyter-kts (2:1 - 13) Unresolved reference. None of the following candidates is applicable because of receiver type mismatch: 
public operator fun MatchGroupCollection.get(name: String): MatchGroup? defined in kotlin.text
Line_3619.jupyter-kts (2:6 - 13) No get method providing array access

In [107]:
import com.londogard.nlp.meachinelearning.metrics.Metrics

Metrics.accuracy(yPred.reshape(yPred.size, 1), mk.ndarray(splitData.yValid).reshape(yPred.size, 1))

0.5393333333333333

How about we rather use **TF-IDF**?