# Language Classifier

### Setup und Helpers

In [1]:
import numpy as np

In [2]:
# findspark locates the local Spark installation and makes `pyspark` importable;
# this cell therefore has to run before the cell that imports pyspark.
import findspark
findspark.init()

In [14]:
import pyspark
import sys
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.evaluation import MulticlassMetrics

### Übung 1

Erstelle einen neuen SparkContext mit dem Namen "LanguageClassifier".

In [4]:
# Create the SparkContext for this notebook, or reuse the one that is already
# running. Calling the SparkContext constructor directly raises an error when
# this cell is executed a second time in the same kernel; getOrCreate makes the
# cell safe to re-run.
# NOTE: if a context already exists, it is returned as-is and the app name
# below is not applied to it.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("LanguageClassifier")
)

In [5]:
def printRDD(rdd, part = True):
    """Print the elements of an RDD, one per line.

    Args:
        rdd: Object exposing ``take(n)`` and ``collect()``.
        part: When truthy (the default) only the first 10 elements are
            fetched via ``take(10)``; otherwise every element is fetched
            via ``collect()``.
    """
    rows = rdd.take(10) if part else rdd.collect()
    for row in rows:
        print(row)

### Übung 2

Ergänze die Methode "readDataFiles". Erstelle eine Variable mit dem Namen "dataRDD" und lade über die SparkContext-Methode "wholeTextFiles" nur den Text (nicht den Dateipfad) in das RDD.

In [6]:
def readDataFiles(path):
    """Load the text content of all files under ``path`` into an RDD.

    Uses the module-level SparkContext ``sc``. ``wholeTextFiles`` yields
    (file path, file content) pairs; only the content (index 1) is kept.

    Args:
        path: Directory (or path pattern) handed to ``sc.wholeTextFiles``.

    Returns:
        RDD containing one text string per file.
    """
    path_and_text = sc.wholeTextFiles(path)
    dataRDD = path_and_text.map(lambda pair: pair[1])  # drop the path, keep the text
    return dataRDD

In [7]:
# Root folder of the corpus; each language lives in its own locale sub-folder.
path = "language-classifier-data"

# One text RDD per language, loaded in the same order as before
# (German, English, Spanish, French, Italian, Dutch).
deRDD, enRDD, esRDD, frRDD, itRDD, nlRDD = (
    readDataFiles(path + locale_dir)
    for locale_dir in ("/de_DE", "/en_UK", "/es_ES", "/fr_FR", "/it_IT", "/nl_NL")
)

In [8]:
# Preview the German texts; printRDD defaults to part=True, so only the first 10 are shown.
printRDD(deRDD)

für aktuelle tweets live von der berlinale schaut mal bei vorbei
ff aufgrund aktueller anlässe und monazaki wäre sie auf twitter dabei
danke für euer feedbackhilft uns immer und wir arbeiten stetstwittter auch auf deutsch zu optimieren
danke für die vielen empfehlungen es macht immer spa euere zu lesen keep em coming wm2010
wir haben eine liste von ausgewählten konten für die wm aufgestellt habt ihr noch empfehlungen spieler usw wm2010
hast du ein iphone hast du schon twitter für iphone unsere offizielle app heruntergeladen nein na auf was wartest du
die wm ist die gröte sportveranstaltung der welt wie können medienfirmen dieses potenzial ausnutzen unsere ideen
ups musikmontag verpasst hier die deutsche liste empfohlener benutzer in die kategorie musik
gestern haben wir schon für ff vorgeschlagen sie war ja eigentlich euere empfehlung also habt ihr sonst noch welche
freunde die liste deutscher empfohlenen benutzer ist live weitere empfehlungen für jede kategorie sind wilkommen


### Feature Vector erstellen

In [9]:
# Alphabet used for the features: the 26 basic Latin letters followed by the
# accented vowels and the n-tilde listed below. Whitespace, digits and
# punctuation are deliberately absent, so bigrams containing them are ignored.
charset = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'à', 'á', 'â', 'ã', 'ä', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
    'ñ', 'ò', 'ó', 'ô', 'ö', 'ù', 'ú', 'û', 'ü',
]
print("size of character set", len(charset))

# Map every ordered character pair to its position in the feature vector.
# Positions are assigned row by row: first character selects the row,
# second character the column.
bigram_dict = {
    first + second: row * len(charset) + col
    for row, first in enumerate(charset)
    for col, second in enumerate(charset)
}

print("number of bigrams", len(bigram_dict))

size of character set 48
number of bigrams 2304


In [10]:
def initBigramVector():
    """Return a fresh all-zero int32 vector with one entry per key in ``bigram_dict``."""
    n_features = len(bigram_dict)
    return np.zeros(n_features, dtype=np.int32)

### Bigrams erstellen

In [11]:
def getBigramVector(text):
    """Count the known character bigrams of ``text``.

    Every pair of adjacent characters is lower-cased character by character
    and looked up in ``bigram_dict``; pairs that are not in the dictionary
    (for example those containing a space or a digit) are skipped.

    Args:
        text: String to analyse. Strings shorter than two characters yield
            an all-zero vector.

    Returns:
        The vector produced by ``initBigramVector()`` with the count of each
        known bigram stored at the index given by ``bigram_dict``.
    """
    counts = initBigramVector()

    # zip pairs each character with its successor: (t[0], t[1]), (t[1], t[2]), ...
    for first, second in zip(text, text[1:]):
        slot = bigram_dict.get(first.lower() + second.lower())
        if slot is not None:
            counts[slot] += 1

    return counts

In [15]:
# Raise NumPy's summarization threshold so the complete vector is printed
# instead of an abbreviated view with "...".
# NOTE(review): set_printoptions changes the print settings for the rest of the session.
np.set_printoptions(threshold=sys.maxsize)
print(getBigramVector("Ich bin ein kurzer Text"))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

### Trainings- und Testdaten erstellen

In [16]:
def _to_labeled_points(textRDD, label):
    """Turn an RDD of texts into LabeledPoints carrying `label` and the text's bigram counts."""
    return textRDD.map(lambda text: LabeledPoint(label, getBigramVector(text)))


# Class labels: 0.0 German, 1.0 English, 2.0 Spanish, 3.0 French, 4.0 Italian, 5.0 Dutch.
# The per-language RDDs are appended in label order, German first.
dataRDD = _to_labeled_points(deRDD, 0.0)
for _label, _textRDD in ((1.0, enRDD), (2.0, esRDD), (3.0, frRDD), (4.0, itRDD), (5.0, nlRDD)):
    dataRDD = dataRDD.union(_to_labeled_points(_textRDD, _label))

In [17]:
# Number of labeled points in the combined six-language data set.
print(dataRDD.count())

9066


In [18]:
# Fetch the first labeled point once and inspect both of its fields; calling
# dataRDD.first() separately for each print would run the Spark job twice.
first_point = dataRDD.first()
print(first_point.label)
print(first_point.features)

0.0
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

### Übung 3

Teile das "dataRDD" in ein Trainings- und ein Test-RDD mit den Namen "trainingRDD" und "testRDD" auf. Dies kannst du mit "randomSplit" machen. Das Trainingsset sollte 80 % der Daten enthalten und das Testset 20 %.

In [19]:
# 80/20 train/test split. A fixed seed keeps the split, and with it the
# accuracy reported below, the same from run to run; without it every
# execution of the notebook evaluates on a different test set.
trainingRDD, testRDD = dataRDD.randomSplit([0.8, 0.2], seed=42)

### Klassifizieren und Bewerten

In [20]:
# Fit a Naive Bayes classifier on the training split; the second argument (1.0)
# is the additive smoothing parameter (`lambda_`) of NaiveBayes.train.
model = NaiveBayes.train(trainingRDD, 1.0)

In [21]:
# Pair the predicted class of every held-out point with its true label, in the
# (prediction, label) order that MulticlassMetrics receives below.
predictionAndLabels = testRDD.map(
    lambda point: (float(model.predict(point.features)), point.label)
)
metrics = MulticlassMetrics(predictionAndLabels)
print("Accuracy:", metrics.accuracy)

Accuracy: 0.9738636363636364


### Übung 4

Teste den Classifier. Rufe auf dem Modell die Methode "predict" auf. Zuerst muss der Bigram-Vektor des Textes ermittelt werden. Dies kannst du mit der Methode "getBigramVector" machen.

In [22]:
# German sample sentence; label 0.0 is the class given to the German texts when dataRDD was built.
txt = "Ich bin ein kurzer Text"
print(txt, model.predict(getBigramVector(txt)))

Ich bin ein kurzer Text 0.0


In [23]:
# English sample sentence; label 1.0 is the class given to the English texts when dataRDD was built.
txt = "I am a short text"
print(txt, model.predict(getBigramVector(txt)))

I am a short text 1.0


In [24]:
# French sample sentence; label 3.0 is the class given to the French texts when dataRDD was built.
txt = "Je suis un petit texte"
print(txt, model.predict(getBigramVector(txt)))

Je suis un petit texte 3.0
