In [3]:
from pyspark.sql import SparkSession 
from pyspark.ml.feature import CountVectorizer,Tokenizer,StopWordsRemover
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("SparkML Depression Analysis")
    .getOrCreate()
)

# Load the tab-separated file; column types are inferred and, since no header
# option is set, the columns arrive with Spark's default names (_c0, _c1).
dataDF = (
    spark.read
    .options(inferSchema="True", delimiter="\t")
    .csv("datasets/movie_turkish_train.txt")
)
dataDF.show()

# Give the two columns meaningful names: text first, class label second.
dataDF = (
    dataDF
    .withColumnRenamed("_c0", "tweet")
    .withColumnRenamed("_c1", "label")
)

# Feature-extraction stages: text -> tokens -> filtered tokens -> count vector.
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
# NOTE(review): no stop-word list is passed, so the library default is used.
# The sample rows look Turkish -- confirm the default list is appropriate.
# Changing it alters the vocabulary size, so any downstream code that assumes
# a fixed feature width must be checked as well.
remover = StopWordsRemover(inputCol="tokens", outputCol="no-stop-words")
vectorizer = CountVectorizer(inputCol="no-stop-words", outputCol="features")

dataDF = remover.transform(tokenizer.transform(dataDF))
# NOTE(review): the vocabulary is learned from every row, including rows that
# are later held out for testing.
dataDF = vectorizer.fit(dataDF).transform(dataDF)
dataDF.show()

+--------------------+---+
|                 _c0|_c1|
+--------------------+---+
|bunu da gördük uz...|  0|
|filmi begenmeyenl...|  0|
|günde 1 film izle...|  0|
|afisine bakip ald...|  0|
|sadece insanlarin...|  0|
|ucuz bir aksiyon ...|  0|
|olmamis diyor pua...|  0|
|kesinlikle çok kö...|  0|
|ben nasil bir fil...|  0|
|bu yüzler fazla t...|  0|
|. çok ilginç yaa ...|  0|
|bence bu film hiç...|  0|
|valla ben begenme...|  0|
|gönül ister milyo...|  0|
|dün aksam bu film...|  0|
|cok kötü olmus......|  0|
|igrenç yaaa.acaba...|  0|
|saçma sapan bir f...|  0|
|çok güzel komedi ...|  0|
|beklentimi yüksek...|  0|
+--------------------+---+
only showing top 20 rows

+--------------------+-----+--------------------+--------------------+--------------------+
|               tweet|label|              tokens|       no-stop-words|            features|
+--------------------+-----+--------------------+--------------------+--------------------+
|bunu da gördük uz...|    0|[bunu, da, gördük...|[

In [4]:
# Train and evaluate a multilayer perceptron on the vectorized text.
#
# Improvements over the previous version of this cell:
#   * the input-layer width is read from the data instead of being hard-coded,
#     so the cell keeps working when the vocabulary size changes;
#   * the train/test split and the classifier are seeded, so a
#     Restart & Run All reproduces the same accuracy.

SEED = 42          # fixed seed for the split and the classifier
HIDDEN_UNITS = 2   # width of the single hidden layer
NUM_CLASSES = 2    # output layer width; assumes labels are 0/1 -- TODO confirm

# Keep only the columns the classifier and evaluator need.
dataDF = dataDF.select('features', 'label')
dataDF.show()

dataDF.printSchema()

# Derive the input-layer width from the first feature vector.
firstRow = dataDF.first()
if firstRow is None:
    raise ValueError("dataDF is empty; there is nothing to train on")
numFeatures = len(firstRow['features'])

trainDF, testDF = dataDF.randomSplit([0.75, 0.25], seed=SEED)

mlpClassifier = MultilayerPerceptronClassifier(
    layers=[numFeatures, HIDDEN_UNITS, NUM_CLASSES],
    seed=SEED,
)
model = mlpClassifier.fit(trainDF)

resultDF = model.transform(testDF)  # adds the prediction column for the held-out rows

# Accuracy on the held-out split (evaluator uses its default label/prediction columns).
eva = MulticlassClassificationEvaluator(metricName='accuracy')
accuracy = eva.evaluate(resultDF)
print("Test Accuracy : ",accuracy)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(7538,[14,16,27,1...|    0|
|(7538,[0,1,5,6,50...|    0|
|(7538,[0,1,4,11,2...|    0|
|(7538,[2,3,4,13,3...|    0|
|(7538,[0,1,14,17,...|    0|
|(7538,[0,3,5,36,4...|    0|
|(7538,[12,178,510...|    0|
|(7538,[0,1,2,7,21...|    0|
|(7538,[0,1,6,16,2...|    0|
|(7538,[3,4,14,17,...|    0|
|(7538,[0,1,2,9,12...|    0|
|(7538,[1,3,6,8,9,...|    0|
|(7538,[0,2,29,38,...|    0|
|(7538,[3,6,21,24,...|    0|
|(7538,[3,30,38,49...|    0|
|(7538,[7,19,27,32...|    0|
|(7538,[0,1,3,5,17...|    0|
|(7538,[0,3,11,30,...|    0|
|(7538,[2,5,8,27,1...|    0|
|(7538,[0,5,35,59,...|    0|
+--------------------+-----+
only showing top 20 rows

root
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)

Test Accuracy :  0.78515625
