# Turkish Text Classification with Spark NLP

## Initializing PySpark on Colab

In [1]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 58kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 42.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130389 sha256=ccac6edcc83c80289e75f0784b64aacaef90f5345ac1e4d5d97b244c3f95bf29
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d18423005

## Starting the Spark Session

In [2]:
import sparknlp

# Start (or reuse) a Spark session configured for Spark NLP.
spark = sparknlp.start()

# A notebook cell only echoes its last expression, so the Spark NLP version
# has to be printed explicitly or it is silently discarded.
print("Spark NLP version:", sparknlp.version())

# Apache Spark version (echoed as the cell result).
spark.version

'2.4.4'

In [3]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
#from sparknlp.embeddings import *

## Loading and Reading the Dataset

In [4]:
! wget https://raw.githubusercontent.com/murat-gunay/NLP/master/02_NLP_Projects/2-project_2_Turkish_sparkNLP_Classification/turkish_categorical_corpus.csv

--2020-10-03 09:19:27--  https://raw.githubusercontent.com/murat-gunay/NLP/master/02_NLP_Projects/2-project_2_Turkish_sparkNLP_Classification/turkish_categorical_corpus.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10627541 (10M) [text/plain]
Saving to: ‘turkish_categorical_corpus.csv’


2020-10-03 09:19:28 (56.3 MB/s) - ‘turkish_categorical_corpus.csv’ saved [10627541/10627541]



In [5]:
# Load the corpus into a Spark DataFrame; the first CSV row holds the column names.
df_Spark = spark.read.csv("turkish_categorical_corpus.csv", header=True)

In [6]:
# Preview the first five rows, cutting each cell off at 100 characters.
df_Spark.show(n=5, truncate=100)

+--------+----------------------------------------------------------------------------------------------------+
|category|                                                                                                text|
+--------+----------------------------------------------------------------------------------------------------+
|siyaset | 3 milyon ile ön seçim vaadi mhp nin 10 olağan büyük kurultayı nda konuşan genel başkan adayı kor...|
|siyaset | mesut_yılmaz yüce_divan da ceza alabilirdi prof dr sacit adalı isviçre deki banka eski başbakan ...|
|siyaset | disko lar kaldırılıyor başbakan_yardımcısı arınç disko diye tabir edilen disiplin koğuşlarının k...|
|siyaset | sarıgül anayasa_mahkemesi ne gidiyor mustafa_sarıgül ilçedeki sınır değişikliğine itiraz için an...|
|siyaset | erdoğan idamın bir haklılık sebebi var demek ki yeri geldiği zaman idamın bir haklılık sebebi de...|
+--------+----------------------------------------------------------------------------------------------

In [7]:
# Number of documents per category (checks how balanced the classes are).
(
    df_Spark
    .groupBy("category")
    .count()
    .show()
)

+----------+-----+
|  category|count|
+----------+-----+
|   kultur |  700|
|  siyaset |  700|
|teknoloji |  700|
|   saglik |  700|
|  ekonomi |  700|
|     spor |  700|
|    dunya |  700|
+----------+-----+



## Removing extraneous underscores from the documents

In [9]:
# Inspect the first two rows in full to see the raw text.
df_Spark.take(2)

# Multi-word names are joined with "_" (underscores), e.g. "mesut_yılmaz", "koray_aydın";
# these underscores should be removed before tokenization.

[Row(category='siyaset ', text=' 3 milyon ile ön seçim vaadi mhp nin 10 olağan büyük kurultayı nda konuşan genel başkan adayı koray_aydın seçimlerden önce partinin üye sayısının 3 milyona ulaştırılması hedefini koyarak ön seçim uygulaması vaadinde bulundu mhp nin 10 olağan büyük kurultayı nda konuşan genel başkan adayı koray_aydın seçimlerden önce partinin üye sayısının 3 milyona ulaştırılması hedefini koyarak ön seçim uygulaması vaadinde bulundu genel_başkan adayı koray_aydın kürsüye beklenirken yapılan tezahüratlar ve ıslıklamalar üzerine divan başkanı tuğrul_türkeş mhp nin genel başkanlığı da genel başkan adaylığı da saygıdeğer işlerdir bu salondaki herkes ciddiye almak zorundadır dedi ve taşkınlıklara izin verilmeyeceğini salonda sükunet sağlanmadan konuşmaların başlamayacağını vurguladı türkeş devlet_bahçeli nin kurultay açılışında konuştuğu için adaylık nedeniyle ikinci bir konuşma yapmayacağını açıkladı konuşmasında kurultayın mhp nin tek başına iktidarına vesile olmasını dileye

In [8]:
# Import only the function that is used: a wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, min, max,
# abs and round for every later cell.
from pyspark.sql.functions import regexp_replace

# Replace every "_" in the text column with a space so that joined names
# ("mesut_yılmaz") become separate words ("mesut yılmaz").
df_Spark = df_Spark.withColumn('text', regexp_replace('text', '_', ' '))

In [9]:
# Preview the first five rows again to confirm the underscores are gone.
df_Spark.show(n=5, truncate=100)

+--------+----------------------------------------------------------------------------------------------------+
|category|                                                                                                text|
+--------+----------------------------------------------------------------------------------------------------+
|siyaset | 3 milyon ile ön seçim vaadi mhp nin 10 olağan büyük kurultayı nda konuşan genel başkan adayı kor...|
|siyaset | mesut yılmaz yüce divan da ceza alabilirdi prof dr sacit adalı isviçre deki banka eski başbakan ...|
|siyaset | disko lar kaldırılıyor başbakan yardımcısı arınç disko diye tabir edilen disiplin koğuşlarının k...|
|siyaset | sarıgül anayasa mahkemesi ne gidiyor mustafa sarıgül ilçedeki sınır değişikliğine itiraz için an...|
|siyaset | erdoğan idamın bir haklılık sebebi var demek ki yeri geldiği zaman idamın bir haklılık sebebi de...|
+--------+----------------------------------------------------------------------------------------------

## Splitting the dataset into training and testing sets

In [10]:
# 80/20 random split into training and test sets; the fixed seed keeps the
# split repeatable between runs.
train_news, test_news = df_Spark.randomSplit(weights=[0.8, 0.2], seed=100)

In [13]:
# Number of rows in the training split.
train_news.count()

3889

In [14]:
# Number of rows in the test split.
test_news.count()

1011

## Setting up the Pipeline for the ``LogisticRegression`` and ``NaiveBayes`` Models

In [9]:
# Wrap the raw "text" column as Spark NLP document annotations.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into sentences.
sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

# Split each sentence into tokens.
token = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Drop Turkish stop words (pretrained list), ignoring letter case.
stop_words = (
    StopWordsCleaner.pretrained('stopwords_tr', 'tr')
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Lemmatize the remaining tokens with the pretrained Turkish lemmatizer.
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "tr")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Optional part-of-speech tagger (currently disabled):
# pos = PerceptronModel.pretrained("pos_ud_imst", "tr") \
#       .setInputCols(["document", "token"]) \
#       .setOutputCol("pos")

# Turn the lemma annotations into a plain array column ("token_features")
# for the Spark ML feature stages; the annotation columns are kept.
finisher = (
    Finisher()
    .setInputCols(["lemma"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

stopwords_tr download started this may take some time.
Approximate size to download 2 KB
[OK!]
lemma download started this may take some time.
Approximate size to download 14.8 MB
[OK!]


In [10]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Text Classification with `LogisticRegression`

In [11]:
# Term-frequency features: hash the lemmatized tokens into 4000 buckets.
hashTF = HashingTF(inputCol="token_features", outputCol="raw_features", numFeatures=4000)

# Re-weight term frequencies by inverse document frequency; terms that occur
# in fewer than 5 documents are given zero weight.
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=5)

# Fit the label indexer up front (on the training split) so that its label
# list is available to IndexToString below. The fitted model is used as a
# pipeline stage in place of the unfitted estimator.
label_strIdx = StringIndexer(inputCol="category", outputCol="label").fit(train_news)

logReg = LogisticRegression(maxIter=10)

# Signature with default values, kept for reference:
# LogisticRegression(featuresCol='features', labelCol='label', predictionCol='prediction', 
#                    maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, 
#                    fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol='probability', 
#                    rawPredictionCol='rawPrediction', standardization=True, weightCol=None, 
#                    aggregationDepth=2, family='auto', lowerBoundsOnCoefficients=None, 
#                    upperBoundsOnCoefficients=None, lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)

# Decode the *predicted* index back to its category name. Reading "label"
# here (as before) only reproduced the true category, so "article_class"
# never reflected the model's output. The prediction column carries no label
# metadata, hence the explicit label list from the fitted indexer.
label_Idxstr = IndexToString(inputCol="prediction", outputCol="article_class",
                             labels=label_strIdx.labels)

nlp_pipeline_lr = Pipeline(
        stages=[document, 
                sentence,
                token,
                stop_words, 
                lemmatizer, 
                finisher,
                hashTF,
                idf,
                label_strIdx,
                logReg,
                label_Idxstr])

In [12]:
# Fit every stage of the logistic-regression pipeline on the training split.
classification_model_lr = nlp_pipeline_lr.fit(dataset=train_news)

In [14]:
# Apply the fitted pipeline to the held-out test split.
pred_lr = classification_model_lr.transform(dataset=test_news)

In [27]:
# Show the full prediction frame, including every intermediate pipeline column.
pred_lr.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+-------------+
|category|                text|            document|            sentence|               token|         cleanTokens|               lemma|      token_features|        raw_features|            features|label|       rawPrediction|         probability|prediction|article_class|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+-------------+
|  dunya | 140 araç birbiri...|[[document, 0, 94...|[[document, 1, 94...|[[token, 1, 3, 14...|[[token, 1, 3, 14...|[[token, 1, 3, 14...|[140, araç, birbi...|(4000,[103,127,13...|(40

In [21]:
# Compare the true category / label index with the predicted index for the first five rows.
pred_lr.select("category", "label", "prediction").show(5)

+--------+-----+----------+
|category|label|prediction|
+--------+-----+----------+
|  dunya |  3.0|       3.0|
|  dunya |  3.0|       3.0|
|  dunya |  3.0|       3.0|
|  dunya |  3.0|       3.0|
|  dunya |  3.0|       3.0|
+--------+-----+----------+
only showing top 5 rows



- Evaluation of Classification (`LogisticRegression`)

In [22]:
# Accuracy of the logistic-regression predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(pred_lr)

# Report accuracy and its complement (the test error).
print("Accuracy = %g" % accuracy)
print("Test Error = %g " % (1.0 - accuracy,))

Accuracy = 0.896142
Test Error = 0.103858 


In [19]:
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Collect the label columns of the test predictions into pandas for scikit-learn.
df_lr = (
    classification_model_lr
    .transform(test_news)
    .select("category", "label", "prediction")
    .toPandas()
)

In [None]:
# Peek at the first rows of the collected pandas frame.
df_lr.head()

Unnamed: 0,category,label,prediction
0,dunya,3.0,3.0
1,dunya,3.0,3.0
2,dunya,3.0,3.0
3,dunya,3.0,3.0
4,dunya,3.0,3.0


In [None]:
# Per-class precision / recall / F1 for logistic regression (classes are label indices).
report_lr = classification_report(y_true=df_lr["label"], y_pred=df_lr["prediction"])
print(report_lr)

              precision    recall  f1-score   support

         0.0       0.94      0.93      0.93       135
         1.0       0.86      0.81      0.83       140
         2.0       0.86      0.88      0.87       142
         3.0       0.82      0.89      0.86       142
         4.0       0.91      0.92      0.91       144
         5.0       0.88      0.88      0.88       153
         6.0       0.98      0.95      0.96       155

    accuracy                           0.89      1011
   macro avg       0.89      0.89      0.89      1011
weighted avg       0.89      0.89      0.89      1011



## Text Classification with `NaiveBayes`

In [23]:
# Term-frequency features: hash the lemmatized tokens into 4000 buckets.
hashTF = HashingTF(inputCol="token_features", outputCol="raw_features", numFeatures=4000)

# Re-weight term frequencies by inverse document frequency; terms that occur
# in fewer than 5 documents are given zero weight.
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=5)

# Fit the label indexer up front (on the training split) so that its label
# list is available to IndexToString below. The fitted model is used as a
# pipeline stage in place of the unfitted estimator.
label_strIdx = StringIndexer(inputCol="category", outputCol="label").fit(train_news)

# Naive Bayes classifier with a smoothing value of 111.
bayes_class = NaiveBayes(smoothing=111)

# Decode the *predicted* index back to its category name. Reading "label"
# here (as before) only reproduced the true category, so "article_class"
# never reflected the model's output. The prediction column carries no label
# metadata, hence the explicit label list from the fitted indexer.
label_Idxstr = IndexToString(inputCol="prediction", outputCol="article_class",
                             labels=label_strIdx.labels)

nlp_pipeline_bayes = Pipeline(
    stages=[document, 
            sentence,
            token,
            stop_words, 
            lemmatizer, 
            finisher,
            hashTF,
            idf,
            label_strIdx,
            bayes_class,
            label_Idxstr])

In [24]:
# Fit every stage of the Naive Bayes pipeline on the training split.
classification_model_bayes = nlp_pipeline_bayes.fit(dataset=train_news)

In [25]:
# Apply the fitted Naive Bayes pipeline to the held-out test split.
pred_bayes = classification_model_bayes.transform(dataset=test_news)

In [None]:
# Compare the true category / label index with the predicted index for the first five rows.
pred_bayes.select("category", "label", "prediction").show(5)

+--------+-----+----------+
|category|label|prediction|
+--------+-----+----------+
|  dunya |  3.0|       3.0|
|  dunya |  3.0|       3.0|
|  dunya |  3.0|       1.0|
|  dunya |  3.0|       1.0|
|  dunya |  3.0|       1.0|
+--------+-----+----------+
only showing top 5 rows



- Evaluation of Classification (`NaiveBayes`)



In [26]:
# Accuracy of the Naive Bayes predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(pred_bayes)

# Report accuracy and its complement (the test error).
print("Accuracy = %g" % accuracy)
print("Test Error = %g " % (1.0 - accuracy,))

Accuracy = 0.872404
Test Error = 0.127596 


In [None]:
# Collect the label columns of the test predictions into pandas for scikit-learn.
df_bayes = (
    classification_model_bayes
    .transform(test_news)
    .select("category", "label", "prediction")
    .toPandas()
)

In [None]:
# Per-class precision / recall / F1 for Naive Bayes (classes are label indices).
report_bayes = classification_report(y_true=df_bayes["label"], y_pred=df_bayes["prediction"])
print(report_bayes)

              precision    recall  f1-score   support

         0.0       0.91      0.93      0.92       135
         1.0       0.75      0.86      0.80       140
         2.0       0.77      0.93      0.84       142
         3.0       0.91      0.68      0.78       142
         4.0       0.90      0.92      0.91       144
         5.0       0.88      0.83      0.85       153
         6.0       0.99      0.91      0.95       155

    accuracy                           0.87      1011
   macro avg       0.87      0.87      0.87      1011
weighted avg       0.87      0.87      0.87      1011



## Classification with `BertSentenceEmbeddings` and `ClassifierDL`

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz
!gzip cc.tr.300.bin.gz

--2020-10-01 20:43:46--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4506977940 (4.2G) [application/octet-stream]
Saving to: ‘cc.tr.300.bin.gz’


2020-10-01 20:49:12 (13.2 MB/s) - ‘cc.tr.300.bin.gz’ saved [4506977940/4506977940]

gzip: cc.tr.300.bin.gz already has .gz suffix -- unchanged


## Setting up the Pipeline for ``ClassifierDLApproach``

In [11]:
# NOTE: this cell re-binds document / sentence / token / stop_words /
# lemmatizer, which the earlier pipeline cells also reference by name.

# Wrap the raw "text" column as Spark NLP document annotations.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Split each document into sentences.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

# Split each sentence into tokens.
token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

# Drop Turkish stop words (pretrained list), ignoring letter case.
stop_words = StopWordsCleaner.pretrained('stopwords_tr', 'tr')\
    .setInputCols(["token"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

# Lemmatize the stop-word-filtered tokens. This previously read "token",
# which bypassed the stop-word stage and disagreed with the first pipeline's
# definition of the same variable.
lemmatizer = LemmatizerModel.pretrained("lemma", "tr") \
        .setInputCols(["cleanTokens"]) \
        .setOutputCol("lemma")

# Multilingual LaBSE sentence embeddings, computed directly from the
# "document" column; the token-level stages above do not feed into them.
embeddings = BertSentenceEmbeddings\
    .pretrained('labse', 'xx') \
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")

# Alternative (disabled): fastText word vectors averaged per document.
# embeddings = WordEmbeddings()\
#   .setInputCols(["sentence", "token"])\
#   .setOutputCol("embeddings")\
#   .setStoragePath('cc.tr.300.bin', "BINARY")\
#   .setDimension(300)

# sentence_embeddings = SentenceEmbeddings() \
#             .setInputCols(["document", "embeddings"]) \
#             .setOutputCol("sentence_embeddings") \
#             .setPoolingStrategy("AVERAGE")

# Deep-learning classifier trained on the sentence embeddings; it reads the
# string "category" column as the label and writes predictions to "class".
classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

stopwords_tr download started this may take some time.
Approximate size to download 2 KB
[OK!]
lemma download started this may take some time.
Approximate size to download 14.8 MB
[OK!]
labse download started this may take some time.
Approximate size to download 1.7 GB
[OK!]


In [14]:
# Chain the preprocessing stages, the sentence embeddings and the
# ClassifierDL estimator into one pipeline.
nlp_pipeline_bert = Pipeline().setStages([
    document,
    sentence,
    token,
    stop_words,
    lemmatizer,
    embeddings,
    classsifierdl,
])

In [13]:
# Fit the embedding + ClassifierDL pipeline on the training split.
classification_model_bert = nlp_pipeline_bert.fit(dataset=train_news)

In [15]:
# Predict on the test split and collect the true category, the text and the
# predicted class ("class.result") into pandas.
df_bert = (
    classification_model_bert
    .transform(test_news)
    .select("category", "text", "class.result")
    .toPandas()
)

In [16]:
# Peek at the first rows; "result" holds the predicted class inside a list.
df_bert.head()

Unnamed: 0,category,text,result
0,dunya,140 araç birbirine girdi 2 ölü 80 yaralı abd ...,[dunya ]
1,dunya,150 araç birbirine girdi abd de yoğun sis ned...,[dunya ]
2,dunya,150 araç birbirine girdi teksas ta etkili ola...,[dunya ]
3,dunya,2 nükleer santralin daha açılmasını istiyor j...,[ekonomi ]
4,dunya,46 5 milyon dolarlık insani yardım aldı tacik...,[ekonomi ]


In [17]:
# Take the first element of each "result" list to get the predicted label as a plain string.
df_bert["result"].str.get(0).head()

0      dunya 
1      dunya 
2      dunya 
3    ekonomi 
4    ekonomi 
Name: result, dtype: object

- Evaluation of Classification (`ClassifierDLApproach` & `BertSentenceEmbeddings`)

In [None]:
# Per-class precision / recall / F1, comparing the true category with the
# first (only) predicted label of each row.
report_bert = classification_report(y_true=df_bert.category, y_pred=df_bert.result.str[0])
print(report_bert)

              precision    recall  f1-score   support

      dunya        0.86      0.80      0.82       142
    ekonomi        0.86      0.79      0.82       140
     kultur        0.89      0.94      0.92       144
     saglik        0.88      0.95      0.91       135
    siyaset        0.85      0.85      0.85       142
       spor        0.97      0.95      0.96       155
  teknoloji        0.84      0.88      0.86       153

    accuracy                           0.88      1011
   macro avg       0.88      0.88      0.88      1011
weighted avg       0.88      0.88      0.88      1011

