# ML : Multilabel classification

In this last part, we perform multilabel classification: we predict the tags of each question from its text.
To do so, we use two frameworks: Spark MLlib and Elephas, which connects Spark pipelines with Keras.

In [1]:
import os
# Work from the user's home directory: the relative paths used further down
# ("Data/sample/Posts", "./AntBNC_lemmas_ver_001.txt") are resolved against the
# current working directory.
# expanduser("~") is used rather than os.environ['HOME'] so that the cell does
# not raise KeyError on platforms where HOME is not defined (e.g. Windows).
os.chdir(os.path.expanduser("~"))

import stack_overflow_functions.DataLoader as data_loader
import stack_overflow_functions.DataTransformation as data_transfo
from pycountry_convert import country_name_to_country_alpha3
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
import patoolib
import gdown
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.types import StructField, StructType, StringType, ArrayType,IntegerType

import pyspark
import sparknlp
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
from geopy.geocoders import Nominatim
import pyspark.sql.functions as F
from pyspark.sql.types import LongType, StringType
import pandas as pd
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import geopandas as gpd 
import matplotlib.pyplot as plt
import seaborn as sns
import json
# Seed value for random operations — presumably meant to make sampling and
# model fitting reproducible; it has to be passed explicitly wherever needed.
seed = 2020

In [2]:
# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Request console progress bars, fetch the SparkContext and wrap it in the
# SQLContext that the loading cell reads the parquet data with.
# NOTE(review): sparknlp.start() has presumably already created the context,
# in which case getOrCreate() just returns it and this conf may have no
# effect — confirm.
conf = pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true")
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlcontext = pyspark.SQLContext(sc)

In [3]:
%%time
post_dir = "Data/sample/Posts"
posts = (sqlcontext
         .read
         .format("parquet")
         .option("header",True)
         .load(post_dir)
         .sample(False, 0.01)
         .select("Id",
                  F.concat_ws(' ',F.col('Title'),F.col('Body')).alias("full_text"),
                  "Tags"
                )
        )

CPU times: user 27.2 ms, sys: 4.74 ms, total: 32 ms
Wall time: 1min 22s


In [4]:
# Rewrite the raw Tags string (presumably of the form "&lt;a&gt;&lt;b&gt;")
# into a "<split_token>"-delimited string: drop every "&lt;", turn every
# "&gt;" into the split token, then remove spaces.
_tags_without_open = F.regexp_replace(F.col('Tags'), '&lt;', '')
_tags_with_token = F.regexp_replace(_tags_without_open, "&gt;", "<split_token>")
tags_split = F.regexp_replace(_tags_with_token, " ", "")


def _drop_last_and_stringify(x):
    """Drop the last element of a list and render the rest as one string.

    The list is converted with str(), its surrounding brackets are cut off and
    all single quotes are removed, so the elements end up separated by ", ".
    Anything that is not a list yields None.
    """
    if not isinstance(x, list):
        return None
    return re.sub("'", "", str(x[:-1])[1:-1])


udf_drop = F.udf(_drop_last_and_stringify, StringType())

# Build the tag array step by step, reusing the same column:
#   1. tokenized string, 2. split on the token, 3. drop the last element
#   (presumably the empty string left after the final token) and flatten to a
#   comma-separated string, 4. split again on ",".
# Because the flattened string separates elements with ", ", every tag after
# the first keeps a leading space.
_tags_column = 'Splitted_tags'
posts = (
    posts
    .withColumn(_tags_column, tags_split)
    .withColumn(_tags_column, F.split(F.col(_tags_column), "<split_token>"))
    .withColumn(_tags_column, udf_drop(F.col(_tags_column)))
    .withColumn(_tags_column, F.split(F.col(_tags_column), ","))
    .drop('Tags')
)



In [5]:
# Name of the column that holds the raw text to clean.
input_col = "full_text"

# Regex patterns passed to the DocumentNormalizer further down. They are
# written as raw strings so the backslashes reach the regex engine untouched
# and Python does not complain about invalid escape sequences (\s, \w).
clean_up_patterns = [
    r"p&gt;",
    # NOTE(review): "\space" reads as "\s" followed by the literal "pace";
    # probably meant to be just "\s" — value kept unchanged pending confirmation.
    r"&.*?;\space",
    r"&.*?;",
    r"/.*?;",
    r"/code",
    r"/pre",
    r"/p",
    r"/a",
    r"href=",
    r"lt;",
    r"gt;",
    # Any character that is neither a word character nor whitespace.
    r"[^\w\s]",
    # Standalone runs of digits.
    r"\b\d+\b",
]


# Document assembler: reads the raw text column and produces the
# '_intermediate_results' column that the normalizer below consumes.
# (This stage does not tokenize; tokenization happens in the Tokenizer stage.)
documentAssembler = (
    DocumentAssembler()
    .setInputCol(input_col)
    .setOutputCol('_intermediate_results')
)

# Document normalizer: runs the "clean" action with the regex patterns defined
# above, substituting a single space for each match, and lowercases the text.
doc_norm = (
    DocumentNormalizer()
    .setInputCols("_intermediate_results")
    .setOutputCol(f"{input_col}_cleaned")
    .setAction("clean")
    .setPatterns(clean_up_patterns)
    .setReplacement(" ")
    .setPolicy("pretty_all")
    .setLowercase(True)
)

# Tokenizer: splits the cleaned document into the "token" column that the
# stop-word remover consumes. Tokens are additionally split on '-' and on the
# apostrophe pattern, with '(', ')', '?' and '!' declared as context characters.
# The length bounds are stated explicitly: the original called
# setMaxLength(0) and then setMaxLength(99999), so the first call was
# overridden and was evidently meant to be the minimum length.
tokenizer = (
    Tokenizer()
    .setInputCols([input_col + "_cleaned"])
    .setOutputCol("token")
    .setSplitChars(['-'])
    .setContextChars(['(', ')', '?', '!'])
    .setSplitPattern("'")
    .setMinLength(0)
    .setMaxLength(99999)
    .setCaseSensitiveExceptions(False)
)


# Stop-word removal: loads the pretrained English "stopwords_en" list and
# filters the "token" column with it, ignoring case.
Stop_words_cleaner = (
    StopWordsCleaner()
    .pretrained("stopwords_en", "en")
    .setInputCols(["token"])
    .setOutputCol(f"{input_col}_without_stopwords")
    .setCaseSensitive(False)
    .setLazyAnnotator(False)
)

# Lemmatization: maps the remaining tokens to lemmas using the dictionary file
# ./AntBNC_lemmas_ver_001.txt, read with "->" as key delimiter and a tab as
# value delimiter. The file is looked up relative to the working directory.
Lemmatizer_cleaner = (
    Lemmatizer()
    .setInputCols([f"{input_col}_without_stopwords"])
    .setOutputCol(f"{input_col}_lemmatized")
    .setDictionary(
        "./AntBNC_lemmas_ver_001.txt",
        key_delimiter="->",
        value_delimiter="\t",
    )
    .setLazyAnnotator(False)
)


# Create the cleaning pipeline; stages are listed in execution order:
# assemble -> normalize -> tokenize -> remove stop words -> lemmatize.
_cleaning_stages = [
    documentAssembler,
    doc_norm,
    tokenizer,
    Stop_words_cleaner,
    Lemmatizer_cleaner,
]
cleaning_pipeline = Pipeline().setStages(_cleaning_stages)

# Fit the pipeline on the posts, apply it to the same posts, and keep only the
# columns needed for modelling. Selecting "<input_col>_lemmatized.result"
# yields a column named "result" holding the lemmas of each post.
cleaning_model = cleaning_pipeline.fit(posts)
posts_ml = cleaning_model.transform(posts).select(
    F.col("Id"),
    F.col(input_col),
    F.col(input_col + "_lemmatized.result"),
    F.col("Splitted_tags"),
)


stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[OK!]


In [6]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

# A tag must occur at least this many times in the sampled posts to get its
# own slot in the label vector.
MIN_TAG_COUNT = 10

# Count the occurrences of every (trimmed) tag and keep the frequent ones.
all_tags = (
    posts_ml
    .select(F.explode("Splitted_tags").alias('t'))
    .select(F.trim("t").alias('t'))
    .groupby("t")
    .count()
    .filter(F.col('count') >= MIN_TAG_COUNT)
    # Sort so that the tag -> index mapping built below is identical on every
    # run; the row order of a grouped DataFrame is otherwise not guaranteed.
    .orderBy("t")
    .collect()
)

# Map each retained tag to its position in the multi-hot label vector.
match_tags = {row['t'].strip(): index for index, row in enumerate(all_tags)}
    

def map_tags(tag_list, tag_index=None):
    """Encode a list of tags as a multi-hot list of 0/1 integers.

    Parameters
    ----------
    tag_list : list of str or None
        Tags of one post. Surrounding whitespace of each tag is ignored.
        None (a post without tags) is treated like an empty list.
    tag_index : dict, optional
        Mapping tag -> position in the output vector. Defaults to the
        notebook-level ``match_tags`` dictionary.

    Returns
    -------
    list of int
        A list of length ``len(tag_index)`` with 1 at the position of every
        tag found in ``tag_index`` and 0 elsewhere. Tags that are not in the
        mapping are ignored.
    """
    if tag_index is None:
        tag_index = match_tags
    label = np.zeros(len(tag_index), dtype=int)
    for tag in tag_list or []:
        # Direct dictionary lookup instead of scanning every known tag.
        position = tag_index.get(tag.strip())
        if position is not None:
            label[position] = 1
    return label.tolist()

In [7]:
# Wrap map_tags as a Spark UDF and attach the encoded tags to every post.
# NOTE(review): the UDF is declared as StringType although map_tags returns a
# list; presumably the value ends up as a string, which is what the
# StringIndexer in the ML pipeline consumes — confirm before changing the type.
udf_map = F.udf(map_tags, returnType=StringType())
posts_ml = posts_ml.withColumn(
    "label_to_encode",
    udf_map(F.col('Splitted_tags')),
)

In [8]:
# Join the lemmas of each post (column "result") into one space-separated string.
posts_ml = posts_ml.withColumn(
    "lemma_text",
    F.concat_ws(" ", F.col("result")),
)

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.feature import StringIndexer

# ML pipeline with four stages:
# term counts -> IDF weighting -> label indexing -> logistic regression.

# TF: term counts over the lemma lists, vocabulary capped at 100 terms.
cv = CountVectorizer(
    inputCol="result",
    outputCol="tf_features",
    vocabSize=100,
    minDF=0,
)

# IDF: reweights the term counts into the "features" column.
idf = IDF(inputCol="tf_features", outputCol="features")

# Label encoder: indexes the encoded tag vector, so every distinct combination
# of tags becomes one class in the "label" column.
label_string = StringIndexer(inputCol="label_to_encode", outputCol="label")

# Multinomial logistic regression over the indexed labels.
lr = LogisticRegression(maxIter=10, regParam=0.001, family="multinomial")

_ml_stages = [cv, idf, label_string, lr]
pipeline = Pipeline(stages=_ml_stages)

# Fit the whole pipeline.
# NOTE(review): the fit uses all of posts_ml; no held-out split is visible in
# this part of the notebook.
model = pipeline.fit(posts_ml)

In [16]:
# Apply the fitted pipeline to the data it was fitted on and print the first
# rows of the result.
predictions = model.transform(posts_ml)
predictions.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+----------+
|      Id|           full_text|              result|       Splitted_tags|     label_to_encode|          lemma_text|         tf_features|            features| label|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+----------+
|63215921|Changing Google C...|[change, google, ...|[python-3.x,  goo...|[0, 0, 0, 0, 0, 0...|change google clo...|(100,[14,30,33,41...|(100,[14,30,33,41...|4003.0|[6.82200264777294...|[0.06237101207997...|       0.0|
|63219559|Configuring Ident...|[configure, ident...|[c#,  asp.net-cor...|[0, 0, 0, 0, 0, 0...|configure identit...|(100,[0,1,2,3