In [78]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, NaiveBayes
from pyspark.ml.classification import LogisticRegressionModel, NaiveBayesModel, RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer,StopWordsRemover,RegexTokenizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [None]:
#spark.stop()

In [15]:
# Start a Spark session named 'GSK', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('GSK')
    .getOrCreate()
)

In [31]:
# Load the semicolon-separated product file; the first row is the header and
# column types are inferred from the values.
data = spark.read.csv(
    "Dataset_N.csv",
    sep=';',
    header=True,
    inferSchema=True,
)

In [32]:
# Echo the DataFrame repr to see the column names and inferred types.
data

DataFrame[V1: int, V2: string, V3: string, V4: string, V5: string]

In [33]:
# Print a preview of the freshly loaded rows.
data.show()

+--------+---------------+--------------------+--------------------+-------------------+
|      V1|             V2|                  V3|                  V4|                 V5|
+--------+---------------+--------------------+--------------------+-------------------+
|26229701|WASHINGMACHINES|           WAQ284E25|      WASCHMASCHINEN|              BOSCH|
|16576864|     USB MEMORY|LEEF IBRIDGE MOBI...|PC__1100COMPUTING...|               LEEF|
|26155618|     USB MEMORY|SANDISK 32GB ULTR...|               W1370|               null|
|25646138|       BICYCLES|HOLLANDRAD DAMEN ...|FAHRRAEDER // SPO...|SCHALOW & KROH GMBH|
|19764614|       BICYCLES|DAHON SPEED D7 SC...|SPORTS__30000WHEE...|              DAHON|
|64836708|     USB MEMORY|PNY 16GB LEGO USB...| COMPONENT __ MEMORY|                PNY|
|25637375|       BICYCLES|CITYBIKE COLORS 2...|FAHRRAEDER // FAH...|     TRENDMAXX GMBH|
|49459632|       BICYCLES|126 CITY BIKE 28 ...|             1160005|               null|
|21250597|WASHINGMACH

In [34]:
# Inspect the inferred schema of the raw columns V1..V5.
data.printSchema()

root
 |-- V1: integer (nullable = true)
 |-- V2: string (nullable = true)
 |-- V3: string (nullable = true)
 |-- V4: string (nullable = true)
 |-- V5: string (nullable = true)



In [35]:
# Replace the generic CSV headers V1..V5 with descriptive column names.
for old_name, new_name in [
    ("V1", "ID"),
    ("V2", "product_group"),
    ("V3", "main_text"),
    ("V4", "add_text"),
    ("V5", "manufacturer"),
]:
    data = data.withColumnRenamed(old_name, new_name)

In [36]:
#data = data.withColumn('joined_description', 
#                    sf.concat(sf.col('main_text'),sf.lit(' '), sf.col('add_text'),sf.lit(' '), sf.col('manufacturer')))

In [37]:
# Keep the target (product_group) and the three descriptive columns.
# ID is dropped because it does not add any value for the classification.
data = data.select(['product_group', 'main_text', 'add_text', 'manufacturer'])

In [38]:
# Confirm the renamed columns and that ID is no longer present.
data.printSchema()

root
 |-- product_group: string (nullable = true)
 |-- main_text: string (nullable = true)
 |-- add_text: string (nullable = true)
 |-- manufacturer: string (nullable = true)



In [39]:
# Number of rows without a manufacturer.
data.where(sf.col("manufacturer").isNull()).count()

1344

In [40]:
# Number of rows without a main_text description.
data.where(sf.col("main_text").isNull()).count()

2

In [41]:
# Replace NULLs with placeholder strings, because NULL values cause a lot of
# issues in the later text-processing stages.
# NOTE(review): add_text is tokenized later as well but is neither checked nor
# filled here -- confirm it contains no NULLs.
data = data.fillna({
    'manufacturer': "NO_Manufacturer",
    'main_text': "NO_TEXT",
})

In [42]:
# Text-feature stages for `main_text`:
# tokenize -> remove stop words -> hashed term counts -> IDF weighting.
regexTokenizer = RegexTokenizer(inputCol="main_text", outputCol="main_text_t")
stopwordsRemover = StopWordsRemover(inputCol="main_text_t", outputCol="main_text_f")
hashingTF = HashingTF(inputCol="main_text_f", outputCol="rawFeatures1")
idf = IDF(inputCol="rawFeatures1", outputCol="tf_idf")

# The same four stages for `add_text`, writing to separate output columns.
regexTokenizer2 = RegexTokenizer(inputCol="add_text", outputCol="add_text_t")
stopwordsRemover2 = StopWordsRemover(inputCol="add_text_t", outputCol="add_text_f")
hashingTF2 = HashingTF(inputCol="add_text_f", outputCol="rawFeatures2")
idf2 = IDF(inputCol="rawFeatures2", outputCol="tf_idf2")
# NOTE(review): the IDF outputs (tf_idf, tf_idf2) are not selected later in the
# notebook; only rawFeatures1/rawFeatures2 reach the models -- confirm intent.

In [43]:
# Chain the stages in order: the four main_text stages, then the four add_text stages.
data_prep_pipe = Pipeline(stages=[
    regexTokenizer, stopwordsRemover, hashingTF, idf,
    regexTokenizer2, stopwordsRemover2, hashingTF2, idf2,
])

In [44]:
# Fit the preprocessing pipeline on the full dataset (this happens before the
# train/test split further down).
data_transformer = data_prep_pipe.fit(data)

In [45]:
# Apply the fitted pipeline; `data` now also carries the token and feature columns.
data = data_transformer.transform(data)

In [46]:
# Keep the original text columns, the target, the manufacturer and the two
# hashed term-frequency vectors.
# NOTE(review): tf_idf / tf_idf2 produced by the pipeline are dropped here, so
# the IDF weighting is never used downstream -- confirm this is intended.
data = data.select(
    "main_text", "add_text", "product_group",
    "rawFeatures1", "rawFeatures2", "manufacturer",
)

In [47]:
# Indexer that maps each product_group string to a numeric `label` column.
product_group_Y = StringIndexer(
    inputCol="product_group",
    outputCol="label",
)

In [48]:
# Fit the label indexer on the product_group values present in `data`.
datat=product_group_Y.fit(data)

In [49]:
# Add the numeric `label` column to `data`.
data=datat.transform(data)

In [50]:
# Indexer that maps each manufacturer string to a numeric `manufacturer_C` column.
manufacturer_Y = StringIndexer(
    inputCol="manufacturer",
    outputCol="manufacturer_C",
)

In [51]:
# Fit the manufacturer indexer on the manufacturer values present in `data`.
datat2=manufacturer_Y.fit(data)

In [52]:
# Add the numeric `manufacturer_C` column to `data`.
data=datat2.transform(data)

In [53]:
# Preview the rows with the new `label` and `manufacturer_C` columns.
data.show()

+--------------------+--------------------+---------------+--------------------+--------------------+-------------------+-----+--------------+
|           main_text|            add_text|  product_group|        rawFeatures1|        rawFeatures2|       manufacturer|label|manufacturer_C|
+--------------------+--------------------+---------------+--------------------+--------------------+-------------------+-----+--------------+
|           WAQ284E25|      WASCHMASCHINEN|WASHINGMACHINES|(262144,[4112],[1...|(262144,[120109],...|              BOSCH|  2.0|           8.0|
|LEEF IBRIDGE MOBI...|PC__1100COMPUTING...|     USB MEMORY|(262144,[22790,45...|(262144,[86422],[...|               LEEF|  1.0|          64.0|
|SANDISK 32GB ULTR...|               W1370|     USB MEMORY|(262144,[45908,12...|(262144,[258930],...|    NO_Manufacturer|  1.0|           0.0|
|HOLLANDRAD DAMEN ...|FAHRRAEDER // SPO...|       BICYCLES|(262144,[15657,47...|(262144,[58789,14...|SCHALOW & KROH GMBH|  0.0|          43.0|

In [54]:
from pyspark.ml.feature import VectorAssembler

In [55]:
# Concatenate both hashed text vectors and the manufacturer index into a
# single `features` vector.
# NOTE(review): manufacturer_C is a category index that enters the vector as
# one numeric value; OneHotEncoder is imported but unused -- consider encoding
# it if the models should not treat the index as an ordered quantity.
clean_up = VectorAssembler(
    inputCols=['rawFeatures1', 'rawFeatures2', 'manufacturer_C'],
    outputCol='features',
)

In [56]:
# Build the assembled `features` column; the result is kept under a new name.
dataout=clean_up.transform(data)

In [57]:
# Show a single row to check the assembled `features` vector.
dataout.show(1)

+---------+--------------+---------------+--------------------+--------------------+------------+-----+--------------+--------------------+
|main_text|      add_text|  product_group|        rawFeatures1|        rawFeatures2|manufacturer|label|manufacturer_C|            features|
+---------+--------------+---------------+--------------------+--------------------+------------+-----+--------------+--------------------+
|WAQ284E25|WASCHMASCHINEN|WASHINGMACHINES|(262144,[4112],[1...|(262144,[120109],...|       BOSCH|  2.0|           8.0|(524289,[4112,382...|
+---------+--------------+---------------+--------------------+--------------------+------------+-----+--------------+--------------------+
only showing top 1 row



In [58]:
# Modelling frame: the readable text columns (for inspecting predictions),
# the assembled features and the numeric label.
datax = dataout.select(
    "main_text",
    "add_text",
    "product_group",
    "features",
    'label',
)

In [59]:
# Preview the modelling frame.
datax.show()

+--------------------+--------------------+---------------+--------------------+-----+
|           main_text|            add_text|  product_group|            features|label|
+--------------------+--------------------+---------------+--------------------+-----+
|           WAQ284E25|      WASCHMASCHINEN|WASHINGMACHINES|(524289,[4112,382...|  2.0|
|LEEF IBRIDGE MOBI...|PC__1100COMPUTING...|     USB MEMORY|(524289,[22790,45...|  1.0|
|SANDISK 32GB ULTR...|               W1370|     USB MEMORY|(524289,[45908,12...|  1.0|
|HOLLANDRAD DAMEN ...|FAHRRAEDER // SPO...|       BICYCLES|(524289,[15657,47...|  0.0|
|DAHON SPEED D7 SC...|SPORTS__30000WHEE...|       BICYCLES|(524289,[33633,42...|  0.0|
|PNY 16GB LEGO USB...| COMPONENT __ MEMORY|     USB MEMORY|(524289,[27536,29...|  1.0|
|CITYBIKE COLORS 2...|FAHRRAEDER // FAH...|       BICYCLES|(524289,[52859,61...|  0.0|
|126 CITY BIKE 28 ...|             1160005|       BICYCLES|(524289,[538,8227...|  0.0|
|AEG WASCHMASCHINE...|                  GG|

In [67]:
# 70/30 train/test split; the seed is fixed for reproducibility.
trainingData, testData = datax.randomSplit([0.7, 0.3], seed=100)

# Report the size of each partition.
train_rows = trainingData.count()
test_rows = testData.count()
print("Training Dataset Count: " + str(train_rows))
print("Test Dataset Count: " + str(test_rows))

Training Dataset Count: 5664
Test Dataset Count: 2336


In [68]:
# Fit a regularised logistic regression on the training split and score the test split.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Show ten test rows predicted as class 0, ordered by the probability column (descending).
(
    predictions
    .where(sf.col("prediction") == 0)
    .select("main_text", "add_text", "product_group", "probability", "label", "prediction")
    .orderBy(sf.col("probability").desc())
    .show(n=10, truncate=30)
)

+------------------------------+------------------------------+-------------+------------------------------+-----+----------+
|                     main_text|                      add_text|product_group|                   probability|label|prediction|
+------------------------------+------------------------------+-------------+------------------------------+-----+----------+
|1712738000 KS CYCLING KINDE...|SPORT&FITNESS_FAHRRAEDER_KI...|     BICYCLES|[0.9768269565207385,0.00719...|  0.0|       0.0|
|PROPHETE ALU-CITY 26  GENIE...|  SPORT & FREIZEIT__CITYRAEDER|     BICYCLES|[0.9678626535927078,0.00999...|  0.0|       0.0|
|FUJI SPORTIF 2.1 TRIPLE LTD...|                         1_5_1|     BICYCLES|[0.9677948292952667,0.00990...|  0.0|       0.0|
|ZUENDAPP SILVER 4.0 ALU TRE...|FAHRRAEDER // FAHRRAEDER TR...|     BICYCLES|[0.9672373288787075,0.01380...|  0.0|       0.0|
|PROPHETE ENTDECKER 6.0 TREK...|FAHRRAEDER // FAHRRAEDER TR...|     BICYCLES|[0.9664650623860626,0.01012...|  0.0|    

In [70]:
# Persist the fitted logistic-regression model.
# A plain save() raises when the target path already exists, which breaks
# re-running the notebook; write().overwrite() replaces the previous copy.
lrModel.write().overwrite().save("lg_mode_path")

In [73]:
# Reload the persisted model to confirm the saved artefact can be read back.
# It gets its own name: it used to be bound to `evaluatorlg`, which the next
# cell rebinds to the metric evaluator, so the loaded model was discarded
# under a misleading name.
lrModelLoaded = LogisticRegressionModel.load("lg_mode_path")

In [74]:
# Score the logistic-regression predictions on the test split.
# The label column and metric are the evaluator's defaults, written out so the
# reported number is self-describing.
evaluatorlg = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)
evaluatorlg.evaluate(predictions)

0.9970045111946206

In [82]:
# Fit a Naive Bayes classifier on the training split and score the test split.
# Note: `predictions` is rebound here and now holds the Naive Bayes output.
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)

# Show ten test rows predicted as class 0, ordered by the probability column (descending).
(
    predictions
    .where(sf.col("prediction") == 0)
    .select("main_text", "add_text", "product_group", "probability", "label", "prediction")
    .orderBy(sf.col("probability").desc())
    .show(n=10, truncate=30)
)

+------------------------------+------------------------------+-------------+------------------------------+-----+----------+
|                     main_text|                      add_text|product_group|                   probability|label|prediction|
+------------------------------+------------------------------+-------------+------------------------------+-----+----------+
|207566  NOSTALGIERAD 28 NOS...|112100__FASHION & SPORT_SPO...|     BICYCLES|[1.0,1.0612026993848946E-16...|  0.0|       0.0|
|298426  TREKKING 26 D KCP T...|112100__FASHION & SPORT_SPO...|     BICYCLES|[1.0,9.204533569805521E-17,...|  0.0|       0.0|
|   425012  MTB 29 MTB 29 ROT 2|112100__FASHION & SPORT_SPO...|     BICYCLES|[1.0,8.453498697369825E-17,...|  0.0|       0.0|
|755902  MTB 275 MTB 27,5 BL...|112100__FASHION & SPORT_SPO...|     BICYCLES|[1.0,6.234894889116862E-17,...|  0.0|       0.0|
|216064  MTB 26 EVA MTB 26 E...|112100__FASHION & SPORT_SPO...|     BICYCLES|[1.0,5.664901629937955E-17,...|  0.0|    

In [85]:
# Persist the fitted Naive Bayes model.
# A plain save() raises when the target path already exists, which breaks
# re-running the notebook; write().overwrite() replaces the previous copy.
model.write().overwrite().save("nb_mode_path")

In [86]:
# Reload the persisted model to confirm the saved artefact can be read back.
# It gets its own name: it used to be bound to `evaluatornb`, which the next
# cell rebinds to the metric evaluator, so the loaded model was discarded
# under a misleading name.
nbModelLoaded = NaiveBayesModel.load("nb_mode_path")

In [87]:
# Score the Naive Bayes predictions on the test split.
# The label column and metric are the evaluator's defaults, written out so the
# reported number is self-describing.
evaluatornb = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)
evaluatornb.evaluate(predictions)

0.8485700461258683

In [None]:
# Random-forest classifier on the same train/test split.
# The seed is pinned (same value as the split) so the forest's random sampling,
# and therefore the fitted trees and predictions, are reproducible between runs.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=3,
    maxBins=624,
    seed=100,
)
# Train model with Training Data
rfModel = rf.fit(trainingData)
# Note: `predictions` is rebound here and now holds the random-forest output.
predictions = rfModel.transform(testData)
# Show ten test rows predicted as class 0, ordered by the probability column (descending).
predictions.filter(predictions['prediction'] == 0) \
    .select("main_text","add_text","product_group","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)