In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell'
os.environ["PYSPARK_PYTHON"]='python3'
os.environ["SPARK_HOME"]='/opt/cloudera/parcels/SPARK2/lib/spark2/'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.6-src.zip'))
os.environ["PYSPARK_PYTHON"] = 'python3'
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0.cloudera2
      /_/

Using Python version 3.4.3 (default, Nov 17 2016 01:08:31)
SparkSession available as 'spark'.


In [2]:
spark

![kmeans](pics/kmeans.svg)

![kmeans_algo](pics/kmeans_algo.png)

In [18]:
from pyspark.sql.types import *

In [19]:
import pandas as pd

In [20]:
df = pd.read_csv("/home/pklemenkov/hsu/lectures/lecture03/toxic_comment/train.csv")
df.fillna("", inplace=True)

In [21]:
schema = StructType([
    StructField("id", StringType()),
    StructField("comment_text", StringType()),
    StructField("toxic", IntegerType()),
    StructField("severe_toxic", IntegerType()),
    StructField("obscene", IntegerType()),
    StructField("threat", IntegerType()),
    StructField("insult", IntegerType()),
    StructField("identity_hate", IntegerType())
])

In [22]:
dataset = spark.createDataFrame(df, schema=schema)

In [23]:
dataset.rdd.getNumPartitions()

2

In [24]:
dataset = dataset.repartition(4).cache()

In [25]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [26]:
from pyspark.ml.feature import *

In [27]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [28]:
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [29]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [30]:
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=200)

In [31]:
from pyspark.ml import Pipeline

In [32]:
preprocessing = Pipeline(stages=[
    tokenizer,
    swr,
    count_vectorizer
])

In [33]:
preprocessing_model = preprocessing.fit(dataset)

In [34]:
preprocessed_dataset = preprocessing_model.transform(dataset)

In [35]:
preprocessed_dataset.select(["word_vector"]).take(5)

[Row(word_vector=SparseVector(200, {0: 2.0, 1: 2.0, 5: 1.0, 53: 1.0, 55: 2.0, 68: 1.0, 101: 1.0, 115: 1.0, 129: 1.0})),
 Row(word_vector=SparseVector(200, {0: 1.0, 2: 2.0, 12: 1.0, 14: 1.0, 52: 1.0, 142: 1.0, 166: 1.0, 178: 1.0})),
 Row(word_vector=SparseVector(200, {})),
 Row(word_vector=SparseVector(200, {0: 2.0, 2: 1.0, 3: 1.0, 57: 1.0, 62: 1.0, 70: 1.0})),
 Row(word_vector=SparseVector(200, {12: 1.0, 24: 1.0, 153: 1.0}))]

In [181]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [36]:
from pyspark.ml.clustering import KMeans

In [42]:
kmeans = KMeans(featuresCol="word_vector", k=7, seed=5757)

In [43]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [44]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [45]:
clustering[clustering.columns[2:8] + ["prediction"]].take(10)

[Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=1, severe_toxic=1, obscene=1, threat=0, insult=1, identity_hate=1, prediction=0),
 Row(toxic=1, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=1, threat=0, insult=1, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0)]

In [46]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [47]:
evaluator = ClusteringEvaluator(featuresCol="word_vector")

In [48]:
evaluator.evaluate(clustering)

0.5023566031104862

In [63]:
clustering.filter(clustering.prediction == 1)[["comment_text"]].take(5)

[Row(comment_text='"Contents of the library (objects and functions to be used outside, situation\nlate August 2004)\n\nClasses:\nPage: A MediaWiki page\n    __init__               Page(Site, Title) - the page with title Title on wikimedia site Site\n    title                  The name of the page, in a form suitable for an interwiki link\n    urlname                The name of the page, in a form suitable for a URL\n    titleWithoutNamespace  The name of the page, with the namespace part removed\n    section                The section of the page (the part of the name after \'#\')\n    sectionFreeTitle       The name without the section part\n    aslink                 The name of the page in the form Title or lang:Title\n    site                   The wiki this page is in\n    encoding               The encoding of the page\n    isAutoTitle            If the title is a well known, auto-translatable title\n    autoFormat             Returns (dictName, value), where value can be a year,

In [49]:
kmeans = KMeans(featuresCol="word_vector", k=2, seed=1234)

In [50]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [51]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [52]:
evaluator.evaluate(clustering)

0.9955064024878033

In [53]:
kmeans_model.clusterCenters()

[array([1.35142132e+02, 5.78680203e-01, 9.61928934e-01, 1.08375635e+00,
        7.41116751e-01, 1.34263959e+00, 8.29949239e-01, 2.03045685e+00,
        7.10659898e-01, 7.00507614e-01, 6.14213198e-01, 7.10659898e-01,
        5.05076142e-01, 1.45177665e+00, 4.13705584e-01, 4.03553299e-01,
        4.77157360e-01, 4.01015228e-01, 4.21319797e-01, 3.37563452e-01,
        4.49238579e-01, 2.96954315e-01, 4.23857868e-01, 2.71573604e-01,
        4.31472081e-01, 3.65482234e-01, 4.61928934e-01, 5.00000000e-01,
        2.81725888e-01, 2.28426396e-01, 4.89847716e-01, 3.04568528e-01,
        3.09644670e-01, 2.51269036e-01, 2.20812183e-01, 3.29949239e-01,
        2.05583756e-01, 2.30964467e-01, 4.26395939e-01, 2.30964467e-01,
        1.01522843e+00, 2.91878173e-01, 1.97969543e-01, 2.28426396e-01,
        2.05583756e-01, 4.97461929e-01, 1.87817259e-01, 2.58883249e-01,
        2.51269036e-01, 3.19796954e-01, 1.72588832e-01, 2.33502538e-01,
        2.46192893e-01, 2.53807107e-01, 2.18274112e-01, 3.020304

In [54]:
import numpy as np

In [60]:
np.max(kmeans_model.clusterCenters()[0])

135.14213197969542

In [56]:
np.argsort(-kmeans_model.clusterCenters()[0])

array([  0,   7,  13,   5,   3,  40,   2, 105,   6, 153,   4,  11,   8,
         9,  10,   1, 116,  12,  27, 103,  45,  30,  16,  26,  20,  24,
        38,  22,  18,  14,  15,  17,  25,  76,  19,  35,  79,  49,  32,
        31,  55,  21, 110,  41, 113,  28, 127, 106,  23,  47,  53,  33,
        48,  86,  52,  87,  88,  70,  51,  39,  37,  71,  29,  43,  85,
        56,  34,  90,  54,  72,  59,  78, 120,  60,  75,  44,  36, 140,
       115,  58, 123,  42,  64,  89,  69, 149, 136, 159, 164,  46, 148,
        65, 128, 124,  73, 135, 194,  57, 134,  95,  91,  63,  50, 117,
       109, 175, 156, 102,  77, 142,  61, 169, 133, 114,  97, 130,  99,
       107, 101,  81, 189, 138,  74, 181,  68, 188, 152, 119, 104,  80,
       132, 155,  96,  84, 122, 172, 129, 186, 154, 157, 165, 199, 112,
       179, 193,  83, 111, 143, 192, 146, 145, 167, 180, 178, 168,  93,
       198,  67, 190, 162, 108, 177,  98, 196, 197, 183,  62, 174,  82,
        94, 131, 161, 137,  92, 118, 166, 163, 139, 100, 150, 17

In [209]:
np.argsort(-kmeans_model.clusterCenters()[1])

array([  0,   3,   7, 194, 174,  36,   1,  56,  13, 117, 109,  91,  86,
        55,  99,  12, 135, 166,  49,  20,  54,  18,  17,  10,  19,  63,
        65,  71,   6,  75,  79,  80,  82,  85,   8, 101,  72,  15,   5,
        11, 134, 142, 151, 152, 164, 165, 167,   4, 180, 181, 186,   2,
       114, 129, 199,  26,  27,  39,  41,  42,  43,  30,  23,  22,  45,
        50,  33, 140, 132, 160, 159, 158, 133, 157, 156, 155, 154, 153,
        32,  29, 136, 137, 150, 149, 148, 161, 138, 147, 146, 145, 144,
       143,  31, 141, 139, 162,  52,  28, 197, 196, 195,  16, 193, 192,
       191, 190, 189, 188, 187,  21, 185, 184, 163, 183, 179, 178, 177,
       176, 175,  24, 173, 172, 171, 170, 169, 168,  25, 131, 182, 130,
       127, 128,  40,  90,  89,  88,  87,   9,  84,  83,  81,  44,  78,
        77,  76,  74,  73,  46,  47,  51,  57,  58,  59,  60,  61,  92,
        62,  48,  66,  67,  68,  69,  70,  64,  34,  93,  95,  53, 126,
       125, 124, 123, 122, 121, 120, 119, 118,  35, 116, 115,  3

In [210]:
preprocessing_model.stages[2].vocabulary

['',
 '"',
 'article',
 'page',
 'please',
 'like',
 'one',
 '-',
 'wikipedia',
 'talk',
 'think',
 'see',
 'also',
 'know',
 'may',
 'edit',
 'people',
 'use',
 'get',
 'even',
 'make',
 'articles',
 'good',
 'want',
 'time',
 'it.',
 'need',
 'new',
 'thank',
 'go',
 'first',
 'information',
 'many',
 'made',
 'find',
 'page.',
 'name',
 'really',
 'thanks',
 'say',
 'fuck',
 'much',
 'used',
 'since',
 'article.',
 'user',
 'add',
 'way',
 'take',
 'help',
 'sources',
 'look',
 'someone',
 'still',
 'read',
 'section',
 'pages',
 'going',
 'two',
 'deletion',
 'you.',
 'source',
 'edits',
 'without',
 'discussion',
 'well',
 'editing',
 'wikipedia.',
 'point',
 'deleted',
 'back',
 'might',
 'work',
 'something',
 'image',
 'another',
 'added',
 'never',
 'put',
 'link',
 'seems',
 'stop',
 ',',
 'blocked',
 'feel',
 '.',
 'list',
 'block',
 'right',
 'said',
 '(utc)',
 'using',
 'ask',
 'personal',
 'fact',
 'sure',
 'article,',
 'believe',
 'hope',
 'page,',
 'note',
 'actually',


In [57]:
for i in np.argsort(-kmeans_model.clusterCenters()[1])[:20]:
    print(preprocessing_model.stages[2].vocabulary[i])


"
article
page
please
like
one
-
wikipedia
talk
think
see
also
know
may
edit
people
use
get
even


In [59]:
for i in np.argsort(-kmeans_model.clusterCenters()[0])[:40]:
    print(preprocessing_model.stages[2].vocabulary[i])


-
know
like
page
fuck
article
•
one
|
please
see
wikipedia
talk
think
"
must
also
new
keep
user
first
people
need
make
time
thanks
good
get
may
edit
use
it.
added
even
page.
link
help
many
information


## The curse of dimensionality
![curse](pics/dimensionality_vs_performance.png)

## Why is that?
![curse](pics/curseofdimensionality.png)

## LDA

In [61]:
from pyspark.ml.clustering import LDA

In [105]:
lda = LDA(featuresCol="word_vector", seed=5757, k=6)

In [106]:
lda_model = lda.fit(preprocessed_dataset)

In [107]:
topics = lda_model.transform(preprocessed_dataset)

In [25]:
topics.take(5)

[Row(id='26e1b63617df36b1', comment_text='"\n\n charlie wilson \n\ni didnt notice the music genres that were reverted. However my intention was to revert his alias that you deleted.  His alias a.k.a is actually ""Uncle Charlie"" and needs to be put back and shouldn\'t  have been removed."', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, words=['"', '', '', 'charlie', 'wilson', '', '', 'i', 'didnt', 'notice', 'the', 'music', 'genres', 'that', 'were', 'reverted.', 'however', 'my', 'intention', 'was', 'to', 'revert', 'his', 'alias', 'that', 'you', 'deleted.', '', 'his', 'alias', 'a.k.a', 'is', 'actually', '""uncle', 'charlie""', 'and', 'needs', 'to', 'be', 'put', 'back', 'and', "shouldn't", '', 'have', 'been', 'removed."'], words_filtered=['"', '', '', 'charlie', 'wilson', '', '', 'didnt', 'notice', 'music', 'genres', 'reverted.', 'however', 'intention', 'revert', 'alias', 'deleted.', '', 'alias', 'a.k.a', 'actually', '""uncle', 'charlie""', 'needs', 'put', 'back

In [66]:
lda_model.vocabSize()

200

In [69]:
lda_model.describeTopics(maxTermsPerTopic=10).collect()

[Row(topic=0, termIndices=[2, 0, 3, 9, 4, 59, 14, 121, 69, 8], termWeights=[0.08926104104533482, 0.08049662990931651, 0.041972549586315355, 0.03592724599917715, 0.031607872010297106, 0.022460090874317162, 0.01975179985182996, 0.017614269436307835, 0.016819445851956244, 0.016460579858474857]),
 Row(topic=1, termIndices=[0, 1, 2, 9, 3, 105, 27, 7, 12, 90], termWeights=[0.5629140781243558, 0.06147194524571807, 0.010492120700474976, 0.009842360425537242, 0.008670961466459667, 0.008579475782924026, 0.006931165646587559, 0.006835693977168162, 0.006690663121360881, 0.006096005573499908]),
 Row(topic=2, termIndices=[0, 85, 131, 4, 45, 3, 62, 153, 9, 14], termWeights=[0.1041518502593904, 0.04009007838347178, 0.03658567901600966, 0.03560774729181142, 0.035367207683024045, 0.034888935583241465, 0.027656443714736762, 0.026402463835583072, 0.023601424238417826, 0.019444576171549457]),
 Row(topic=3, termIndices=[5, 6, 16, 0, 20, 10, 12, 22, 119, 41], termWeights=[0.05093547865439432, 0.0476213744872

In [74]:
for i in [0, 4, 40, 8, 17, 74, 28, 1, 15, 126]:
    print(preprocessing_model.stages[-1].vocabulary[i])


please
fuck
wikipedia
use
image
thank
"
edit
copyright


In [131]:
for i in [0, 2, 1, 3, 4, 10, 7, 14, 8, 11]:
    print(preprocessing_model.stages[-1].vocabulary[i])


article
"
page
please
think
-
may
wikipedia
see


In [132]:
for i in [40, 0, 379, 257, 249, 29, 359, 567, 474, 153]:
    print(preprocessing_model.stages[-1].vocabulary[i])

fuck

fat
shit
suck
go
gay
jew
ass
|


In [133]:
for i in [0, 105, 1107, 175, 13, 18, 36, 57, 8, 1]:
    print(preprocessing_model.stages[-1].vocabulary[i])


•
tacos
u
know
get
name
going
wikipedia
"


## Clustering is a good dimensionality reduction technique

In [75]:
topics

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int, words: array<string>, words_filtered: array<string>, word_vector: vector, topicDistribution: vector]

In [92]:
from pyspark.sql import functions as f

In [93]:
target = f.when(
    (topics.toxic == 0) &
    (topics.severe_toxic == 0) &
    (topics.obscene == 0) &
    (topics.threat == 0) &
    (topics.insult == 0) &
    (topics.identity_hate == 0),
    0
).otherwise(1)

In [108]:
new_dataset = topics.withColumn("target", target)[["id", "target", "topicDistribution"]].cache()

In [109]:
new_dataset.take(5)

[Row(id='6fdb7b6734f8bf40', target=0, topicDistribution=DenseVector([0.012, 0.0115, 0.0112, 0.3808, 0.573, 0.0116])),
 Row(id='39b742437bd11ec9', target=0, topicDistribution=DenseVector([0.5958, 0.015, 0.3392, 0.0164, 0.0185, 0.0151])),
 Row(id='9bbb8e1922fe1efb', target=0, topicDistribution=DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0])),
 Row(id='54f9e59924682c6e', target=0, topicDistribution=DenseVector([0.6195, 0.0188, 0.0183, 0.0205, 0.3041, 0.0189])),
 Row(id='62e38775721eb79e', target=1, topicDistribution=DenseVector([0.0393, 0.0378, 0.0367, 0.3939, 0.4542, 0.038]))]

In [110]:
from pyspark.ml.classification import LogisticRegression

In [111]:
lr = LogisticRegression(featuresCol="topicDistribution", labelCol="target")

In [112]:
train = new_dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()

In [113]:
test = new_dataset.join(train, on="id", how="leftanti").cache()

In [114]:
lr_model = lr.fit(train)

In [115]:
predictions = lr_model.transform(test)

In [116]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [117]:
evaluator = BinaryClassificationEvaluator(labelCol="target")

In [118]:
evaluator.evaluate(predictions)

0.6069737271395673

## Last time with CountVectorizer with 20k words in vocabulary we got 0.8275751487175559

In [119]:
spark.stop()