In [160]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell'
os.environ["PYSPARK_PYTHON"]='python3'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.4-src.zip'))
os.environ["PYSPARK_PYTHON"] = 'python3'
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


![kmeans](pics/kmeans.svg)

![kmeans_algo](pics/kmeans_algo.png)

In [162]:
spark

In [163]:
from pyspark.sql.types import *

In [164]:
import pandas as pd

In [165]:
df = pd.read_csv("/data/home/pavel.klemenkov/lectures/lecture03/toxic_comment/train.csv")
df.fillna("", inplace=True)

In [166]:
schema = StructType([
    StructField("id", StringType()),
    StructField("comment_text", StringType()),
    StructField("toxic", IntegerType()),
    StructField("severe_toxic", IntegerType()),
    StructField("obscene", IntegerType()),
    StructField("threat", IntegerType()),
    StructField("insult", IntegerType()),
    StructField("identity_hate", IntegerType())
])

In [167]:
dataset = spark.createDataFrame(df, schema=schema)

In [168]:
dataset.rdd.getNumPartitions()

2

In [169]:
dataset = dataset.repartition(4).cache()

In [170]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [171]:
from pyspark.ml.feature import *

In [172]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [173]:
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [174]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [175]:
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=200)

In [176]:
from pyspark.ml import Pipeline

In [177]:
preprocessing = Pipeline(stages=[
    tokenizer,
    swr,
    count_vectorizer
])

In [178]:
preprocessing_model = preprocessing.fit(dataset)

In [179]:
preprocessed_dataset = preprocessing_model.transform(dataset)

In [180]:
preprocessed_dataset.select(["word_vector"]).take(5)

[Row(word_vector=SparseVector(200, {0: 6.0, 1: 1.0, 70: 1.0, 78: 1.0, 101: 1.0, 179: 1.0})),
 Row(word_vector=SparseVector(200, {0: 2.0, 2: 1.0, 6: 1.0, 15: 1.0, 19: 2.0, 20: 1.0, 22: 1.0, 33: 1.0, 64: 1.0, 66: 1.0, 69: 1.0, 80: 1.0, 89: 1.0, 108: 1.0, 109: 1.0, 114: 1.0, 128: 1.0, 133: 1.0, 152: 1.0, 157: 1.0})),
 Row(word_vector=SparseVector(200, {0: 5.0, 1: 2.0, 59: 1.0, 69: 1.0, 87: 1.0, 100: 1.0, 194: 3.0, 198: 2.0})),
 Row(word_vector=SparseVector(200, {0: 3.0, 4: 1.0, 66: 1.0, 67: 1.0, 83: 1.0, 90: 1.0, 151: 1.0})),
 Row(word_vector=SparseVector(200, {0: 8.0, 1: 2.0, 2: 5.0, 3: 1.0, 4: 2.0, 9: 1.0, 10: 1.0, 11: 1.0, 14: 3.0, 21: 1.0, 31: 1.0, 46: 1.0, 55: 1.0, 59: 4.0, 63: 1.0, 65: 1.0, 67: 1.0, 69: 3.0, 100: 1.0, 106: 1.0, 108: 1.0, 121: 6.0, 160: 2.0, 168: 2.0, 171: 2.0}))]

In [181]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [182]:
from pyspark.ml.clustering import KMeans

In [196]:
kmeans = KMeans(featuresCol="word_vector", k=6, seed=5757)

In [197]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [None]:
kmeans.fitMultiple()

In [198]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [199]:
clustering[clustering.columns[2:8] + ["prediction"]].take(10)

[Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=1, severe_toxic=0, obscene=1, threat=0, insult=1, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0)]

In [187]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [188]:
evaluator = ClusteringEvaluator(featuresCol="word_vector")

In [200]:
evaluator.evaluate(clustering)

0.9531213553637657

In [63]:
clustering.filter(clustering.prediction == 1)[["comment_text"]].take(5)

[Row(comment_text='"Contents of the library (objects and functions to be used outside, situation\nlate August 2004)\n\nClasses:\nPage: A MediaWiki page\n    __init__               Page(Site, Title) - the page with title Title on wikimedia site Site\n    title                  The name of the page, in a form suitable for an interwiki link\n    urlname                The name of the page, in a form suitable for a URL\n    titleWithoutNamespace  The name of the page, with the namespace part removed\n    section                The section of the page (the part of the name after \'#\')\n    sectionFreeTitle       The name without the section part\n    aslink                 The name of the page in the form Title or lang:Title\n    site                   The wiki this page is in\n    encoding               The encoding of the page\n    isAutoTitle            If the title is a well known, auto-translatable title\n    autoFormat             Returns (dictName, value), where value can be a year,

In [220]:
kmeans = KMeans(featuresCol="word_vector", k=2, seed=1234)

In [221]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [222]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [223]:
evaluator.evaluate(clustering)

0.9994799602125465

In [207]:
kmeans_model.clusterCenters()

[array([3.06700217, 0.51181359, 0.2445131 , 0.17984859, 0.17241574,
        0.16505183, 0.15437886, 0.14628796, 0.14061619, 0.14033416,
        0.11887542, 0.11762826, 0.10257455, 0.10253694, 0.09565561,
        0.09482834, 0.09417029, 0.09082363, 0.08224389, 0.08131009,
        0.07925446, 0.07138291, 0.07052431, 0.06881338, 0.06504055,
        0.0631416 , 0.06251488, 0.06223286, 0.06039032, 0.05929357,
        0.05810281, 0.05765784, 0.05692458, 0.05540793, 0.05475614,
        0.05465587, 0.05432371, 0.05344004, 0.05312042, 0.05281333,
        0.05280706, 0.05275692, 0.0525125 , 0.05196726, 0.05142202,
        0.05116506, 0.05008711, 0.04930372, 0.04917838, 0.04870207,
        0.04813176, 0.04769306, 0.04764919, 0.04754892, 0.04740477,
        0.0460824 , 0.04578784, 0.04533661, 0.04529274, 0.04521753,
        0.04462215, 0.04422106, 0.04403931, 0.04393903, 0.04333739,
        0.04328725, 0.04289242, 0.04227824, 0.04098094, 0.04091826,
        0.04075532, 0.04057984, 0.04027901, 0.04

In [208]:
import numpy as np

In [82]:
np.max(kmeans_model.clusterCenters()[0])

3.0109564999373197

In [86]:
np.argsort(-kmeans_model.clusterCenters()[0])

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  41,  42,  43,  44,  45,  40,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  87,  86,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 125, 124, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 137, 136, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
       157, 158, 159, 153, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 175, 174, 176, 177, 178, 179, 18

In [209]:
np.argsort(-kmeans_model.clusterCenters()[1])

array([  0,   3,   7, 194, 174,  36,   1,  56,  13, 117, 109,  91,  86,
        55,  99,  12, 135, 166,  49,  20,  54,  18,  17,  10,  19,  63,
        65,  71,   6,  75,  79,  80,  82,  85,   8, 101,  72,  15,   5,
        11, 134, 142, 151, 152, 164, 165, 167,   4, 180, 181, 186,   2,
       114, 129, 199,  26,  27,  39,  41,  42,  43,  30,  23,  22,  45,
        50,  33, 140, 132, 160, 159, 158, 133, 157, 156, 155, 154, 153,
        32,  29, 136, 137, 150, 149, 148, 161, 138, 147, 146, 145, 144,
       143,  31, 141, 139, 162,  52,  28, 197, 196, 195,  16, 193, 192,
       191, 190, 189, 188, 187,  21, 185, 184, 163, 183, 179, 178, 177,
       176, 175,  24, 173, 172, 171, 170, 169, 168,  25, 131, 182, 130,
       127, 128,  40,  90,  89,  88,  87,   9,  84,  83,  81,  44,  78,
        77,  76,  74,  73,  46,  47,  51,  57,  58,  59,  60,  61,  92,
        62,  48,  66,  67,  68,  69,  70,  64,  34,  93,  95,  53, 126,
       125, 124, 123, 122, 121, 120, 119, 118,  35, 116, 115,  3

In [210]:
preprocessing_model.stages[2].vocabulary

['',
 '"',
 'article',
 'page',
 'please',
 'like',
 'one',
 '-',
 'wikipedia',
 'talk',
 'think',
 'see',
 'also',
 'know',
 'may',
 'edit',
 'people',
 'use',
 'get',
 'even',
 'make',
 'articles',
 'good',
 'want',
 'time',
 'it.',
 'need',
 'new',
 'thank',
 'go',
 'first',
 'information',
 'many',
 'made',
 'find',
 'page.',
 'name',
 'really',
 'thanks',
 'say',
 'fuck',
 'much',
 'used',
 'since',
 'article.',
 'user',
 'add',
 'way',
 'take',
 'help',
 'sources',
 'look',
 'someone',
 'still',
 'read',
 'section',
 'pages',
 'going',
 'two',
 'deletion',
 'you.',
 'source',
 'edits',
 'without',
 'discussion',
 'well',
 'editing',
 'wikipedia.',
 'point',
 'deleted',
 'back',
 'might',
 'work',
 'something',
 'image',
 'another',
 'added',
 'never',
 'put',
 'link',
 'seems',
 'stop',
 ',',
 'blocked',
 'feel',
 '.',
 'list',
 'block',
 'right',
 'said',
 '(utc)',
 'using',
 'ask',
 'personal',
 'fact',
 'sure',
 'article,',
 'believe',
 'hope',
 'page,',
 'note',
 'actually',


In [224]:
for i in np.argsort(-kmeans_model.clusterCenters()[1])[:20]:
    print(preprocessing_model.stages[2].vocabulary[i])


|
page
-
"
get
know
(talk)
article
name
section
way
one
make
u
edit
people
like
come
take


In [225]:
for i in np.argsort(-kmeans_model.clusterCenters()[0])[:20]:
    print(preprocessing_model.stages[2].vocabulary[i])


"
article
page
please
like
one
-
wikipedia
talk
think
see
also
know
may
edit
people
use
get
even


## The curse of dimensionality
![curse](pics/dimensionality_vs_performance.png)

## Why is that?
![curse](pics/curseofdimensionality.png)

## LDA

In [226]:
from pyspark.ml.clustering import LDA

In [242]:
lda = LDA(featuresCol="word_vector", seed=5757, k=6)

In [243]:
lda_model = lda.fit(preprocessed_dataset)

In [244]:
topics = lda_model.transform(preprocessed_dataset)

In [245]:
topics.take(5)

[Row(id='6fdb7b6734f8bf40', comment_text='"\n\n""Katara""\nI\'ve removed the section entirely. I don\'t care if you like to pretend that Katara and Zuko are meant for each other. It\'s still not case, and there has been no indication whatsoever. Thus, there\'s little point to actually have the section and exempt Toph and Sokka beyond the insane delusions of shippers.  "', toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, words=['"', '', '""katara""', "i've", 'removed', 'the', 'section', 'entirely.', 'i', "don't", 'care', 'if', 'you', 'like', 'to', 'pretend', 'that', 'katara', 'and', 'zuko', 'are', 'meant', 'for', 'each', 'other.', "it's", 'still', 'not', 'case,', 'and', 'there', 'has', 'been', 'no', 'indication', 'whatsoever.', 'thus,', "there's", 'little', 'point', 'to', 'actually', 'have', 'the', 'section', 'and', 'exempt', 'toph', 'and', 'sokka', 'beyond', 'the', 'insane', 'delusions', 'of', 'shippers.', '', '"'], words_filtered=['"', '', '""katara""', 'remove

In [246]:
lda_model.vocabSize()

200

In [247]:
spark

In [248]:
lda_model.describeTopics().take(1)

AttributeError: 'NoneType' object has no attribute 'setCallSite'

In [130]:
for i in [214, 211, 5, 463, 582, 334, 131, 751, 700, 1355]:
    print(preprocessing_model.stages[-1].vocabulary[i])

nigger
fucking
like
moron
sucks
redirect
hi
dick
jews
fucksex


In [131]:
for i in [0, 2, 1, 3, 4, 10, 7, 14, 8, 11]:
    print(preprocessing_model.stages[-1].vocabulary[i])


article
"
page
please
think
-
may
wikipedia
see


In [132]:
for i in [40, 0, 379, 257, 249, 29, 359, 567, 474, 153]:
    print(preprocessing_model.stages[-1].vocabulary[i])

fuck

fat
shit
suck
go
gay
jew
ass
|


In [133]:
for i in [0, 105, 1107, 175, 13, 18, 36, 57, 8, 1]:
    print(preprocessing_model.stages[-1].vocabulary[i])


•
tacos
u
know
get
name
going
wikipedia
"


## Clustering is a good dimensionality reduction technique

In [137]:
topics

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int, words: array<string>, words_filtered: array<string>, word_vector: vector, topicDistribution: vector]

In [139]:
from pyspark.sql import functions as f

In [140]:
target = f.when(
    (topics.toxic == 0) &
    (topics.severe_toxic == 0) &
    (topics.obscene == 0) &
    (topics.threat == 0) &
    (topics.insult == 0) &
    (topics.identity_hate == 0),
    0
).otherwise(1)

In [144]:
new_dataset = topics.withColumn("target", target)[["id", "target", "topicDistribution"]].cache()

In [145]:
new_dataset.take(5)

[Row(id='6fdb7b6734f8bf40', target=0, topicDistribution=DenseVector([0.0052, 0.9694, 0.0053, 0.0082, 0.0056, 0.0063])),
 Row(id='39b742437bd11ec9', target=0, topicDistribution=DenseVector([0.0025, 0.0031, 0.0025, 0.0039, 0.0027, 0.9853])),
 Row(id='9bbb8e1922fe1efb', target=0, topicDistribution=DenseVector([0.0665, 0.0836, 0.0674, 0.1029, 0.0708, 0.6088])),
 Row(id='54f9e59924682c6e', target=0, topicDistribution=DenseVector([0.009, 0.0113, 0.0091, 0.9501, 0.0096, 0.0108])),
 Row(id='62e38775721eb79e', target=1, topicDistribution=DenseVector([0.0127, 0.0159, 0.4886, 0.0196, 0.0135, 0.4497]))]

In [146]:
from pyspark.ml.classification import LogisticRegression

In [147]:
lr = LogisticRegression(featuresCol="topicDistribution", labelCol="target")

In [150]:
train = new_dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()

In [151]:
test = new_dataset.join(train, on="id", how="leftanti").cache()

In [153]:
lr_model = lr.fit(train)

In [154]:
predictions = lr_model.transform(test)

In [155]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [156]:
evaluator = BinaryClassificationEvaluator(labelCol="target")

In [157]:
evaluator.evaluate(predictions)

0.8381544862513701

## Last time with CountVectorizer with 20k words in vocabulary we got 0.8275751487175559