In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [0]:
#tokenization in pyspark
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [0]:
sen_df = spark.createDataFrame([
    (0,'Hi I heard about spark'),
    (1, 'I wish java could use case classes'),
    (2, 'logistic,regression,models,are,neat')
    
], ['id','sentence'])


In [0]:
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|logistic,regressi...|
+---+--------------------+



In [0]:
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
regextokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern="\\W")
count_tokens = udf(lambda words: len(words), IntegerType())

In [0]:
tokenized = tokenizer.transform(sen_df)
tokenized.withColumn('tokens', count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [0]:
rg_tokenizer = regextokenizer.transform(sen_df)
rg_tokenizer.withColumn('tokens', count_tokens(col('words'))).show()


+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [0]:
from pyspark.ml.feature import StopWordsRemover
sen_df = spark.createDataFrame([
    (0,'Hi I heard about spark'),
    (1, 'I wish java could use case classes'),
    (2, 'logistic regression models are neat')
    
], ['id','sentence'])
remover = StopWordsRemover(inputCol='tokens', outputCol='filtered')
remover.transform(sen_df).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
[0;32m<command-173559009452800>[0m in [0;36m<module>[0;34m[0m
[1;32m      7[0m ], ['id','sentence'])
[1;32m      8[0m [0mremover[0m [0;34m=[0m [0mStopWordsRemover[0m[0;34m([0m[0minputCol[0m[0;34m=[0m[0;34m'tokens'[0m[0;34m,[0m [0moutputCol[0m[0;34m=[0m[0;34m'filtered'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 9[0;31m [0mremover[0m[0;34m.[0m[0mtransform[0m[0;34m([0m[0msen_df[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/ml/base.py[0m in [0;36mtransform[0;34m(self, dataset, params)[0m
[1;32m    215[0m                 [0;32mreturn[0m [0mself[0m[0;34m.[0m[0mcopy[0m[0;34m([0m[0mparams[0m[0;34m)[0m[0;34m.[0m[0m_transform[0m[0;34m([0m[0mdataset[0m[0;34m)

In [0]:
from pyspark.ml.feature import NGram
wordDataFrame = spark.createDataFrame([
    (0, ['hi','i','heard','about','sparl']),
    (1,['i','wish','java','could','use','case','classes']),
    (2,['logistics','regression','models','are','neat'])
],['id','words'])

In [0]:
ngram = NGram(n=2, inputCol='words', outputCol='grams')
ngram.transform(wordDataFrame).show()

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[hi, i, heard, ab...|[hi i, i heard, h...|
|  1|[i, wish, java, c...|[i wish, wish jav...|
|  2|[logistics, regre...|[logistics regres...|
+---+--------------------+--------------------+



In [0]:
ngram.transform(wordDataFrame).select('grams').show(truncate=False)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[hi i, i heard, heard about, about sparl]                         |
|[i wish, wish java, java could, could use, use case, case classes]|
|[logistics regression, regression models, models are, are neat]   |
+------------------------------------------------------------------+



In [0]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
sen_df = spark.createDataFrame([
    (0,'Hi I heard about spark'),
    (1, 'I wish java could use case classes'),
    (2, 'logistic regression models are neat')],['id','sentence'])

tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
word_data = tokenizer.transform(sen_df)

word_data.show(truncate=False)

+---+-----------------------------------+------------------------------------------+
|id |sentence                           |words                                     |
+---+-----------------------------------+------------------------------------------+
|0  |Hi I heard about spark             |[hi, i, heard, about, spark]              |
|1  |I wish java could use case classes |[i, wish, java, could, use, case, classes]|
|2  |logistic regression models are neat|[logistic, regression, models, are, neat] |
+---+-----------------------------------+------------------------------------------+



In [0]:
#term frequency
hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(word_data)
#idf
idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)


In [0]:
rescaled_data = idf_model.transform(featurized_data)
rescaled_data.select('id','features').show(truncate=False)

+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                                                                      |
+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |(262144,[18700,19036,33808,66273,173558],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                   |
|1  |(262144,[19036,20719,55551,58672,98717,109547,192310],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])|
|2  |(262144,[4

In [0]:
from pyspark.ml.feature import CountVectorizer
df = spark.createDataFrame([
    (0, 'a b c'.split(" ")),
    (1,'a b b c a'.split(' '))], 
['id', 'words'])
df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [0]:
cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=3, minDF=2.0)
model = cv.fit(df)
results = model.transform(df)
results.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

