In [1]:
import findspark

In [2]:
# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one machine; fall back to the original local
# install path when the variable is unset or empty.
import os

findspark.init(os.environ.get('SPARK_HOME') or '/home/oussama/spark-2.4.0-bin-hadoop2.7')

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the SparkSession for this notebook, registered under the app name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [5]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [6]:
from pyspark.sql.functions import col, udf

In [7]:
from pyspark.sql.types import IntegerType

In [8]:
# Three sample sentences: two separated by spaces and one separated by commas,
# so the two tokenizers below can be compared on both kinds of input.
sen_df = spark.createDataFrame(
    data=[
        (0, 'hi I heared about spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [9]:
# Print the input DataFrame as a text table (long strings are shown truncated).
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|hi I heared about...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [10]:
# Plain tokenizer: reads 'sentence' and writes the token array to 'words'.
# As the output further down shows, it lower-cases the text and does not split
# on commas, so the comma-separated sentence stays a single token.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [11]:
# Regex-based tokenizer: the pattern \W (any non-word character) is used as the
# separator, so commas split tokens just like spaces do.
regex_tokenizer = RegexTokenizer(
    pattern='\\W',
    inputCol='sentence',
    outputCol='words',
)

In [12]:
# Python UDF that returns the number of elements in a token array,
# declared to Spark as an integer column.
count_tokens = udf(
    f=lambda words: len(words),
    returnType=IntegerType(),
)

In [13]:
# Apply the plain tokenizer: adds the 'words' array column to the sentences.
tokenized = tokenizer.transform(sen_df)

In [15]:
# Append the per-row token count and display the result.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|hi I heared about...|[hi, i, heared, a...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [16]:
# Apply the regex tokenizer to the same sentences for comparison.
rg_tokenized = regex_tokenizer.transform(sen_df)

In [17]:
# Append the per-row token count for the regex-tokenized rows and display it.
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)


+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|hi I heared about...|[hi, i, heared, a...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [18]:
from pyspark.ml.feature import StopWordsRemover

In [19]:
# Two pre-tokenized sentences used to demonstrate stop-word removal.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [20]:
# Stop-word filter: reads the 'tokens' array and writes the remaining words
# to 'filtered'.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [21]:
# Run the stop-word filter and display the original and filtered tokens.
(
    remover
    .transform(sentenceDataFrame)
    .show()
)

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



# N-GRAMS

In [23]:
from pyspark.ml.feature import NGram

In [35]:
# Pre-tokenized versions of the sample sentences, used as n-gram input.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ['hi', 'I', 'heared', 'about', 'spark']),
        (1, ['I', 'wish', 'java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [36]:
# Bigram transformer (n=2): reads the 'words' array and writes each pair of
# consecutive words, joined by a space, to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [37]:
# Build the bigrams and print only that column, untruncated, so every
# bigram is visible.
(
    ngram
    .transform(wordDataFrame)
    .select('grams')
    .show(truncate=False)
)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[hi I, I heared, heared about, about spark]                       |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

