In [0]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (created if absent), named 'nlpa'.
spark = (
    SparkSession.builder
    .appName('nlpa')
    .getOrCreate()
)

In [0]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer, StandardScaler, Tokenizer, RegexTokenizer, StopWordsRemover, NGram, HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.sql.functions import year, month, dayofmonth, col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.clustering import KMeans
from pyspark.ml.recommendation import ALS

In [0]:
# Toy corpus for the tokenizer demos: two space-separated sentences and one
# comma-separated sentence, each tagged with an integer id.
# Spelling is kept consistent with the word lists used by the later n-gram
# and TF-IDF cells ("about", "regression").
sen_df = spark.createDataFrame([
  (0,'Hi I heard about spark'),
  (1,'I wish java could use case classes'),
  (2,'Logistic,regression,models,are,neat')
], ['id','sentence'])

In [0]:
# Display the raw sentences before any tokenizer is applied.
sen_df.show()

In [0]:
# Tokenizer that reads the 'sentence' column and writes its tokens to 'words'.
tzr = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [0]:
# Regex-driven tokenizer over the same 'sentence' column; the pattern \W
# matches any single non-word character.
rgxtzr = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [0]:
def _token_count(words):
    """Return the number of entries in one row's token list."""
    return len(words)


# Wrap the helper as a Spark UDF that yields an integer column.
count_tokens = udf(_token_count, IntegerType())

In [0]:
# Run the plain tokenizer over the sample sentences and display the result
# (adds the 'words' column configured on `tzr`).
tzd = tzr.transform(sen_df)
tzd.show()

In [0]:
# Append a 'tokens' column holding the length of each row's 'words' array,
# then display the augmented frame.
tzd_with_counts = tzd.withColumn('tokens', count_tokens(col('words')))
tzd_with_counts.show()

In [0]:
# Run the regex tokenizer over the same sentences and display the result,
# for comparison with the plain tokenizer's output.
rgtzd = rgxtzr.transform(sen_df)
rgtzd.show()

In [0]:
# Same per-row token count as before, this time on the regex-tokenized frame.
rgtzd_with_counts = rgtzd.withColumn('tokens', count_tokens(col('words')))
rgtzd_with_counts.show()

In [0]:
# Pre-tokenized rows (id, list of words) used as input for the stop-word
# removal demo; the word lists live in the 'tokens' column.
df = spark.createDataFrame([
  (0,['I','saw','the','green','horse']),
  (1,['Mary','had','a','little','lamb'])
], ['id','tokens'])
df.show()

In [0]:
# Stop-word filter: reads 'tokens' and writes the remaining words to 'filtered'.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [0]:
# Apply the stop-word filter and display the frame with its 'filtered' column.
filtered_df = remover.transform(df)
filtered_df.show()

In [0]:
# Already-tokenized sentences (id, words) used as input for the n-gram demo.
word_rows = [
    (0, ["Hi", "I", "heard", "about", "Spark"]),
    (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
    (2, ["Logistic", "regression", "models", "are", "neat"]),
]
wdf = spark.createDataFrame(word_rows, schema=["id", "words"])
wdf.show()

In [0]:
# N-gram builder with n=2: reads 'words' and writes the n-grams to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [0]:
# Build the n-grams and show only the 'grams' column, untruncated so the
# full lists are visible.
bigram_df = ngram.transform(wdf)
bigram_df.select('grams').show(truncate=False)

In [0]:
# Labeled sentences (label, sentence) used as the corpus for the TF-IDF demo.
labeled_rows = [
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat"),
]
sd = spark.createDataFrame(labeled_rows, schema=["label", "sentence"])

sd.show()

In [0]:
# Tokenizer for the TF-IDF section ('sentence' -> 'words').
# NOTE: this rebinds `tzr` with the same configuration as the tokenizer
# created earlier in the notebook.
tzr = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [0]:
# Tokenize the labeled sentences (adds the 'words' column) and display them.
words_data = tzr.transform(sd)
words_data.show()

In [0]:
# Hashing term-frequency transformer: reads 'words', writes 'rawFeatures'.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)

In [0]:
# Add the 'rawFeatures' column produced by `hashing_tf` to the tokenized data.
featurized_data = hashing_tf.transform(words_data)

In [0]:
# IDF estimator: reads 'rawFeatures' and writes the rescaled vectors to 'features'.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)

In [0]:
# Fit the IDF estimator on the 'rawFeatures' vectors of the featurized data.
idf_model = idf.fit(featurized_data)

In [0]:

# Apply the fitted IDF model (adds the 'features' column) and display the result.
rescaled_data = idf_model.transform(featurized_data)
rescaled_data.show()

In [0]:
# Two tiny documents (id, words) for the CountVectorizer demo; each word list
# is produced by splitting a space-separated string.
doc_rows = [
    (0, "a b c".split(" ")),
    (1, "a b b c a".split(" ")),
]
dfb = spark.createDataFrame(doc_rows, schema=["id", "words"])

In [0]:
# Count-based vectorizer: reads 'words', writes 'features', configured with a
# vocabulary size of 3 and a minimum document frequency of 2.0.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

In [0]:
# Fit the CountVectorizer on the two sample documents.
model = cv.fit(dfb)

In [0]:
# Apply the fitted vectorizer model to the same documents (adds 'features').
res = model.transform(dfb)

In [0]:
# Display the vectorized documents without truncating the column contents.
res.show(truncate=False)