<a href="https://colab.research.google.com/github/muhammetsnts/SPARK/blob/main/2.ML_with_PySpark_MLlib/NLP/2.TF_IDF_and_CountVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Environment

In [1]:
# install Java8
!apt-get -q install openjdk-8-jdk-headless -qq > /dev/null

# download spark3.1.1
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

# unzip it
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

# install findspark 
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
#spark = SparkSession.builder.appName('ops').getOrCreate()

# Hashing Term Frequency (HashingTF) and Inverse Document Frequency (IDF)

In [2]:
# Feature transformers used in the TF-IDF walkthrough:
#   Tokenizer - splits a sentence column into a list of lower-cased tokens
#   HashingTF - turns token lists into term-frequency vectors by hashing
#   IDF       - Inverse Document Frequency; rescales the raw term frequencies

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [5]:
# Toy corpus: one (label, sentence) pair per row.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Hi, I heard about Saprk"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [6]:
# Preview the corpus; with the default settings long sentences are cut off with "..."
sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi, I heard about...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [7]:
# Splits each sentence into lower-cased tokens. Punctuation stays attached
# to the neighbouring word (e.g. "Hi," becomes the token "hi,").
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [10]:
# Add the "words" column (list of tokens) next to the original label and sentence columns
words_data = tokenizer.transform(sentenceData)

In [12]:
# truncate=False prints full cell contents so every token is visible
words_data.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi, I heard about Saprk            |[hi,, i, heard, about, saprk]             |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+



In [14]:
# Term-frequency stage: hashes every token in "words" to a vector index and
# counts occurrences. The vector size is left at the library default.
hashing_tf = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
)

In [15]:
# Append the raw term-frequency vectors as the "rawFeatures" column
featured_data = hashing_tf.transform(words_data)

In [16]:
# IDF estimator: reads the raw term frequencies and writes the re-weighted
# vectors to "features" once it has been fitted.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [17]:
# IDF is an estimator: fitting scans the corpus and returns a model that can transform data
idf_model = idf.fit(featured_data)

In [18]:
# Apply the fitted IDF weights; the result gains the "features" column
rescaled_data = idf_model.transform(featured_data)

In [19]:
# Display only the label and the final TF-IDF vector, untruncated.
(
    rescaled_data
    .select("label", "features")
    .show(truncate=False)
)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                      |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[18700,19036,40983,66273,233667],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                   |
|0.0  |(262144,[19036,20719,55551,58672,98717,109547,192310],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])|
|1.0 

# Count Vectorization

In [20]:
from pyspark.ml.feature import CountVectorizer

In [21]:
# Two pre-tokenized documents: (id, list of words).
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

In [22]:
# Preview the two tokenized documents
df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [23]:
# vocabSize: upper bound on how many distinct terms the vocabulary may hold.
# The corpus here only contains a, b and c, so vocabSize=3.
#
# minDF: applied while fitting; the minimum number of documents a term must
# appear in to be included in the vocabulary. With minDF=2, a term found in
# only one document is cut off and never shows up in the count vectors.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2,
)

In [24]:
# Fitting builds the vocabulary from the "words" column, subject to vocabSize and minDF
model = cv.fit(df)

In [25]:
# Add the "features" column holding each document's term counts
result = model.transform(df)

In [26]:
# Show the full sparse vectors, printed as (size, [indices], [counts])
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

