In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 65.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=d825e020facee9235b2906a9d44c93b473a5ec5f93ba7f1872cb66e136895c9f
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
from pyspark.sql import SparkSession
# Configure the application name, then obtain the session (getOrCreate
# reuses an already-running session instead of starting a second one).
session_builder = SparkSession.builder.appName("practice")
spark = session_builder.getOrCreate()

In [None]:
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import HashingTF , IDF ,Tokenizer , StopWordsRemover
from pyspark.sql.types import IntegerType


In [None]:
from pyspark import SparkFiles
# Remote CSV holding the airline tweets used throughout this notebook.
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-online/v2/module_17/airlines.csv"

# Register the file with Spark, then look up the path it was stored under.
spark.sparkContext.addFile(url)
airlines_csv_path = SparkFiles.get("airlines.csv")

# Parse the CSV using its first row as column names and preview the result.
df = spark.read.csv(airlines_csv_path, sep=",", header=True)
df.show()

+--------------------+
|      Airline Tweets|
+--------------------+
|@VirginAmerica pl...|
|@VirginAmerica se...|
|@VirginAmerica do...|
|@VirginAmerica Ar...|
|@VirginAmerica aw...|
+--------------------+



In [None]:
# Tokenize DataFrame: turn each "Airline Tweets" string into a list of
# tokens stored in a new "words" column.
separater = Tokenizer(
    inputCol="Airline Tweets",
    outputCol="words",
)
new_df = separater.transform(df)
new_df.show()

+--------------------+--------------------+
|      Airline Tweets|               words|
+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|
|@VirginAmerica se...|[@virginamerica, ...|
|@VirginAmerica do...|[@virginamerica, ...|
|@VirginAmerica Ar...|[@virginamerica, ...|
|@VirginAmerica aw...|[@virginamerica, ...|
+--------------------+--------------------+



In [None]:
# Remove stop words from the "words" token lists.
# The output column name ("filterd word") is spelled exactly as the
# following cells reference it.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filterd word",
)
new_filtered_df = remover.transform(new_df)
new_filtered_df.show()

+--------------------+--------------------+--------------------+
|      Airline Tweets|               words|        filterd word|
+--------------------+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|[@virginamerica, ...|
|@VirginAmerica se...|[@virginamerica, ...|[@virginamerica, ...|
|@VirginAmerica do...|[@virginamerica, ...|[@virginamerica, ...|
|@VirginAmerica Ar...|[@virginamerica, ...|[@virginamerica, ...|
|@VirginAmerica aw...|[@virginamerica, ...|[@virginamerica, ...|
+--------------------+--------------------+--------------------+



In [None]:
# Hashing term frequency: map each filtered token list to a sparse
# term-frequency vector. No numFeatures is passed, so the library default
# vector size applies.
hashing = HashingTF(
    inputCol="filterd word",
    outputCol="hasing freq",
)
new_filtered_df_with_hf = hashing.transform(new_filtered_df)
new_filtered_df_with_hf.show()

+--------------------+--------------------+--------------------+--------------------+
|      Airline Tweets|               words|        filterd word|         hasing freq|
+--------------------+--------------------+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|[@virginamerica, ...|(262144,[1419,999...|
|@VirginAmerica se...|[@virginamerica, ...|[@virginamerica, ...|(262144,[30053,44...|
|@VirginAmerica do...|[@virginamerica, ...|[@virginamerica, ...|(262144,[107065,1...|
|@VirginAmerica Ar...|[@virginamerica, ...|[@virginamerica, ...|(262144,[9641,506...|
|@VirginAmerica aw...|[@virginamerica, ...|[@virginamerica, ...|(262144,[6122,505...|
+--------------------+--------------------+--------------------+--------------------+



In [None]:
# IDF - inverse document frequency.
# IDF is an estimator: it is first fitted on the term-frequency column,
# and the fitted model is then used to transform that same DataFrame.
idf = IDF(
    inputCol="hasing freq",
    outputCol="idf",
)
idfModel = idf.fit(new_filtered_df_with_hf)

# Append the rescaled vectors as the "idf" column and preview them.
new_filtered_df_with_hfidf = idfModel.transform(new_filtered_df_with_hf)
new_filtered_df_with_hfidf.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|      Airline Tweets|               words|        filterd word|         hasing freq|                 idf|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|[@virginamerica, ...|(262144,[1419,999...|(262144,[1419,999...|
|@VirginAmerica se...|[@virginamerica, ...|[@virginamerica, ...|(262144,[30053,44...|(262144,[30053,44...|
|@VirginAmerica do...|[@virginamerica, ...|[@virginamerica, ...|(262144,[107065,1...|(262144,[107065,1...|
|@VirginAmerica Ar...|[@virginamerica, ...|[@virginamerica, ...|(262144,[9641,506...|(262144,[9641,506...|
|@VirginAmerica aw...|[@virginamerica, ...|[@virginamerica, ...|(262144,[6122,505...|(262144,[6122,505...|
+--------------------+--------------------+--------------------+--------------------+--------------------+

