In [1]:
### Imports for TF-IDF 
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.linalg import SparseVector, VectorUDT, DenseVector
from pyspark.sql.functions import concat_ws, collect_list, udf



In [2]:

import time
# def main():

# Initialize SparkSession
print("about to create spark sessions")
spark = SparkSession.builder \
    .appName("test-td-idf")\
    .master("spark://spark-master:7077")\
    .config("spark.executor.instances", 1)\
    .config("spark.cores.max", 2)\
    .getOrCreate()

# Load data from JSON file
import os
print("about to print directory")
print(os.listdir())

print("about to load data")
data = spark.read.json("/opt/data/*.json")
print("read file")
data.show()
print("loaded data correctly")


about to create spark sessions


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/17 01:57:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


about to print directory
['.ipynb_checkpoints', 'Dockerfile', 'process_data.py', 'TF_IDF.ipynb', 'Untitled.ipynb']
about to load data


                                                                                

read file


                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------+----------------+------------------+----------------------+------------------+--------+--------------------+--------------------+----+------+-------------+-------------+---------+------------+--------------------+--------------------+--------------------+----------+
|             account|         application|                card|             content|          created_at|edited_at|emojis|favourites_count|                id|in_reply_to_account_id|    in_reply_to_id|language|   media_attachments|            mentions|poll|reblog|reblogs_count|replies_count|sensitive|spoiler_text|                tags|                 uri|                 url|visibility|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------+----------------+------------------+----------------------+------------------+--------+----------

In [3]:
data.printSchema()

root
 |-- account: struct (nullable = true)
 |    |-- acct: string (nullable = true)
 |    |-- avatar: string (nullable = true)
 |    |-- avatar_static: string (nullable = true)
 |    |-- bot: boolean (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- discoverable: boolean (nullable = true)
 |    |-- display_name: string (nullable = true)
 |    |-- emojis: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- shortcode: string (nullable = true)
 |    |    |    |-- static_url: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |    |-- visible_in_picker: boolean (nullable = true)
 |    |-- fields: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- verified_at: string (nullable = true)
 |    |-- followers_count: long (nullable = true)
 |    

In [4]:


from pyspark.sql.functions import regexp_replace
# Remove html tags
data = data.withColumn("content", regexp_replace("content", "<[^>]*>", ""))



In [5]:
# type(data.select('account').show(1))
# data.select('account').collect()

In [6]:
# Use concat_ws() to combine the array of strings into a single column
data = data.withColumn("content", concat_ws(" ", "content"))
data.show()
print("before group by")
# Use groupBy() and concat_ws() to combine the strings for rows with the same ID
data = data.groupBy("account").agg(concat_ws(" ", collect_list("content")).alias("combined_content"))
data.show()

                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------+----------------+------------------+----------------------+------------------+--------+--------------------+--------------------+----+------+-------------+-------------+---------+------------+--------------------+--------------------+--------------------+----------+
|             account|         application|                card|             content|          created_at|edited_at|emojis|favourites_count|                id|in_reply_to_account_id|    in_reply_to_id|language|   media_attachments|            mentions|poll|reblog|reblogs_count|replies_count|sensitive|spoiler_text|                tags|                 uri|                 url|visibility|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------+----------------+------------------+----------------------+------------------+--------+----------

                                                                                

+--------------------+--------------------+
|             account|    combined_content|
+--------------------+--------------------+
|{20thCenturyFoxes...|Would you fuck my...|
|{2blv7@queer.part...|Klingt nach 90er ...|
|{435d0addfa045737...|Good Friday, whos...|
|{718_louis@mstdn....|Whenever you're f...|
|{82341f882b6eabcd...|curl https://nost...|
|{AITA@botsin.spac...|AITA for calling ...|
|{AirplayAccess, h...|Country music has...|
|{Alexandros2112@m...|#Yankees fans get...|
|{Ally_SMMiller@ma...|https://open.subs...|
|{Arcana@akko.disq...|Rust users or as ...|
|{ArnoldSchiller@m...|Nur, bitte...Ich ...|
|{BennyOtt, https:...|Ob man zum Finale...|
|{BinroHeretic@mst...|"And the #knowled...|
|{BloodyHell@thebl...|Wordle 694 5/6⬜⬜?...|
|{Bossito@mastodon...|I love Filomena C...|
|{BraveGods@mastod...|Because of me?I'm...|
|{CORDIS_EU@respub...|RT @ProjectPerfor...|
|{Captn_Steve@troe...|Colorfull #Dive #...|
|{CatBuddha@mstdn....|                    |
|{CheComunitario, ...|cof foco f

In [8]:

# Tokenize content column
tokenizer = Tokenizer(inputCol="combined_content", outputCol="words")
data = tokenizer.transform(data)
data.show()

# return

# import time

# if __name__ == '__main__':
#     print("in the main file")
#     while True:
#         print("in the while loop")
#         main()
#         time.sleep(300)

IllegalArgumentException: Output column words already exists.

In [9]:

print("after tokenization")
# Compute Term Frequencies
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
data = hashingTF.transform(data)
data.show()


after tokenization


IllegalArgumentException: requirement failed: Column rawFeatures already exists.

In [10]:

print("after term frequencies")
# Compute Inverse Document Frequencies
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(data)
data = idfModel.transform(data)


after term frequencies


IllegalArgumentException: requirement failed: Column features already exists.

In [11]:

print("after idfModel")
# Convert sparse vectors to dense vectors

data.show()
print("After convert to dense vector")


after idfModel


23/05/17 02:16:55 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/17 02:17:37 WARN TaskSetManager: Lost task 0.0 in stage 29.0 (TID 44) (192.168.32.5 executor 3): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:85)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:514)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(Buffere

+--------------------+--------------------+--------------------+--------------------+--------------------+
|             account|    combined_content|               words|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|{20thCenturyFoxes...|Would you fuck my...|[would, you, fuck...|(262144,[19876,44...|[0.0,0.0,0.0,0.0,...|
|{2blv7@queer.part...|Klingt nach 90er ...|[klingt, nach, 90...|(262144,[26497,10...|[0.0,0.0,0.0,0.0,...|
|{435d0addfa045737...|Good Friday, whos...|[good, friday,, w...|(262144,[83997,10...|[0.0,0.0,0.0,0.0,...|
|{718_louis@mstdn....|Whenever you're f...|[whenever, you're...|(262144,[3530,139...|[0.0,0.0,0.0,0.0,...|
|{82341f882b6eabcd...|curl https://nost...|[curl, https://no...|(262144,[15873,46...|[0.0,0.0,0.0,0.0,...|
|{AITA@botsin.spac...|AITA for calling ...|[aita, for, calli...|(262144,[9420,198...|[0.0,0.0,0.0,0.0,...|
|{AirplayAccess, h...|Country music h

In [12]:

print("after drop combined content")
to_dense = lambda v: DenseVector(v.toArray()) if isinstance(v, SparseVector) else v
to_dense_udf = udf(to_dense, VectorUDT())
data = data.withColumn("features", to_dense_udf("features"))
data.show()



after drop combined content


23/05/17 02:18:24 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------------+
|             account|    combined_content|               words|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|{20thCenturyFoxes...|Would you fuck my...|[would, you, fuck...|(262144,[19876,44...|[0.0,0.0,0.0,0.0,...|
|{2blv7@queer.part...|Klingt nach 90er ...|[klingt, nach, 90...|(262144,[26497,10...|[0.0,0.0,0.0,0.0,...|
|{435d0addfa045737...|Good Friday, whos...|[good, friday,, w...|(262144,[83997,10...|[0.0,0.0,0.0,0.0,...|
|{718_louis@mstdn....|Whenever you're f...|[whenever, you're...|(262144,[3530,139...|[0.0,0.0,0.0,0.0,...|
|{82341f882b6eabcd...|curl https://nost...|[curl, https://no...|(262144,[15873,46...|[0.0,0.0,0.0,0.0,...|
|{AITA@botsin.spac...|AITA for calling ...|[aita, for, calli...|(262144,[9420,198...|[0.0,0.0,0.0,0.0,...|
|{AirplayAccess, h...|Country music h

In [None]:
print("after withColumn")
data.printSchema()
data.show(5)
# Write to file
data.write.parquet(path="/opt/warehouse/tf_idf3.parquet",mode="overwrite")
#spark.stop()

after withColumn
root
 |-- account: struct (nullable = true)
 |    |-- acct: string (nullable = true)
 |    |-- avatar: string (nullable = true)
 |    |-- avatar_static: string (nullable = true)
 |    |-- bot: boolean (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- discoverable: boolean (nullable = true)
 |    |-- display_name: string (nullable = true)
 |    |-- emojis: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- shortcode: string (nullable = true)
 |    |    |    |-- static_url: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |    |-- visible_in_picker: boolean (nullable = true)
 |    |-- fields: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- value: string (nullable = true)
 |    |    |    |-- verified_at: string (nullable = true)
 |    |-- followers_count: long (nullab

23/05/17 02:19:03 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------------+
|             account|    combined_content|               words|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|{20thCenturyFoxes...|Would you fuck my...|[would, you, fuck...|(262144,[19876,44...|[0.0,0.0,0.0,0.0,...|
|{2blv7@queer.part...|Klingt nach 90er ...|[klingt, nach, 90...|(262144,[26497,10...|[0.0,0.0,0.0,0.0,...|
|{435d0addfa045737...|Good Friday, whos...|[good, friday,, w...|(262144,[83997,10...|[0.0,0.0,0.0,0.0,...|
|{718_louis@mstdn....|Whenever you're f...|[whenever, you're...|(262144,[3530,139...|[0.0,0.0,0.0,0.0,...|
|{82341f882b6eabcd...|curl https://nost...|[curl, https://no...|(262144,[15873,46...|[0.0,0.0,0.0,0.0,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



23/05/17 02:19:14 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
23/05/17 02:23:51 WARN TaskSetManager: Lost task 0.0 in stage 38.0 (TID 59) (192.168.32.5 executor 4): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to file:/opt/warehouse/tf_idf3.parquet.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:788)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:420)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala

In [None]:

from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.ml.linalg import VectorUDT

In [None]:
schema = StructType([
    StructField("account", StringType(), True),
    StructField("combined_content", StringType(), True),
    StructField("words", ArrayType(StringType()), True),
    StructField("rawFeatures", VectorUDT(), True),
    StructField("features", VectorUDT(), True)
])
tfidf = spark.read.schema(schema).parquet('/opt/warehouse/tf_idf3.parquet')
tfidf_df=tfidf.toPandas()
tfidf_df.head()


In [None]:
users_list = tfidf_df[['account']].to_dict('records')
print(users_list)

In [None]:
tfidf = spark.read.schema(schema).parquet('/opt/warehouse/tf_idf3.parquet')
tfidf_df=tfidf.toPandas()
users_list = tfidf_df[['account']].to_dict('records')