In [83]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from itertools import combinations
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer
from pyspark.ml.linalg import DenseVector
from pyspark.sql.types import ArrayType, StringType

spark = SparkSession \
        .builder \
        .appName("frequent_itemsets") \
        .getOrCreate()

In [23]:
df_preds = spark.read.parquet("../lab2_clustering/data_and_predictions")
df_text = spark.read.csv("../clean_mfd2+liwc.csv", header= True).select(["id", "cleaned_text"])

In [24]:
df_preds.show(5)

+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+-----+--------------------+----------+
|id_clst|Care_Virtue|Care_Vice|Fairness_Virtue|Fairness_Vice|Loyalty_Virtue|Loyalty_Vice|Authority_Virtue|Authority_Vice|Sanctity_Virtue|Sanctity_Vice|   id|            features|prediction|
+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+-----+--------------------+----------+
|     15|        0.0|      0.0|            0.0|          0.0|          3.45|         0.0|             0.0|           0.0|            0.0|          0.0|r02b5|[0.50187895826043...|         0|
|     16|        2.4|      0.0|            0.0|          0.0|           0.0|         0.0|             0.0|           0.0|            0.0|          0.0|r89qc|[-0.1435318624002...|         0|
|     20|        0.0|      0.0|            0.0|   

In [22]:
df_text.show(5)

+-----+--------------------+
|   id|        cleaned_text|
+-----+--------------------+
|hk5r2|i had an appointm...|
|iqimz|i created this si...|
|pfzt5|hello everyone   ...|
|pk714|i grew up with bo...|
|q0q8x|i have to ask  wh...|
+-----+--------------------+
only showing top 5 rows



In [25]:
df_merged = df_text.join(df_preds, on="id", how="inner")
df_merged.show(5)

[Stage 30:>                                                         (0 + 5) / 5]

+-------+--------------------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+--------------------+----------+
|     id|        cleaned_text|id_clst|Care_Virtue|Care_Vice|Fairness_Virtue|Fairness_Vice|Loyalty_Virtue|Loyalty_Vice|Authority_Virtue|Authority_Vice|Sanctity_Virtue|Sanctity_Vice|            features|prediction|
+-------+--------------------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+--------------------+----------+
|1001497|i am meeting a po...|  79758|       1.03|      0.0|            0.0|          0.0|           0.0|         0.0|             0.0|           0.0|            0.0|         0.52|[0.12984472686258...|         0|
|1001uik|hey  so basically...|  79762|        0.0|      0.0|            0.0|          0.0|           0.0|         0.0|            0.39|           0.

                                                                                

In [28]:
df0 = df_merged.filter(F.col('prediction') == 0) \
                    .show(10)



+-------+--------------------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+--------------------+----------+
|     id|        cleaned_text|id_clst|Care_Virtue|Care_Vice|Fairness_Virtue|Fairness_Vice|Loyalty_Virtue|Loyalty_Vice|Authority_Virtue|Authority_Vice|Sanctity_Virtue|Sanctity_Vice|            features|prediction|
+-------+--------------------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+--------------------+----------+
|1001497|i am meeting a po...|  79758|       1.03|      0.0|            0.0|          0.0|           0.0|         0.0|             0.0|           0.0|            0.0|         0.52|[0.12984472686258...|         0|
|1001uik|hey  so basically...|  79762|        0.0|      0.0|            0.0|          0.0|           0.0|         0.0|            0.39|           0.

                                                                                

In [29]:
df1 = df_merged.filter(F.col('prediction') == 1) \
                    .show(10)



+-------+--------------------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+--------------------+----------+
|     id|        cleaned_text|id_clst|Care_Virtue|Care_Vice|Fairness_Virtue|Fairness_Vice|Loyalty_Virtue|Loyalty_Vice|Authority_Virtue|Authority_Vice|Sanctity_Virtue|Sanctity_Vice|            features|prediction|
+-------+--------------------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+--------------------+----------+
|1006c2b|self respect is a...|  79786|       2.51|     0.22|           0.25|          0.0|          0.07|         0.0|            2.87|          0.04|           0.22|          0.0|[1.69006669331013...|         1|
|100e82k|i have started an...|  79815|        0.0|      0.0|            0.0|         1.85|          5.56|         0.0|             0.0|           0.

                                                                                

Get TF IDF scores

In [34]:
#Tokenize
tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="tokenized")
tokenized_df = tokenizer.transform(df_merged)

#Vectorize
cv = CountVectorizer(inputCol="tokenized", outputCol="vectorized")
model_cv = cv.fit(tokenized_df)
vectorized_df = model_cv.transform(tokenized_df)

#TF-IDF vectors
idf = IDF(inputCol="vectorized", outputCol="tf-idf", minDocFreq = 10)
model_idf = idf.fit(vectorized_df)
weighted_df = model_idf.transform(vectorized_df)

                                                                                

24/10/30 18:33:43 WARN DAGScheduler: Broadcasting large task binary with size 1740.1 KiB




24/10/30 18:33:55 WARN DAGScheduler: Broadcasting large task binary with size 1741.1 KiB


                                                                                

In [63]:
#Get vocab 
vocab = model_cv.vocabulary  
type(vocab)

#Get tf idf vector
tfidf_vector =model_idf.idf

#Asked ChatGPT "how can I map the TF IDF vector with it's corresponding words?" Suggested the following syntax
#word_tfidf_pairs = [(vocab[i], tfidf_vector[i]) for i in range(len(tfidf_vector)) if tfidf_vector[i] > 1]
#but I prefer to have it as a dictionary so I'll adapt the code 

#Have a dictionary so I can map numbers to words later
word_tfidf_pairs = {tfidf_vector[i]:vocab[i] for i in range(len(tfidf_vector)) if tfidf_vector[i] > 5}
len(word_tfidf_pairs)

2242

In [72]:
weighted_df.select(["tf-idf"]).show(1, truncate=False)

24/10/30 19:03:51 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [84]:
# Step 2: Function to filter words based on TF-IDF score
def filter_words_by_tfidf(tfidf_vector):
    tfidf_vector = tfidf_vector.toArray()
    return tfidf_vector[0]
                        
                        
filter_words_udf = F.udf(lambda vector, indices: filter_words_by_tfidf(vector, indices, vocab, tfidf_threshold), 
                          ArrayType(StringType()))



In [78]:
type(tfidf_vector)

pyspark.ml.linalg.DenseVector

In [80]:
array_representation = tfidf_vector.toArray()

In [81]:
array_representation

array([0.00219731, 0.18867776, 0.05604172, ..., 0.        , 0.        ,
       0.        ])

In [85]:
filtered_words_df = weighted_df.withColumn("tokenized", filter_words_udf(F.col("tf-idf"), F.col("vectorized")))

In [86]:
filtered_words_df.show(5)

[Stage 76:>                                                         (0 + 5) / 5]

24/10/30 19:14:50 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


[Stage 79:>                                                         (0 + 1) / 1]

24/10/30 19:14:51 ERROR Executor: Exception in task 0.0 in stage 79.0 (TID 198)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/scratch/local/jobs/24976501/ipykernel_2543762/357167070.py", line 7, in <lambda>
NameError: name 'tfidf_threshold' is not defined

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.s

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/scratch/local/jobs/24976501/ipykernel_2543762/357167070.py", line 7, in <lambda>
NameError: name 'tfidf_threshold' is not defined
