In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType, FloatType, StringType

spark = SparkSession \
        .builder \
        .appName("frequent_itemsets") \
        .getOrCreate()

24/10/30 22:22:36 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Load data

In [2]:
df_preds = spark.read.parquet("lab2_clustering/data_and_predictions").select(["id", "prediction"])
df_scores = spark.read.csv("clean_mfd2+liwc.csv", header= True).select(["id",'emo_pos',
       'emo_anx', 'emo_anger', 'emo_sad', 'Care_Virtue', 'Care_Vice',
       'Fairness_Virtue', 'Fairness_Vice', 'Loyalty_Virtue', 'Loyalty_Vice',
       'Authority_Virtue', 'Authority_Vice', 'Sanctity_Virtue',
       'Sanctity_Vice'])


scores_cols = ['emo_pos',
       'emo_anx', 'emo_anger', 'emo_sad', 'Care_Virtue', 'Care_Vice',
       'Fairness_Virtue', 'Fairness_Vice', 'Loyalty_Virtue', 'Loyalty_Vice',
       'Authority_Virtue', 'Authority_Vice', 'Sanctity_Virtue',
       'Sanctity_Vice']

df_scores = df_scores.select(*(F.col(c).cast("float").alias(c) for c in scores_cols), "id").dropna()

In [3]:
#Check they're ok
df_preds.show(5)
df_scores.show(5)

+-----+----------+
|   id|prediction|
+-----+----------+
|r02b5|         0|
|r89qc|         0|
|rrhg8|         0|
|rtji7|         0|
|s0ruk|         0|
+-----+----------+
only showing top 5 rows

+-------+-------+---------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+-----+
|emo_pos|emo_anx|emo_anger|emo_sad|Care_Virtue|Care_Vice|Fairness_Virtue|Fairness_Vice|Loyalty_Virtue|Loyalty_Vice|Authority_Virtue|Authority_Vice|Sanctity_Virtue|Sanctity_Vice|   id|
+-------+-------+---------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+-----+
|    0.0|    0.0|      0.0|    0.0|        0.0|      0.0|            0.0|          0.0|           0.0|         0.0|             0.0|           0.0|            0.0|          0.0|hk5r2|
|   2.56|    0.0|      0.0|    0.0|       2.56|      0.0|           

Create one merged dataframe and one additional dataframe for each cluster obtained in Lab2. Recall that cluster 1 exhibited stronger moral language and had more negative words on their top 100 words

In [4]:
#Merge dataframes 
df = df_preds.join(df_scores, on="id", how="inner")
df.show(5)

+-----+----------+-------+-------+---------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+
|   id|prediction|emo_pos|emo_anx|emo_anger|emo_sad|Care_Virtue|Care_Vice|Fairness_Virtue|Fairness_Vice|Loyalty_Virtue|Loyalty_Vice|Authority_Virtue|Authority_Vice|Sanctity_Virtue|Sanctity_Vice|
+-----+----------+-------+-------+---------+-------+-----------+---------+---------------+-------------+--------------+------------+----------------+--------------+---------------+-------------+
|hk5r2|         0|    0.0|    0.0|      0.0|    0.0|        0.0|      0.0|            0.0|          0.0|           0.0|         0.0|             0.0|           0.0|            0.0|          0.0|
|iqimz|         0|   2.56|    0.0|      0.0|    0.0|       2.56|      0.0|            0.0|          0.0|          0.85|         0.0|             0.0|           0.0|            0.0|          0.0|
|pfzt5|         0|   2.06

In [5]:
#Create df for clusters obtained in lab2 
df0 = df.filter(F.col('prediction') == 0) 
df1 = df.filter(F.col('prediction') == 1)

I asked ChatGPT: I need to create ONE new column that takes the names of all of the columns where the value for a given row is greater than 1


I then asked: What if I want that instead of the names, each column is represented as an integer? (e.g. emo pos would be 1, emo_neg 2)

It showed me how to do it for a dataframe and then I generalized it myself on a function to use it in the three dataframes


In [7]:
column_mapping = {
    'emo_pos': 1,
    'emo_anx': 3,
    'emo_anger': 4,
    'emo_sad': 5,
    'Care_Virtue': 6,
    'Care_Vice': 7,
    'Fairness_Virtue': 8,
    'Fairness_Vice': 9,
    'Loyalty_Virtue': 10,
    'Loyalty_Vice': 11,
    'Authority_Virtue': 12,
    'Authority_Vice': 13,
    'Sanctity_Virtue': 14,
    'Sanctity_Vice': 15
}


def create_items(input_df):
    # Create a new column 'positive_columns' with the integers where values are greater than 1
        df = input_df.withColumn(
        "positive_columns",
        F.array([
        F.when(F.col(col) > 1, F.lit(value)).otherwise(None) 
        for col, value in column_mapping.items()]))

        # Filter out None values from the array
        df = df.withColumn("relevant", F.expr("filter(positive_columns, x -> x is not null)"))
        
        return df 
    
df_all = create_items(df)
df_all.select("relevant").show(5)

df0 = create_items(df0)
df0.select("relevant").show(5)

df1 = create_items(df1)
df1.select("relevant").show(5)

+--------+
|relevant|
+--------+
|      []|
|  [1, 6]|
|  [1, 6]|
|  [1, 6]|
|  [1, 7]|
+--------+
only showing top 5 rows

+--------+
|relevant|
+--------+
|      []|
|  [1, 6]|
|  [1, 6]|
|  [1, 6]|
|  [1, 7]|
+--------+
only showing top 5 rows

+--------------------+
|            relevant|
+--------------------+
|[1, 6, 7, 10, 12,...|
|                  []|
|            [10, 13]|
|       [4, 6, 9, 12]|
|          [6, 7, 15]|
+--------------------+
only showing top 5 rows



Now, I can look for association rules. First try in the entire dataset

In [13]:
from pyspark.ml.fpm import FPGrowth

fp = FPGrowth(minConfidence=0.5, minSupport=0.001)
fpm = fp.fit(df_all.select(df_all.relevant.alias('items')))
fpm.associationRules.orderBy("lift", "confidence", ascending=False).show(truncate=False)



+-----------+----------+------------------+------------------+---------------------+
|antecedent |consequent|confidence        |lift              |support              |
+-----------+----------+------------------+------------------+---------------------+
|[10, 14, 1]|[6]       |0.5485254691689008|1.7098797514928963|0.0019627518433175304|
|[5, 14, 1] |[6]       |0.5395833333333333|1.6820050622400846|0.001490770461640001 |
|[7, 14, 1] |[6]       |0.5333723653395784|1.662644049841184 |0.0017478660110090618|
|[12, 14, 1]|[6]       |0.5262515262515263|1.6404467604632673|0.0016538534593741067|
|[12, 5, 1] |[6]       |0.525560538116592 |1.6382927918932462|0.001124313372613952 |
|[5, 7, 1]  |[6]       |0.5234254992319508|1.6316373857859796|0.0026150838342539526|
|[10, 7, 1] |[6]       |0.5193452380952381|1.6189182755702969|0.0013391992049224205|
|[12, 7, 1] |[6]       |0.5176752546434991|1.6137125539585304|0.001657690706379615 |
|[5, 10, 1] |[6]       |0.5158311345646438|1.6079640085228304|0.0

                                                                                

I see there are a lot of rules, I'll put a higher threshold of support to filter even more 

In [12]:
fp = FPGrowth(minConfidence=0.5, minSupport=0.01)
fpm = fp.fit(df_all.select(df_all.relevant.alias('items')))
fpm.associationRules.orderBy("lift", "confidence", ascending=False).show(truncate=False)



+----------+----------+------------------+------------------+--------------------+
|antecedent|consequent|confidence        |lift              |support             |
+----------+----------+------------------+------------------+--------------------+
|[10, 6]   |[1]       |0.6025974025974026|1.302039567264673 |0.015579222842363974|
|[14, 6]   |[1]       |0.6006894390473206|1.297917007120209 |0.0183880876503961  |
|[5, 6]    |[1]       |0.5903799692510433|1.2756412098226868|0.01547177992620974 |
|[6]       |[1]       |0.5862310259446657|1.2666765373498938|0.18806155711646236 |
|[4, 6]    |[1]       |0.5828057619965135|1.2592755277046546|0.012187096489494577|
|[3, 6]    |[1]       |0.5797829256439333|1.2527440482799832|0.020600260549071676|
|[12, 6]   |[1]       |0.572869509967437 |1.2378061465948011|0.01383903132536593 |
|[7, 6]    |[1]       |0.5581903915806739|1.2060887962216578|0.015671316770496175|
|[15, 6]   |[1]       |0.5528386393899044|1.1945251999025532|0.017524707074156716|
+---

                                                                                

I see (the first time running this) that there are a lot of associations with confidence of 1, which makes sense since the emo_neg scores includes the summation of emo_anx, emo_sad and emo_anger. I think it would be better to drop the emo_neg. I'll do it again from the beginning so what I'm describing here, will dissapear. 

I can see now that Care_Virtue is somehow (not to strongly) associated with positive emotions, even if there are parts of sad emotions in the text as well. To observe other types of association, It's worthy to go back to the previous run with support of 0.001

Now, I'll see if there's any negative association

In [19]:
fp = FPGrowth(minConfidence=0.5, minSupport=0.001)
fpm = fp.fit(df_all.select(df_all.relevant.alias('items')))
fpm.associationRules.orderBy("lift", "confidence", ascending=True).show(5,truncate=False)



+----------+----------+------------------+------------------+---------------------+
|antecedent|consequent|confidence        |lift              |support              |
+----------+----------+------------------+------------------+---------------------+
|[4, 14]   |[1]       |0.5027004570004154|1.086191016880091 |0.0023215344383325626|
|[13, 6]   |[1]       |0.5033492822966508|1.0875929416217165|0.0010091959624487009|
|[10, 3]   |[1]       |0.5081018518518519|1.0978618767023802|0.003369102870836347 |
|[5, 14]   |[1]       |0.511727078891258 |1.1056949490410244|0.002762817843966025 |
|[8, 5]    |[1]       |0.5161887141535615|1.1153352588418677|0.0010705919145368348|
+----------+----------+------------------+------------------+---------------------+
only showing top 5 rows



                                                                                

There's not. Now, I'll see the associations in each individual cluster

In [None]:
#Cluster 0 - less moral language, using the same support as above

fp = FPGrowth(minConfidence=0.5, minSupport=0.001)
fpm = fp.fit(df0.select(df0.relevant.alias('items')))
fpm.associationRules.orderBy("lift", "confidence", ascending=False).show(truncate=False)

                                                                                

In [18]:
#Cluster 1 - more moral language
fp = FPGrowth(minConfidence=0.5, minSupport=0.001)
fpm = fp.fit(df1.select(df1.relevant.alias('items')))
fpm.associationRules.orderBy("lift", "confidence", ascending=False).show(truncate=False)



+--------------+----------+------------------+------------------+---------------------+
|antecedent    |consequent|confidence        |lift              |support              |
+--------------+----------+------------------+------------------+---------------------+
|[5, 4, 1, 6]  |[7]       |0.6566265060240963|1.9349963315797925|0.0017420210640712152|
|[5, 4, 1]     |[7]       |0.6357388316151202|1.873442962981665 |0.002956641255533714 |
|[5, 4, 6]     |[7]       |0.6261980830670927|1.8453275682000216|0.003132441546403286 |
|[5, 4]        |[7]       |0.6104651162790697|1.7989644793810424|0.006712374742292755 |
|[5, 3, 1]     |[7]       |0.5957943925233645|1.755731688154262 |0.004075370379249173 |
|[5, 3, 1, 6]  |[7]       |0.5900383141762452|1.7387692439279347|0.0024612040721740105|
|[5, 3]        |[7]       |0.5895833333333333|1.7374284721895163|0.009045724057470714 |
|[5, 3, 6]     |[7]       |0.57              |1.679718833890642 |0.00455482571798437  |
|[5, 3, 15]    |[7]       |0.541

                                                                                