In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType, FloatType, StringType

# Reuse the active Spark session when one exists; otherwise start one under
# the "frequent_itemsets" application name.
spark = (
    SparkSession.builder
    .appName("frequent_itemsets")
    .getOrCreate()
)

24/11/29 16:23:18 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Load the per-document topic/LIWC table and discard every row that has a
# null in any column.
data = (
    spark.read
    .parquet("../data/data_with_topics.parquet")
    .na.drop()
)

In [3]:
# Quick look at the first five rows to check the loaded columns.
data.show(n=5)

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+-------+-------+---------+-------+-----+
|     id|        cleaned_text|              topic0|              topic1|              topic2|              topic3|              topic4|              topic5|              topic6|              topic7|              topic8|emo_pos|emo_neg|emo_anx|emo_anger|emo_sad|moral|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+-------+-------+---------+-------+-----+
|1001mlo|hi im a 24 f who ...| 0.11968858380250595|7.442736887136972E-4| 0.06107536248365846|9.333486297156222E-4|0.052468338961077526|9.792222275302406E-4|0.001049030320834...| 0.0632594240943850

## Convert topic probabilities and LIWC scores into "attributes" for Association Rules Mining. 

These attributes correspond to either the presence or absence of a given topic or emotional/moral language, based on thresholds. 

E.g.: "document x has: topic 1, topic 4, moral language and negative emotional language", which is encoded as the item list `[1, 4, 12, 13]`.

In [4]:
# Each mapping associates a source column with a (threshold, item_id) tuple:
#   - threshold: the score the column must exceed for the attribute to count
#     as present in a document;
#   - item_id: the integer that represents the attribute in the item lists.

# Topics: one entry per column topic0..topic8, all sharing a 0.20 threshold
# (chosen after inspecting the distribution of topic probabilities and
# manually checking the resulting assignments). The item id is the topic number.
mapping_topics = {f"topic{number}": (0.2, number) for number in range(9)}

# Emotional / moral language (LIWC scores): same (threshold, item_id) layout.
# Thresholds are set above the mean scores reported for blogs in the
# development of the dictionary:
# Boyd, R. L. (n.d.). The Development and Psychometric Properties of LIWC-22.
mapping_emotions = dict(
    emo_pos=(1.17, 11),
    emo_neg=(0.81, 12),
    moral=(0.28, 13),
)

def create_items(input_df, mapping, output_col):
    """Add an array column listing the attributes whose score exceeds its threshold.

    Parameters
    ----------
    input_df : pyspark.sql.DataFrame
        DataFrame containing one column per key of ``mapping``.
    mapping : dict
        Maps a column name to a ``(threshold, value)`` tuple. ``value`` is
        emitted for a row when that row's column is strictly greater than
        ``threshold``.
    output_col : str
        Name of the array column to add.

    Returns
    -------
    pyspark.sql.DataFrame
        ``input_df`` plus ``output_col``, holding the matching values in the
        iteration order of ``mapping`` (an empty array when nothing matches).

    Notes
    -----
    The array is built and filtered in a single expression, so no temporary
    column is created. The previous implementation went through a helper
    column named "relevant", which overwrote and then dropped any input column
    of that name, and dropped the result itself when ``output_col`` was
    "relevant". Uses ``pyspark.sql.functions.filter`` (PySpark 3.1 or later).
    """
    # One slot per mapped column: the attribute value when the score is above
    # the threshold, null otherwise (`when` without `otherwise` yields null).
    candidates = F.array(*[
        F.when(F.col(name) > threshold, F.lit(value))
        for name, (threshold, value) in mapping.items()
    ])

    # Keep only the slots that matched.
    matched = F.filter(candidates, lambda item: item.isNotNull())

    return input_df.withColumn(output_col, matched)

In [5]:
# Attach the topic ids first, then the LIWC ids on top of that result.
df_topics = create_items(data, mapping_topics, "relevant_topics")
df_liwc = create_items(df_topics, mapping_emotions, "relevant_liwc_scores")

# Preview each intermediate column.
df_topics.select("relevant_topics").show(5)
df_liwc.select("relevant_topics", "relevant_liwc_scores").show(5)

# Join both id arrays into a single attribute list per document.
df = df_liwc.withColumn(
    "relevant_attr",
    F.concat(F.col("relevant_topics"), F.col("relevant_liwc_scores")),
)
df.select("relevant_attr").show(5)

# Persist the result for later use, replacing any previous export.
df.write.parquet("../data/data_with_attributes.parquet", mode="overwrite")

+---------------+
|relevant_topics|
+---------------+
|            [8]|
|         [0, 4]|
|         [1, 7]|
|         [3, 7]|
|         [6, 8]|
+---------------+
only showing top 5 rows

+---------------+--------------------+
|relevant_topics|relevant_liwc_scores|
+---------------+--------------------+
|            [8]|            [12, 13]|
|         [0, 4]|        [11, 12, 13]|
|         [1, 7]|        [11, 12, 13]|
|         [3, 7]|                [11]|
|         [6, 8]|                  []|
+---------------+--------------------+
only showing top 5 rows

+------------------+
|     relevant_attr|
+------------------+
|       [8, 12, 13]|
|[0, 4, 11, 12, 13]|
|[1, 7, 11, 12, 13]|
|        [3, 7, 11]|
|            [6, 8]|
+------------------+
only showing top 5 rows



                                                                                

## Association Rules Mining

In [6]:
from pyspark.ml.fpm import FPGrowth

# Mine rules with support >= 0.1 and confidence >= 0.5. The attribute lists
# are exposed under the column name "items" for FPGrowth.
fp = FPGrowth(minSupport=0.1, minConfidence=0.5)
fpm = fp.fit(df.select(F.col("relevant_attr").alias("items")))

# Strongest associations first: highest lift, ties broken by confidence.
(
    fpm.associationRules
    .orderBy(F.desc("lift"), F.desc("confidence"))
    .show(truncate=False)
)

[Stage 10:>                                                       (0 + 10) / 11]

+----------+----------+------------------+------------------+-------------------+
|antecedent|consequent|confidence        |lift              |support            |
+----------+----------+------------------+------------------+-------------------+
|[0]       |[12]      |0.5848391026291747|1.326063905955136 |0.13619842200135931|
|[7]       |[11]      |0.5295764100163819|1.2826948079952165|0.11144886279685977|
|[5]       |[12]      |0.5221016410799365|1.1838130151756199|0.11657686587011554|
|[13]      |[12]      |0.5128201448141743|1.1627681549124225|0.140782695205918  |
+----------+----------+------------------+------------------+-------------------+



                                                                                

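- When mental health is discussed, it is about 33% more likely than baseline to be discussed with negative emotional language (rule [0] → [12], lift ≈ 1.33).
- When social interaction is discussed, it is about 28% more likely than baseline to be discussed with positive emotional language (rule [7] → [11], lift ≈ 1.28).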

In [7]:
# Lower the support floor to 0.01 (confidence stays at 0.5) to surface rules
# with multi-attribute antecedents.
fp = FPGrowth(minSupport=0.01, minConfidence=0.5)
fpm = fp.fit(df.select(F.col("relevant_attr").alias("items")))

# Strongest associations first: highest lift, ties broken by confidence.
(
    fpm.associationRules
    .orderBy(F.desc("lift"), F.desc("confidence"))
    .show(truncate=False)
)



+-----------+----------+------------------+------------------+--------------------+
|antecedent |consequent|confidence        |lift              |support             |
+-----------+----------+------------------+------------------+--------------------+
|[5, 0, 13] |[12]      |0.6893991206643869|1.5631432416206035|0.01390057229538716 |
|[5, 0]     |[12]      |0.6503388786198399|1.474578067198627 |0.02911121836861339 |
|[0, 13]    |[12]      |0.629139409982946 |1.426510401992189 |0.04869337378473419 |
|[0, 13, 4] |[12]      |0.614774951076321 |1.393940434630193 |0.012377734655884005|
|[7, 4]     |[11]      |0.5746488230827639|1.3918653622921413|0.023855162971207926|
|[5, 0, 11] |[12]      |0.613556338028169 |1.3911773519785444|0.01098491937628668 |
|[7, 4, 12] |[11]      |0.5603203568532036|1.357160173594965 |0.01088838762423538 |
|[0, 13, 11]|[12]      |0.5962921422445074|1.352032522586772 |0.017805183164074426|
|[2, 0]     |[12]      |0.5961912479740681|1.3518037549654793|0.014493553057

                                                                                

When personal struggles and mental health are discussed with a moral tone, the text is 56% more likely than baseline to also have a negative emotional tone (rule [5, 0, 13] → [12], lift ≈ 1.56).

In [8]:
# Lower the support floor again to 0.001 (confidence stays at 0.5) to include
# rarer attribute combinations.
fp = FPGrowth(minSupport=0.001, minConfidence=0.5)
fpm = fp.fit(df.select(F.col("relevant_attr").alias("items")))

# Strongest associations first: highest lift, ties broken by confidence.
(
    fpm.associationRules
    .orderBy(F.desc("lift"), F.desc("confidence"))
    .show(truncate=False)
)



+--------------+----------+------------------+------------------+---------------------+
|antecedent    |consequent|confidence        |lift              |support              |
+--------------+----------+------------------+------------------+---------------------+
|[5, 0, 4, 12] |[13]      |0.5140845070422535|1.8726228458868832|0.0021571891529831265|
|[5, 0, 13, 4] |[12]      |0.7023733162283515|1.5925609265339968|0.0021571891529831265|
|[5, 0, 13]    |[12]      |0.6893991206643869|1.5631432416206035|0.01390057229538716  |
|[7, 5, 0, 13] |[12]      |0.6719128329297821|1.5234948410438245|0.0010933698446626806|
|[5, 0, 4]     |[12]      |0.6647940074906367|1.507353619594698 |0.004196176160597315 |
|[2, 5, 0]     |[12]      |0.6578332034294622|1.4915706957435853|0.0016627101781897342|
|[5, 0]        |[12]      |0.6503388786198399|1.474578067198627 |0.02911121836861339  |
|[5, 0, 13, 11]|[12]      |0.6495880535530381|1.472875646795654 |0.004970400212763862 |
|[7, 5, 0]     |[12]      |0.635

                                                                                

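When personal struggles, mental health and mindset are discussed with a negative emotional tone, the text is 87% more likely than baseline to also have a moral tone (rule [5, 0, 4, 12] → [13], lift ≈ 1.87). Note that this rule has very low support (≈ 0.002, i.e. about 0.2% of documents), so it describes a small subset of the data.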