## Set up

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("network") \
        .getOrCreate()

24/12/10 14:42:58 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [11]:
import sparknlp
from sparknlp.base import DocumentAssembler
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql import types as T
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
from pyspark.sql import functions as F
import itertools
from sparknlp.base import Finisher
from pyspark.sql.functions import col, least, greatest, lit

In [3]:
data = spark.read.csv("../data/cleaned_moral_scores.csv", header= True).select(["id", "cleaned_text"])

In [4]:
data.show(3)

+-----+--------------------+
|   id|        cleaned_text|
+-----+--------------------+
|hk5r2|i had an appointm...|
|iqimz|i created this si...|
|pfzt5|hello everyone  i...|
+-----+--------------------+
only showing top 3 rows



## Define words to include in the network

In [5]:
liwc_moral = [
    "absurd", "absurdity", "absurdities", "accusation", "accusations", "accusative", "accuse",
    "accuses", "accusing", "admirable", "admonish", "admonished", "admonishing", "admonishes",
    "admonishment", "adulterate", "adulterated", "adulterating", "adulterates", "adulteration",
    "adulterer", "adulterers", "adulteress", "adulteresses", "adulteries", "adulterous",
    "adultery", "amoral", "amorality", "arrogant", "betray", "betrayed", "betraying",
    "betrays", "betrayal", "betrayer", "bigot", "bigots", "bigoted", "bigotry", "blame",
    "blames", "blamed", "blaming", "brave", "bravely", "braver", "bravest", "buffoon",
    "buffoons", "buffoonish", "careless", "carelessness", "carpetbag", "carpetbags",
    "censure", "censured", "censures", "censuring", "chastise", "chastised", "chastises",
    "chastising", "chauvinism", "chauvinist", "chauvinistic", "cheat", "cheats", "cheated",
    "cheating", "commend", "commended", "commends", "commending", "competence",
    "competent", "conceit", "conceited", "connive", "connived", "connives", "conniving",
    "conscience", "contemptible", "contemptibly", "corrupt", "corrupted", "corrupting",
    "corruption", "courage", "courageous", "craven", "criminal", "criminals", "crook",
    "crooks", "cruel", "crueler", "cruelest", "crueller", "cruellest", "cruelly",
    "cruelties", "cruelty", "debauch", "debauched", "debauches", "debauching", "decadence",
    "decadent", "deceive", "deceived", "deceives", "deceiving", "decency", "decent",
    "decently", "deceptive", "deceptively", "delinquent", "delinquency", "deprave",
    "depraved", "depraves", "depraving", "deserve", "deserved", "deserves", "deserving",
    "despicable", "deviant", "deviants", "dignified", "dignity", "disapprove",
    "disapproved", "disapproves", "disapproving", "disgrace", "disgraced", "disgraces",
    "disgracing", "dishonest", "dishonesty", "dishonor", "dishonored", "dishonorable",
    "dishonourable", "disloyal", "disrespect", "disrespected", "disrespecting",
    "disrespectful", "diss", "dissed", "dissing", "dumb", "dutiful", "duty", "elitism",
    "elitist", "elitists", "equality", "equitable", "ethic", "ethical", "ethics", "evil",
    "evildoer", "evildoers", "excuse", "excuses", "excused", "excusing", "fairness",
    "faithful", "faithless", "fake", "fakes", "faking", "fatass", "fatso", "fatties",
    "forgive", "forgiven", "forgives", "forgiving", "foul", "fouled", "fouling", "fraud",
    "frauds", "fraudulent", "generosity", "generous", "glutton", "gluttony", "godless",
    "godlessness", "grandiose", "greed", "greedy", "hateful", "haters", "heathen",
    "heathens", "hero", "heroes", "heroic", "heroine", "heroines", "hideous", "hideously",
    "homily", "fault", "faults", "faulted", "faulting", "honest", "honesty", "honor",
    "honored", "honoring", "honorable", "honour", "horrid", "horridly", "humane",
    "humanitarian", "hypocrisy", "hypocrite", "hypocrites", "ideal", "ideals", "ideologue",
    "ignoble", "ignorant", "immodest", "immoral", "immorality", "inappropriate",
    "inconsiderate", "incorruptible", "indecency", "indecent", "indignantly", "inequity",
    "infallible", "infidel", "infidels", "infidelity", "inhumane", "iniquity", "injustice",
    "innocence", "innocent", "innocently", "irresponsible", "judge", "judged", "judges",
    "judging", "judgy", "justice", "justness", "kosher", "laughingstock", "lawless",
    "lawlessness", "lazier", "laziest", "laziness", "lazy", "lecherous", "lewd", "liar",
    "liars", "lousy", "loyal", "magnanimity", "magnanimous", "mansplain", "misbehave",
    "misbehaved", "misbehaving", "misconduct", "miser", "miserly", "misogynist",
    "misogynistic", "mistreat", "mistreated", "mistreating", "misuse", "misused",
    "misuses", "misusing", "molest", "molested", "molesting", "moral", "morality",
    "nefarious", "nerd", "nerds", "nerdy", "noble", "obstinate", "offensive",
    "opinionated", "outlaw", "outlawed", "outlawing", "outrageous", "overbearing",
    "overconfident", "pariah", "patriot", "patriots", "pedophile", "penance", "penitent",
    "perv", "pervert", "perverted", "perverts", "pervy", "pettier", "pettiest", "pettily",
    "pettiness", "petty", "phony", "pitiful", "pitifully", "plagiarize", "prejudice",
    "principled", "promiscuity", "promiscuous", "prude", "prudish", "psycho", "puny",
    "pussies", "racist", "rapist", "rectitude", "redneck", "reprehensible", "repulsive",
    "revolting", "revoltingly", "ridicule", "ridiculous", "ridiculously", "righteous",
    "righteously", "ruthless", "scandal", "scandals", "scruples", "scrupulous", "scum",
    "selfish", "selflessness", "sexism", "sexist", "shame", "shamed", "shaming", "sin",
    "sincere", "sincerity", "sinful", "sinfully", "sinister", "sinned", "sinner", "sinners",
    "sins", "sissies", "sissy", "skank", "slander", "slandered", "slandering", "slimy",
    "slothful", "slut", "sluts", "slutty", "smug", "sneakily", "sneaky", "snide",
    "snidely", "snob", "snobs", "spineless", "thief", "thieves", "traitor", "transgress",
    "treacherous", "treason", "trustworthy", "trusty", "truthful", "truthfully",
    "unacceptable", "unethical", "unfair", "unfaithful", "ungodly", "ungracious", "unjust",
    "unloyal", "unpatriotic", "unprincipled", "unqualified", "unreasonable", "unsavory",
    "unscrupulous", "unselfish", "untrustworthy", "unvirtuous", "unworthy", "upstanding",
    "useful", "useless", "vain", "vainly", "vanity", "vengeance", "vile", "vilify",
    "vindicate", "virtue", "virtuous", "wanton", "wicked", "worthless", "worthwhile",
    "worthy", "wrong", "wrongdoing", "wronged", "wrongful", "wrongly", "zealot"]

In [7]:
related_morality = [
    "respect", "mistakes", "admit", "abuse", "responsibility", "lie", 
    "prove", "truth", "society", "behavior", "anger", "human", 
    "acknowledge", "belief", "treat", "angry", "god", "act", 
    "standard", "boundaries", 
    "character", "stupid", "horrible", "opinion", "accept"]


In [6]:
topic0 = [
    "therapy", "emotion", "relationship", "feeling", "therapist", "child", 
    "emotional", "family", "depression", "toxic", "call", "mom", "god", 
    "response", "sad", "angry", "parent", "voice", "anger", "mother"
]

topic1 = [
    "book", "game", "video", "read", "value", "youtube", "purpose", 
    "opinion", "play", "waste", "teach", "boundaries", "development", 
    "personal", "character", "meaningful", "knowledgeable", "information", 
    "trouble", "productive"
]

topic2 = [
    "woman", "man", "social", "media", "sex", "porn", "anxiety", "date", 
    "relationship", "partner", "addiction", "drug", "anxious", "dude", 
    "male", "guy", "instagram", "account", "sick", "doctor"
]

topic3 = [
    "skill", "interest", "activity", "hobbies", "practice", "language", 
    "content", "hobby", "specific", "music", "internet", "attention", 
    "online", "movie", "learn", "project", "community", "sport", "area", 
    "tv"
]

topic4 = [
    "fear", "mind", "moment", "happiness", "journey", "success", "mindset", 
    "failure", "belief", "present", "future", "true", "reality", "happy", 
    "power", "process", "feeling", "challenge", "desire", "truth"
]

topic5 = [
    "pain", "choice", "decision", "shit", "worth", "trust", "respect", 
    "fuck", "mistakes", "proud", "nobody", "treat", "environment", "suck", 
    "pressure", "tough", "excuse", "regret", "wrong", "move"
]

topic6 = [
    "weight", "food", "body", "gym", "exercise", "sleep", "bed", "drink", 
    "task", "list", "routine", "healthy", "wake", "water", "energy", 
    "health", "diet", "eat", "cold", "smoke"
]

topic7 = [
    "girl", "guy", "confidence", "conversation", "talk", "nice", 
    "confident", "friend", "group", "personality", "comfortable", "weird", 
    "fun", "meet", "comfort", "hair", "kinda", "attractive", "face", "club"
]

topic8 = [
    "job", "school", "money", "college", "class", "career", "high", "home", 
    "parent", "business", "car", "degree", "study", "house", "kid", 
    "company", "university", "family", "student", "country"
]

In [8]:
words_keep = (liwc_moral + related_morality + topic0 + topic1 + topic2 + topic3 + topic4 + 
              topic5 + topic6 + topic7 + topic8)

In [10]:
len(words_keep)

656

## Define stopwords to facilitate further preprocessing

In [16]:
english = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", 
    "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "cannot", "could", "did", 
    "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", 
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "it", 
    "its", "itself", "let", "me", "more", "most", "must", "my", "myself", "no", "nor", "not", "of", "off", "on", 
    "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "some", "such", 
    "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", 
    "through", "to", "too", "under", "until", "up", "very", "was", "we", "were", "what", "when", "where", "which", 
    "while", "who", "whom", "why", "with", "would", "you", "your", "yours", "yourself", "yourselves", "will", "ll", 
    "re", "ve", "d", "s", "m", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", 
    "s", "t", "u", "v", "w", "x", "y", "z", "many", "us", "ok", "hows", "ive", "ill", "im", "cant", "topics", "topic",
    "discuss", "thoughts", "yo", "thats", "whats", "lets", "nothing", "oh", "omg", 
         "things", "stuff", "yall", "haha", "yes", "no", "wo", "like", 'good', 
         'work', 'got', 'going', 'dont', 'really', 'want', 'make', 'think', 
         'know', 'feel', 'people', 'life', "getting", "lot" "great", "i", "me", 
         "my", "myself", "we", "our", "ours", "ourselves", 
        "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", 
        "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
        "they", "them", "their", "theirs","themselves", "what", "which", "who", 
        "whom", "this", "that", "these", "those", "am", "is", "are", "was", 
        "were", "be", "been", "being", "have", "has", "had", "having", "do", 
        "does", "did", "doing", "will", "would", "can", "could", "may",
        "might", "shall", "ought", "about", "above", "across", "after", 
        "against", "along", "amid", "among", "around", "as", "at", "before", "behind",
        "below", "beneath", "beside", "between", "beyond", "but", "by", 
        "considering", "despite", "down", "during", "except", "for",
        "from", "in", "inside", "into", "like", "near", "next", "notwithstanding",
        "of", "off", "on", "onto", "opposite", "out", "outside", "over", "past",
        "regarding", "round", "since", "than", "through", "throughout", "till", 
        "to", "toward", "towards", "under", "underneath", "unlike", "until", "up",
        "upon", "versus", "via", "with", "within", "without", "cant", "cannot", 
        "couldve", "couldnt", "didnt", "doesnt", "dont", "hadnt", "hasnt", 
        "havent", "hed", "hell", "hes", "howd", "howll", "hows", "id", "ill", 
        "im", "ive", "isnt", "itd", "itll", "its", "lets", "mightve", 
        "shant", "shed", "shell", "shes", 
        "thatll", "thats", "thered", "therell", "therere", "theres", "theyd", 
        "theyll", "theyre", "theyve", "wed", "well", "were", "weve", "werent", 
        "whatd", "whatll", "whatre", "whats", "whatve", "whend", "whenll", 
        "whens", "whered", "wherell", "wheres", "whichd", "whichll", "whichre", 
        "whichs", "whod", "wholl", "whore", "whos", "whove", "whyd", "whyll", 
        "whys", "wont", "wouldve", "wouldnt", "youd", "youll", "youre", "youve",
        "f", "m", "because", "go", "lot", "get", "still", "way", "something", "much",
        "thing", "someone", "person", "anything", "goes", "ok", "so", "just", "mostly", 
        "put", "also", "lots", "yet", "ha", "etc", "even", "one", "bye", "take", "wasnt"]

time = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", 
        "sunday", "morning", "noon", "afternoon", "evening", "night", "midnight",
        "dawn", "dusk", "week", "weekend", "weekends","weekly", "today", 
        "yesterday", "tomorrow", "yesterdays", "todays", "mondays", "tuesdays",
        "wednesdays", "thursdays", "fridays", "saturdays", "sundays", "day",
        "everyday", "daily", "workday", 'time', 'month', 'year', 'pm', 'am', "ago",
        "year", "now"]

reddit = ["welcome", "hi", "hello", "sub", "reddit", "thanks", "thank", "maybe",
          "wo30", "mods", "mod", "moderators", "subreddit", "btw", "aw", "aww", 
          "aww", "hey", "hello", "join", "joined", "post", "rselfimprovement", "blah"]

topic_specific = ["self", "improvement", "change", "action",
    'change', 'start', 'goal', 'habit', 'new', 'old', 
    'care', 'world', 'everyone', 'love', 'u', 'right', 'mean', 'matter',
    'best', 'step', 'focus', 'hard', 'small',
    'bad', 'help', 'time', 'problem', 'issue', 'advice',
    'bit', 'experience', 'different',
    'point', 'situation', 'negative', 'control', 'positive',
    'use', 'question', 'idea', 'amp', 'medium', 'hour', 'day', 'minute',
    'aaaaloot', "selfimprovement", "_", "ampxb"]

stopwords = english + time + reddit + topic_specific

## Create network from corpus, only keeping defined words

### Standard preprocessing pipeline

In [18]:
documentAssembler = DocumentAssembler()\
     .setInputCol("cleaned_text")\
     .setOutputCol('document')

tokenizer = Tokenizer() \
            .setInputCols(['document'])\
            .setOutputCol('tokenized')

normalizer = Normalizer() \
     .setInputCols(['tokenized']) \
     .setOutputCol('normalized')

lemmatizer = LemmatizerModel.load("../models/lemma_ewt_en_3.4.3_3.0_1651416655397/")\
      .setInputCols("normalized")\
      .setOutputCol("lemmatized")

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('words') \
     .setStopWords(stopwords)

finisher = Finisher().setInputCols(['words'])


In [19]:
my_pipeline = Pipeline(
      stages = [
          documentAssembler,
          tokenizer,
          normalizer,
          lemmatizer,
          stopwords_cleaner,
          finisher
      ])
pipelineModel = my_pipeline.fit(data)
processed_data = pipelineModel.transform(data)
processed_data.persist()
processed_data.show(1)

[Stage 5:>                                                          (0 + 1) / 1]

+-----+--------------------+--------------------+
|   id|        cleaned_text|      finished_words|
+-----+--------------------+--------------------+
|hk5r2|i had an appointm...|[appointment, den...|
+-----+--------------------+--------------------+
only showing top 1 row



                                                                                

### Keep only desired words 

In [21]:
def filter_words(word_list):
    return [word for word in word_list if word in words_keep]

filter_words_udf = udf(filter_words, ArrayType(StringType()))

filtered_df = processed_data.withColumn("filtered_words", 
            filter_words_udf(processed_data["finished_words"]))

filtered_df.show(20)

[Stage 6:>                                                          (0 + 1) / 1]

+-----+--------------------+--------------------+--------------------+
|   id|        cleaned_text|      finished_words|      filtered_words|
+-----+--------------------+--------------------+--------------------+
|hk5r2|i had an appointm...|[appointment, den...|       [call, smoke]|
|iqimz|i created this si...|[create, site, se...|[forgive, useful,...|
|pfzt5|hello everyone  i...|[recently, made, ...|[information, min...|
|pk714|i grew up with bo...|[grow, body, dysm...|[body, social, de...|
|q0q8x|i have to ask whe...|[content, never, ...|[content, process...|
|q412v|nothing but oppor...|[butt, opportunit...|[feeling, mind, p...|
|q5mqk|im getting out of...|[comfort, zone, t...|[comfort, club, c...|
|q70xe|hey everyone firs...|[first, learn, so...|[learn, social, t...|
|q7mrn|facebook is great...|[facebook, great,...|[stupid, social, ...|
|qcsyp|okay so im 18 yea...|[okay, male, semi...|[male, standard, ...|
|qu825|well to give ever...|[give, everybody,...|[worthwhile, hous...|
|qxco0

                                                                                

### Create edges

In [22]:
def generate_edges(tokens):
    return [list(pair) for pair in itertools.combinations(tokens, 2)]

generate_edges_udf = udf(generate_edges, ArrayType(ArrayType(StringType())))

In [23]:
df_edges = filtered_df.withColumn("edges", generate_edges_udf(F.col("filtered_words")))

In [24]:
df_flat_edges = df_edges.select(
    F.col("id"),
    F.explode(F.col("edges")).alias("edge")
)

In [25]:
edges_df = df_flat_edges.select(
    F.col("id").alias("id_doc"),
    F.col("edge")[0].alias("node1"),
    F.col("edge")[1].alias("node2")
)

In [26]:
edges_df = edges_df.withColumn("weight", lit(1))

In [27]:
# Normalize the pairs: ensure node1 is always less than node2, so they can be always on the same order
edges_df = edges_df.withColumn("node1_norm", least(col("node1"), col("node2"))) \
             .withColumn("node2_norm", greatest(col("node1"), col("node2")))

In [34]:
edges_df = edges_df.groupBy("node1_norm", "node2_norm").sum("weight") \
                        .withColumnRenamed("sum(weight)", "weight")

In [35]:
edges_df = edges_df.filter(F.col('node1_norm') != F.col('node2_norm'))
edges_df.orderBy('weight', ascending=False).show(10)



+----------+----------+------+
|node1_norm|node2_norm|weight|
+----------+----------+------+
|      book|      read|116662|
|       man|     woman| 85356|
|     learn|      read| 57908|
|      game|      play| 54228|
|     media|    social| 49230|
|      book|     learn| 43003|
|      high|    school| 42261|
|     learn|     skill| 41263|
|       job|     money| 40559|
|       job|     learn| 40473|
+----------+----------+------+
only showing top 10 rows



                                                                                

In [36]:
edges_df.write.mode("overwrite").csv("edges_topics_morality_net", header=True)

                                                                                

### Create nodes 

In [37]:
vertices_df = edges_df.select(F.col("node1_norm").alias("node")).union(edges_df.select(F.col("node2_norm").alias("node"))).distinct()
vertices_df.write.mode("overwrite").csv("nodes_topics_morality_net", header=True)

                                                                                