In [22]:
# Import modules for: regular expressions; reading timestamps as date objects; listing files that match a wildcard pattern;
# reading JSONL files; building dataframes; sorting helpers; AI-based and predefined-dictionary sentiment analysis
import re
from datetime import datetime
from glob import glob
import jsonlines
import pandas as pd
import operator
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Predefined-dictionary (VADER) sentiment scorer: used below through polarity_scores()
analyzer = SentimentIntensityAnalyzer()

# AI emotion classifier: the model identifier is kept in one place so it is easy to spot and swap
EMOTION_MODEL = "bhadresh-savani/distilbert-base-uncased-emotion"
sentiment_pipeline = pipeline(model=EMOTION_MODEL)

In [33]:
# Column layout of the exported table; "turn" is the 1-based chronological rank added after sorting
OUTPUT_COLUMNS = [
    "turn", "comment_timestamp", "comment_id", "reply_to", "username", "comment_text",
    "negative", "neutral", "positive", "compound",
    "fear", "joy", "anger", "sadness", "love", "surprise",
]
# Emotion labels read from the AI classifier output, in the order they are written to the table
EMOTION_LABELS = ["fear", "joy", "anger", "sadness", "love", "surprise"]


def output_name_for(jsonl_path):
    """Return the output filename for an input file: the trailing '.jsonl' is replaced by '.csv'."""
    # Only the extension at the very end is stripped; a '.jsonl' elsewhere in the name is left alone
    return re.sub(r"\.jsonl$", "", jsonl_path, flags=re.IGNORECASE) + ".csv"


def split_comment_id(raw_cid):
    """Split a raw 'cid' into the pair (comment_id, comment_reply_to).

    If the 'cid' contains a full stop the comment is a reply: the string on the left of the first
    full stop is the id of the comment being replied to, the string on the right is the reply's own id.
    Without a full stop the 'cid' is returned unchanged and comment_reply_to is 'na'.
    """
    parent_id, full_stop, own_id = raw_cid.partition(".")
    if full_stop:
        return own_id, parent_id
    return raw_cid, "na"


def emotion_scores(classifier_output):
    """Return a {label: score} dictionary covering every label in EMOTION_LABELS.

    classifier_output is the list of {'label': ..., 'score': ...} dictionaries returned by the AI
    sentiment tool. A label missing from the output maps to None, so a score from a previously
    processed comment can never leak into the current row.
    """
    # Defensive: tolerate a result nested one level deeper (a list holding the list of dictionaries)
    if classifier_output and isinstance(classifier_output[0], list):
        classifier_output = classifier_output[0]
    found = {entry["label"]: entry["score"] for entry in classifier_output}
    return {label: found.get(label) for label in EMOTION_LABELS}


# List all filenames with the .jsonl extension
files = glob("*.jsonl")

# For each file do:
for file in files:
    # create an empty list to save the spreadsheet rows
    data = []
    output_filename = output_name_for(file)
    # Read the file as a jsonlines one:
    with jsonlines.open(file) as comments:
        # For each line (i.e. metadata data-points for one comment) do:
        for comment in comments:
            # Extract the comment id ('cid') and split it into the comment's own id and the id it replies to
            raw_cid = str(comment["cid"])
            print(f"Processing comment {raw_cid}")
            comment_id, comment_reply_to = split_comment_id(raw_cid)

            # Extract other metadata data-points and the content of the comment (the actual message)
            username = str(comment["author"])
            comment_timestamp = comment["time_parsed"]
            comment_text = str(comment["text"])

            # Calculate the AI sentiment of the comment; truncation=True keeps comments longer than the
            # model's maximum input length from raising an error
            emotions = emotion_scores(sentiment_pipeline(comment_text, top_k=None, truncation=True))

            # Calculate the predefined-dictionary sentiment of the comment
            pred_text = analyzer.polarity_scores(comment_text)

            # append all the extracted data to the list, in the same order as OUTPUT_COLUMNS (minus "turn")
            data.append(
                [comment_timestamp, comment_id, comment_reply_to, username, comment_text,
                 pred_text["neg"], pred_text["neu"], pred_text["pos"], pred_text["compound"]]
                + [emotions[label] for label in EMOTION_LABELS]
            )

    # the input file is closed at this point; everything below works on the collected rows only

    # after all the comments have been extracted, sort them in chronological order using their timestamps
    data.sort(key=operator.itemgetter(0))

    # add a progressive number to the newly ordered comments, to preserve the chronological order
    for index, element in enumerate(data, start=1):
        element.insert(0, index)

    # create a dataframe with all the collected comments
    csv_df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)
    # write the dataframe to a file (tab-separated values, despite the .csv extension)
    csv_df.to_csv(output_filename, sep="\t", index=False)

Processing comment UgxRqHRnwAelIPmIW454AaABAg
[{'label': 'joy', 'score': 0.9969204664230347}, {'label': 'anger', 'score': 0.0010109911672770977}, {'label': 'sadness', 'score': 0.000612446223385632}, {'label': 'fear', 'score': 0.0006110440008342266}, {'label': 'love', 'score': 0.0005889174644835293}, {'label': 'surprise', 'score': 0.0002562185109127313}]
Processing comment Ugwqe3vTioYXvaqRYLd4AaABAg
[{'label': 'joy', 'score': 0.9816443920135498}, {'label': 'anger', 'score': 0.008725502528250217}, {'label': 'fear', 'score': 0.004825378302484751}, {'label': 'sadness', 'score': 0.003243070561438799}, {'label': 'surprise', 'score': 0.0008458417723886669}, {'label': 'love', 'score': 0.0007158176158554852}]
Processing comment UgzcDSg8k5U72ykEkbJ4AaABAg
[{'label': 'joy', 'score': 0.9424716830253601}, {'label': 'anger', 'score': 0.03320019692182541}, {'label': 'surprise', 'score': 0.014175686053931713}, {'label': 'sadness', 'score': 0.004915421828627586}, {'label': 'fear', 'score': 0.0034292603

KeyboardInterrupt: 