<a href="https://colab.research.google.com/github/mazinkamal134/DS_MRP_2024/blob/main/TensiStrength/2_TensiStrenght_Stress_Score_Calculator_Instance_x.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Please note:
- Create one copy of this notebook per file split of each disorder and run the copies in parallel; this reduces the total stress score calculation time.

- This notebook must be run on a local Colab runtime or a local Jupyter server, on a machine with Java installed.

In [None]:
import pandas as pd
import numpy as np
import pickle
import subprocess
import os
import re
import html
from tqdm import tqdm
import time

## Global Params

In [None]:
# Configure the instance, disorder, and the location of the API and the file chunks
instanceId = 0
disorder = "anxiety" # ["depression", "ptsd"]
# API location
# NOTE(review): path is also handed to the jar as the "sentidata" folder in analyze();
# confirm whether the jar expects a trailing separator on that folder.
path = "/Stress_Detection_Files"
# Use os.path.join so a separator is inserted between the folder and the jar name
# (plain string concatenation produced "/Stress_Detection_FilesTensiStrengthMain.jar").
apiPath = os.path.join(path, "TensiStrengthMain.jar")
# Folder holding the per-instance pickle chunks (input and output)
tensiStrengthDir = "/TensiStrength/Chunks"

## Data cleaning

In [None]:
# Patterns used by clean_text. They are built once here instead of on every
# call, because clean_text is invoked once per tweet.
_NON_WORD = re.compile(r"\W+")
_SLASHES = re.compile(r"[\\/]")
_URLS = re.compile(r"http\S+|www\S+|https\S+")
_HASHTAG = re.compile(r"#\w+")
_EMOJIS = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002500-\U00002BEF"  # Other symbols
    "\U0001F004"             # Mahjong Tile Red Dragon
    "\U0001F0CF"             # Playing Card Black Joker
    "\U0001F018-\U0001F270"  # Various Asian characters
    "\U0001F201-\U0001F251"  # Enclosed characters
    "\U0001F926-\U0001F937"  # Additional emoji
    "\U0001F97C-\U0001F9FF"  # Additional emoji
    "\U0001F9E0-\U0001F9E6"  # Additional emoji
    "\U0001F680-\U0001F6C5"  # Transport and map symbols
    "\U0001F6F4-\U0001F6F8"  # Additional transport and map symbols
    "\U0001F7E0-\U0001F7EB"  # Additional symbols
    "\U0001F90D-\U0001F93A"  # Additional emoji
    "\U0001F93C-\U0001F945"  # Additional emoji
    "\U0001F947-\U0001F978"  # Additional emoji
    "\U0001F97A-\U0001F9CB"  # Additional emoji
    "\U0001F9CD-\U0001FA6D"  # Additional emoji
    "\U0001FA60-\U0001FA6D"  # Additional emoji
    "\U0001FA70-\U0001FA74"  # Additional emoji
    "\U0001FA78-\U0001FA7A"  # Additional emoji
    "\U0001FA80-\U0001FA86"  # Additional emoji
    "\U0001FA90-\U0001FAA8"  # Additional emoji
    "\U0001FAB0-\U0001FAB6"  # Additional emoji
    "\U0001FAC0-\U0001FAC2"  # Additional emoji
    "\U0001FAD0-\U0001FAD6"  # Additional emoji
    "\u200d"                 # Zero Width Joiner
    "\u2640-\u2642"          # Gender symbols
    "\u2600-\u2B55"          # Miscellaneous symbols and dingbats
    "\u23cf"                 # Eject symbol
    "\u23e9"                 # Fast forward
    "\u231a"                 # Watch
    "\ufe0f"                 # Dingbats
    "\u3030"                 # Wavy dash
    "\u00a9"                 # Copyright
    "\u00ae"                 # Registered
    "\u2122"                 # Trademark
    "]+", flags = re.UNICODE)


# Function to preprocess the text
def clean_text(text):
    """Normalise a piece of text before it is sent to the Java API.

    Steps, in order: unescape HTML entities, replace line feeds with spaces,
    lowercase, replace emoji/symbol characters, URLs, hashtags and slashes
    with spaces, spell out "%" as " percent", collapse every run of non-word
    characters into a single space, trim, and terminate with a ".".

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, always ending with "." (an input with no word
        characters yields ".").
    """
    # remove html related
    text = html.unescape(text)
    # remove the line feeds
    text = text.replace("\n", " ")
    # lowercase text
    text = text.lower()
    # Remove emojis from the text using the pattern
    text = _EMOJIS.sub(" ", text)
    # Substitute the matched string in URLs with space.
    text = _URLS.sub(" ", text)
    # Substitute the matched string in HASHTAG with space.
    text = _HASHTAG.sub(" ", text)
    # Substitute the matched string in SLASHES with space.
    text = _SLASHES.sub(" ", text)
    # Replace % (this is specific to the Java API)
    text = text.replace("%", " percent")
    # Remove what's left
    text = _NON_WORD.sub(" ", text)
    # Trim the text and add . to the end. No "." can survive the non-word
    # substitution above, so the terminator is always appended.
    return text.strip() + "."

## Stress score analyzer function

In [None]:
# Run the API passing a piece of text and getting the output from cmd
def analyze(text, javaPath = "C:/Program Files/Java/jre-1.8/bin/java.exe"):
    """Score one piece of text with the Java API and return its two scores.

    The jar at ``apiPath`` is launched once per call, with ``path`` given as
    its "sentidata" folder and ``text`` given as the text to analyse. The
    process output is split on "+" and the first two fields are returned.

    Args:
        text: The (already cleaned) text to score.
        javaPath: Java executable used to launch the jar. Defaults to the
            Windows JRE 1.8 location this notebook was written for.

    Returns:
        A ``(relaxScore, stressScore)`` tuple of strings taken from the
        process output, or ``(None, None)`` when the process cannot be
        started, exits with a non-zero code, or prints fewer than two
        "+"-separated fields.
    """
    cmd = [javaPath, "-jar", apiPath, "sentidata", path, "explain", "text", text, "urlencoded", "mood", "0"]

    # Run without a shell: the argument list is passed to the process as-is,
    # so the tweet text is never interpreted by cmd.exe / sh.
    try:
        result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, text = True, shell = False)
    except OSError as err:
        # e.g. the Java executable does not exist at javaPath
        print("Error starting command: ", err)
        return None, None

    # Unsuccessful run
    if result.returncode != 0:
        print("Error executing command. Return code: ", result.returncode, " stderr: ", result.stderr)
        return None, None

    # Successful run: format the output
    splitted = result.stdout.split("+")
    if len(splitted) < 2:
        print("Unexpected API output: ", result.stdout)
        return None, None

    relaxScore, stressScore = splitted[0], splitted[1]
    return relaxScore, stressScore

## Read the tweets file

In [None]:
# Load this instance's chunk of tweets from its pickle file
fileName = f"{disorder}TweetsDfWithTensiStrength{instanceId}.pickle"
print("Working for:", fileName)
chunkPath = os.path.join(tensiStrengthDir, fileName)
tweetsDf = pd.read_pickle(chunkPath)
print("Shape:", tweetsDf.shape)
# Show a few random rows as a sanity check
tweetsDf.sample(3)

## Run the API

In [None]:
# Run the tweets dataframe against the Java API
# Create two new columns for the scores. They are object-typed because
# analyze() returns the scores as strings (or None on failure); writing
# strings into a float64 column is an incompatible-dtype assignment.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
tweetsDf["relaxScore"] = pd.Series(np.nan, index = tweetsDf.index, dtype = object)
tweetsDf["stressScore"] = pd.Series(np.nan, index = tweetsDf.index, dtype = object)

start = time.time()
for index, row in tqdm(tweetsDf.iterrows(), total = tweetsDf.shape[0]):
    text = clean_text(row["cleaned_text"])
    try:
        relaxScore, stressScore = analyze(text)
    except Exception as err:
        # Best effort: record the failure and keep going. Catching Exception
        # (not a bare except) lets a kernel interrupt stop this long loop.
        print(f"Scoring failed for row {index}: {err}")
        relaxScore, stressScore = None, None
    tweetsDf.at[index, "relaxScore"] = relaxScore
    tweetsDf.at[index, "stressScore"] = stressScore
end = time.time()
print("Elapsed time: ", np.round((end - start)/60.0/60, 2), " Hours")

Once the entire file is processed, check for rows whose stress score is None (the API call failed for those rows) and fix them accordingly.

## Save the result

In [None]:
# Write the scored dataframe to a pickle in the chunks folder
fileName = f"{disorder}TweetsDfWithTensiStrengthScore_{instanceId}.pickle"
scoredPath = os.path.join(tensiStrengthDir, fileName)
tweetsDf.to_pickle(scoredPath)

The next step is to combine all the files for each disorder into a single file containing the calculated stress scores for all English, non-empty timeline tweets.