In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Prediction Phase

In [1]:
import pandas as pd
import re
import demoji
import jaconv
from nltk.corpus import words
import nltk
from xml.sax.saxutils import unescape

# Download the dictionary by uncommenting the following line only the first time it is run
# nltk.download('words')

# Get a list of English words from nltk
english_words = set(words.words())

# Regular expression patterns for detecting URLs
url_pattern = re.compile(r'https?://\S+|www\.\S+')

def confirm_URL(df, url_pattern):
    # Checks each line of text for URLs and deletes lines that contain them
    df = df[~df['post'].str.contains(url_pattern, na=False)]

    return df

def convert_to_halfwidth(text):
    return jaconv.z2h(text, kana=False, ascii=True, digit=True)

def replace_words(df, column, replacements):
    """
    Replaces a word in the text of the specified column of the data frame

    :param df: Data Frame
    :param column: Name of column containing text
    :param replacements: Dictionary of words to be replaced (keys are before replacement, values are after replacement)
    """
    df[column] = df[column].apply(lambda x: ' '.join([replacements.get(word, word) for word in x.split()]))
    return df

def text_conversion(text):
    """
    Converting text to complex

    :param text: text
    """

    # Decode encoded strings using unescape
    text = unescape(text)

    # Delete pictograms
    text = demoji.replace(string=text, repl='')

    # Convert to half-width characters
    text = convert_to_halfwidth(text)

    # Delete hashtags
    text = text.replace(';', '; ')
    text = re.sub(r'#.*', "", text)

    # Various symbol processing
    text = re.sub(r'\！', '!', text)
    text = re.sub(r'\!+', '.', text)

    text = re.sub(r'\？', '?', text)
    text = re.sub(r'\‽', '?', text)
    text = re.sub(r'\?+', '.', text)

    text = re.sub(r'\．', '.', text)
    text = re.sub(r'\.+', '.', text)

    text = re.sub(r'\，', ',', text)
    text = re.sub(r'\,+', ',', text)

    text = re.sub(r'\【', '[', text)
    text = re.sub(r'\「', '[', text)
    text = re.sub(r'\[+', '', text)

    text = re.sub(r'\】', ']', text)
    text = re.sub(r'\」', ']', text)
    text = re.sub(r'\]+', '', text)

    text = re.sub(r'\）', ')', text)
    text = re.sub(r'\)+', '', text)

    text = re.sub(r'\（', '(', text)
    text = re.sub(r'\(+', '', text)

    text = re.sub(r'\＿', '_', text)
    text = re.sub(r'\_+', ' ', text)

    # Delete URLs
    text = re.sub(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+\$,%#]+)", "" ,text)
    text = text.replace('…', '').replace('²', '').replace('*', '').replace('%', 'percent').replace('; -;', '').replace('.,', '.').replace('. ,', '.').replace(' .', '.')
    text = text.replace('xa0,', '').replace('xa0', '').replace('amp;', '').replace('&', 'and').replace('@', 'at').replace('=', 'is').replace('+', '')
    text = text.replace('percent', ' percent').replace('yo', ' yo')
    # text = text.replace('Removed', '').replace('Removed,', '').replace('Removed.', '')

    # Add spaces after “.” or “,” if there are no spaces
    text = text.replace('.', '. ').replace(',', ', ')

    # Remove white space at the beginning and end of sentences
    text = text.strip()

    # Replace consecutive spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    text = text.replace('"', '').replace("'", '').replace(":", '').replace("Id", 'I would')

    text = text.lower()

    # Newline code deletion
    text = text.replace('\n', '').replace('\r', '').replace('\t', '').replace('\\n', '').replace('\\r', '').replace('\\t', '').replace('\\', '')

    return text

def capitalize_i(text):
    # Replace single 'i' with 'I'
    text = re.sub(r'\bi\b', 'I', text)
    # Replace 'i' in 'i'm' and 'i'll' with 'I'
    text = re.sub(r"\bi'", "I'", text)

    return text

def capitalize_sentences_initial(text):
    """
    Capitalize the first letter of each sentence.

    :param text: sentence
    :return: String with the first letter of each sentence capitalized
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    sentences = [s.capitalize() for s in sentences]
    return ' '.join(sentences)

# Dictionary of words to be replaced
replacements = {
    "im": "I am",
    "iam": "I am",
    "i'm": "I am",
    "Im": "I am",
    "I'm": "I am",
    "Ive": "I have",
    "I'd": "I would",
    "ive": "I have",
    "i'd": "I would",
    "tbh": "to be honest",
    "kinda": "kind of",
    "cuz": "because",
    "geez": "jesus christ",
    "ill": "i'll",
    "Id": "i'd",
    "didnt": "didn't",
    "dont": "don't",
    "doesnt": "doesn't",
    "cant": "can't",
    "isnt": "isn't",
    "arent": "aren't",
    "wasnt": "wasn't",
    "werent": "weren't",
    "havent": "haven't",
    "hasnt": "hasn't",
    "hadnt": "hadn't",
    "wont": "won't",
    "wouldnt": "wouldn't",
    "shouldnt": "shouldn't",
    "couldnt": "couldn't",
    "mustnt": "mustn't",
    "shes": "she's",
    "hes": "he's",
    "its": "it's",
    "thats": "that's",
    "theres": "there's",
    "heres": "here's",
    "whos": "who's",
    "whats": "what's",
    "whys": "why's",
    "hows": "how's",
    "lets": "let's",
    "id've": "i'd have",
    "could've": "could have",
    "would've": "would have",
    "should've": "should have",
    "we're": "we are",
    "they're": "they are",
    "i'll": "I will",
    "you're": "you are",
    "it's": "it is",
    "idk": "I do not know",
    "we’re": "we are",
    "they’re": "they are",
    "i’ll": "I will",
    "you’re": "you are",
    "it’s": "it is",
    "id’ve": "i'd have",
    "could’ve": "could have",
    "would’ve": "would have",
    "should’ve": "should have",
    "kms": "kill myself",
    "laxy": "lazy",
    "fr": "for real",
    "dont": "do not",
    "plz": "please",
    "irl": "in real life",
    "meim": "maim",
    "imma": "I am going to",
    "ffs": "offs",
    "theyre": "they are",
    "dunno": "do not know",
    "ofc": "of course",
    "wouldk": "would",
    "cantdo": "can not do",
    "tw": "two",
    "harmi": "harm, i",
    "retardi": "retard i",
    "fem": "females",
    "dont": "do not",
    "coz": "because",
    "sch": "school",
    "u": "you",
    "bf": "boyfriend",
    "boy friend": "boyfriend",
    "girl friend": "girlfriend",
    "em": "them",
    "nobodyll": "nobody will",
    "h it": "hit",
    "schoo": "school",
    "xcuse me": "excuse me",
    "wasnt": "was not",
    "tgere": "there",
    "don": "do not",
    "didn": "did not",
    "4kdjejdne sjdjd fuckdjejekieirieiriodidieiridodiodididodidookdkekekdkoeooeoi": "i",
    "yt": "yet",
    "scaared": "scared",
    "fwiled": "failed",
    "wxpecter": "expected",
    "snd": "and",
    "oiving": "living",
    "awwy": "away",
    "cwre": "care",
    "ijust": "i just",
    "ppease": "please",
    "juet": "just",
    "ijust": "i just",
    "wwnt": "want",
    "tefuse": "refuse",
    "tovkill": "to kill",
}

# Dictionary of words to be replaced
detailed_replacements = {
    "i’m": "I am",
    "i'm": "I am",
    "i've": "I have",
    "i'll": "I will",
    "i'd": "I would",
    "didn't": "did not",
    "don't": "do not",
    "doesn't": "does not",
    "can't": "can not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "couldn't": "could not",
    "mustn't": "must not",
    "she's": "she is",
    "he's": "he is",
    "it's": "it is",
    "that's": "that is",
    "there's": "there is",
    "here's": "here is",
    "who's": "who is",
    "what's": "what is",
    "why's": "why is",
    "how's": "how is",
    "let's": "let us",
    "i'd have": "I would have",
    "could've": "could have",
    "would've": "would have",
    "should've": "should have",
    "we're": "we are",
    "they're": "they are",
    "i'll": "I will",
    "you're": "you are",
    "it's": "it is",

    # Version ’ instead of '
    "i’ve": "I have",
    "i’ll": "I will",
    "i’d": "I would",
    "didn’t": "did not",
    "don’t": "do not",
    "doesn’t": "does not",
    "can’t": "can not",
    "isn’t": "is not",
    "aren’t": "are not",
    "wasn’t": "was not",
    "weren’t": "were not",
    "haven’t": "have not",
    "hasn’t": "has not",
    "hadn’t": "had not",
    "won’t": "will not",
    "wouldn’t": "would not",
    "shouldn’t": "should not",
    "couldn’t": "could not",
    "mustn’t": "must not",
    "she’s": "she is",
    "he’s": "he is",
    "it’s": "it is",
    "that’s": "that is",
    "there’s": "there is",
    "here’s": "here is",
    "who’s": "who is",
    "what’s": "what is",
    "why’s": "why is",
    "how’s": "how is",
    "let's": "let us",
    "i’d have": "I would have",
    "could’ve": "could have",
    "would’ve": "would have",
    "should’ve": "should have",
    "we’re": "we are",
    "they’re": "they are",
    "i’ll": "I will",
    "you’re": "you are",
    "it’s": "it is",
    "covid19": "covid-19",
    "covid 19": "covid-19",
    "Covid 19": "covid-19",
    "Covid19": "covid-19",
    "schizoaffective": "schizo-affective",
    "schizo affective": "schizo-affective",
    "thatll": "that will",
    "ibs-d": "Ibs-d",
    "ibsd": "Ibs-d",
    "ibs d": "Ibs-d",
    "Ibs d": "Ibs-d",
    "Ibs-d": "Ibs-d",
    "B-day": "birthday",
    "b-day": "birthday",
}

# Dictionary of words to be replaced (slang words)
slang_dict = {
    "brb": "be right back",
    "dms": "direct messages",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "idk": "i don't know",
    "btw": "by the way",
    "bff": "best friends forever",
    "tbh": "to be honest",
    "imo": "in my opinion",
    "fyi": "for your information",
    "irl": "in real life",
    "yolo": "you only live once",
    "rofl": "rolling on the floor laughing",
    "smh": "shaking my head",
    "nvm": "never mind",
    "gtg": "got to go",
    "ily": "i love you",
    "jk": "just kidding",
    "tmi": "too much information",
    "fomo": "fear of missing out",
    "lmao": "laughing my ass off",
    "ttyl": "talk to you later",
    "icymi": "in case you missed it",
    "rn": "right now",
    "soml": "story of my life",
    "afk": "away from keyboard",
    "bae": "before anyone else",
    "cya": "see you",
    "diy": "do it yourself",
    "ftw": "for the win",
    "gg": "good game",
    "idc": "i don't care",
    "ilysm": "i love you so much",
    "jkl": "just kidding lol",
    "lmk": "let me know",
    "nbd": "no big deal",
    "nm": "not much",
    "nsfw": "not safe for work",
    "omw": "on my way",
    "ppl": "people",
    "tba": "to be announced",
    "tl;dr": "too long; didn't read",
    "wtf": "what the fuck",
    "wyd": "what you doing",
    "sus": "suspicious",
    "vibe": "feeling/atmosphere",
    "stan": "obsessive fan",
    "goat": "greatest of all time",
    "snacc": "attractive person",
    "lit": "exciting or amazing",
    "flex": "show off",
    "noob": "newbie",
    "savage": "bold or unapologetic",
    "mood": "current feeling",
    "clapback": "witty response",
    "thirsty": "desperate",
    "bop": "good song",
    "fam": "family or friends",
    "ship": "relationship",
    "big yikes": "major embarrassment",
    "cap": "lie",
    "no cap": "no lie",
    "bet": "sure or okay",
    "drip": "stylish",
    "cheugy": "outdated or uncool",
    "fit": "outfit",
    "go off": "express yourself",
    "heat": "high-quality content",
    "hundo p": "100 percent",
    "iykyk": "if you know, you know",
    "jomo": "joy of missing out",
    "l": "loss or failure",
    "simp": "overly attentive person",
    "snap": "snapchat",
    "slaps": "hits hard (good song)",
    "troll": "provoke online",
    "v": "very",
    "w": "win",
    "yeet": "throw/awesome",
    "lmao": "laughing my ass/arse off",
    "yo": "years old",
    "ya": "you",
}

# Dictionary of words to be replaced (slang words)
negative_slang_dict = {
    "kms": "kill myself",
    "kys": "kill yourself",
    "s/i": "self-injury",
    "c/t": "cutting",
    "od": "overdose",
    "ed": "eating disorder",
    "anx": "anxiety",
    "depr": "depression",
    "su": "suicidal",
    "bpd": "borderline personality disorder",
    "ts": "triggering",
    # "sad": "sadness",
    # "loner": "someone who feels isolated or alone",
    # "breakdown": "mental or emotional collapse",
    # "blackout": "loss of memory or consciousness",
    # "panic": "panic attack",
    # "meltdown": "emotional or mental breakdown",
    # "overthinking": "thinking too much about something",
    # "worthless": "feeling without value",
    # "hopeless": "feeling without hope",
    # "burnout": "extreme stress or exhaustion",
    # "self-loathing": "intense dislike of oneself",
    # "isolation": "feeling isolated or cut off from others",
    # "ghosted": "suddenly cut off communication",
    # "pain": "emotional or physical suffering",
    # "stress": "mental or emotional strain",
    # "crying": "shedding tears due to emotional pain",
    # "hurt": "feeling emotional or physical pain",
    # "miserable": "very unhappy or uncomfortable",
    # "broken": "feeling deeply hurt or defeated",
    # "trauma": "emotional response to a terrible event",
    # "regret": "feeling of sadness over past actions",
    # "grief": "deep sorrow, especially caused by death",
    # "shame": "feeling of guilt or disgrace",
    # "guilt": "feeling responsible for a wrongdoing"
}


URL_replacements = {
    "https://www.reddit.com/r/SuicideWatch/comments/jkf5bw/why_my_boyfriend_would_suggest_this_if_he_loves_me/": "Why my boyfriend would suggest this if he loves me.",
    "https://www.reddit.com/r/depression/comments/6izgy2/i_feel_hopeless_and_want_to_die/": "I Feel hopeless and want to die.",
    "https://www.reddit.com/r/AskReddit/comments/n1vroe/serious\\_redditors\\_who\\_have\\_lost\\_someone\\_to/](https://www.reddit.com/r/AskReddit/comments/n1vroe/serious_redditors_who_have_lost_someone_to/))": "[Serious] Redditors who have lost someone to suicide, what was it like?",
    "https://imgur.com/a/EqLEah7": "Recent therapy notes. I have since quit my job and haven’t left my apartment in three days.",
    "https://www.reddit.com/r/SuicideWatch/comments/jhy5yh/please_read_i_am_sad_not_suicidal_but_still/": "Please read. I am sad not suicidal but still..",
    "https://www.reddit.com/r/relationship_advice/comments/jcumcc/i_need_his_affection_and_love_but_i_dont_know_how/": "I need his affection and love but I don't know how to Express it right now...please read",
    "https://www.reddit.com/r/SuicideWatch/comments/kfpjrv/what_are_the_conditions_for_not_feeling_finegood/?utm_source=share&amp;utm_medium=ios_app&amp;utm_name=iossmf": "What are the conditions for not feeling fine/good about death?",
    "https://www.reddit.com/r/SuicideWatch/comments/knffgq/accidental_unknowing_rapist_and_i_cant_live_with/?utm_source=share&amp;utm_medium=ios_app&amp;utm_name=iossmf)": '"accidental, unknowing rapist" and i cant live with myself.',
    "https://www.reddit.com/r/SuicideWatch/comments/jja9xt/i_cant_decide_if_i_want_to_wait_until_the_us/?utm_source=share&amp;utm_medium=ios_app&amp;utm_name=iossmf": "i cant decide if i want to wait until the US election results, or if i should just do it now because either way everything that is already shit is only going to get shittier",
}

# Function to replace nonsensically consecutive characters
def normalize_text(text):
    # Replace multiple consecutive parts of the same word
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

    # Replace multiple repeated combinations of the same word
    text = re.sub(r'\b(\w+ \w+)( \1)+\b', r'\1', text)

    # Replace words that contain the same character three or more times in a row
    return re.sub(r'(.)\1{2,}', r'\1', text)

def drop_none_text(df):
    df['length'] = df['post'].apply(len)
    df = df[df['length']!=0]
    df = df.drop(columns='length')
    return df

def is_meaningful_word(word):
    # Check if the word exists in the dictionary of English words.
    return word.lower() in english_words

def extract_meaningful_text(text):
    # Regular expressions split sentences
    sentences = re.split(r'\. |\? |\! ', text)
    meaningful_sentences = []

    for sentence in sentences:
        words_in_sentence = sentence.split()
        if all(is_meaningful_word(word) or word.isdigit() for word in words_in_sentence):
            meaningful_sentences.append(sentence)

    return '. '.join(meaningful_sentences)

def extract_only_number_text(text):
    text = text.replace('wouldk', 'would').replace('.,', '.').replace('. ,', '.').replace(' .', '.').replace('h it', 'hit')
    text = text.replace('xcuse me', 'excuse me').replace('wer elearning', 'we are learning').replace('gonan', 'gonna').replace('nught', 'night').replace('hi mto', 'him to')
    text = text.replace('wasnt', 'was not').replace('wouldve', 'would have').replace('Https/imgur. Com/a/t3wtiov', '').replace('didnt', 'did not').replace('youve', 'you have')
    text = text.replace('4kdjejdne sjdjd fuckdjejekieirieiriodidieiridodiodididodidookdkekekdkoeooeoi', 'I').replace('Soi nherently unlikeable.', 'So I am inherently unlikeable.')
    text = text.replace('couldve', 'could have').replace('allcmy xanax and effexo', 'all my xanax and effexor')
    text = text.replace('Hope my mom callsme selfieh whem shensees my dewdnbody I am the morjimg.', 'Hope my mom calls me selfish when she sees my dead body in the morning.')
    text = text.replace('scaared', 'scared').replace('fwiled', 'failed').replace('wxpecter', 'expected').replace('snd', 'and').replace('oiving', 'living')
    text = text.replace('awwy', 'away').replace('cwre', 'care').replace('ijust', 'I just').replace('Ppease', 'Please').replace('juet', 'just')
    text = text.replace('wwnt', 'want').replace('tefuse', 'refuse').replace('tovkill', 'to kill').replace('callsme', 'calls me').replace('shensees', 'she sees')
    text = text.replace('I just turned 17 band', 'I just turned 17 and')
    text = text.replace('callckyself', 'call myself').replace('sngry', 'angry').replace('tlaking', 'talking')
    text = text.replace('howckuch', 'how much').replace('hebstarted', 'he started').replace('talkikgnsbout', 'talking about')
    text = text.replace('hownimnalmost', 'how I am almost').replace('almpdt', 'almost').replace('accountnto', 'account to')
    text = text.replace('mynfriend', 'my friend').replace('killijgnmyself', 'killing myself').replace('willcbe', 'will be')
    text = text.replace('awrulclife', 'awful life').replace('willcbe', 'will be').replace('youre', 'you are')
    text = text.replace('6 mo on', '6 months on').replace('knw', 'know').replace('an yone', 'anyone').replace('wont', 'will not').replace('dont', 'do not').replace('ever yones', 'everyones')
    text = text.replace('ever yone', 'everyone').replace('overdosibg', 'overdosing').replace('temme', 'tell me')

    # Regular expressions remove meaningless strings and extract meaningful sentences
    sentences = re.split(r'\. |\? |\! ', text)
    meaningful_sentences = [sentence for sentence in sentences if re.search(r'[a-zA-Z]', sentence)]
    return '. '.join(meaningful_sentences)

def preprosessing_text(df, url_pattern):

    # Replacing URLs
    # df = replace_words(df, 'post', URL_replacements)

    # Delete lines containing URLs
    df = confirm_URL(df, url_pattern)

    # Correcting meaningless consecutive words
    df['post'] = df['post'].apply(normalize_text)

    # Text Processing
    df['post'] = df['post'].apply(text_conversion)

    # Perform replacements
    df = replace_words(df, 'post', replacements)
    df = replace_words(df, 'post', detailed_replacements)
    df = replace_words(df, 'post', slang_dict)
    df = replace_words(df, 'post', negative_slang_dict)

    # Capitalize the first letter of each sentence
    df['post'] = df['post'].apply(capitalize_sentences_initial)
    df['post'] = df['post'].apply(capitalize_i)

    df = drop_none_text(df)

    # Remove unintelligible words
    # df['post'] = df['post'].apply(extract_meaningful_text)

    # Delete sentences with only numbers
    df['post'] = df['post'].apply(extract_only_number_text)

    return df

def text_sort(df):
    # Sort by length of sentences in POST column
    df['length'] = df['post'].apply(len)
    df = df[df['length']!=0]
    df = df.sort_values(by='length').reset_index(drop=True)
    df = df.drop(columns='length')

    return df

Exception in thread Thread-5 (attachment_entry):
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/debugpy/server/api.py", line 237, in listen
    sock, _ = endpoints_listener.accept()
  File "/usr/lib/python3.10/socket.py", line 293, in accept
    fd, addr = self._accept()
TimeoutError: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/google/colab/_debugpy.py", line 52, in attachment_entry
    debugpy.listen(_dap_port)
  File "/usr/local/lib/python3.10/dist-packages/debugpy/public_api.py", line 31, in wrapper
    return wrapped(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/debugpy/server/api.py", line 143, in debug
    log.reraise

In [2]:
# !pip install jaconv



In [3]:
def simple_capitalize_text(text):
    text = text.lower()
    # Remove white space at the beginning and end of sentences
    text = text.strip()

    # Replace consecutive spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\n', '').replace('\r', '').replace('\t', '').replace('\\n', '').replace('\\r', '').replace('\\t', '').replace('\\', '')
    text = re.sub(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+\$,%#]+)", "" ,text)
    text = text.replace('\n', '').replace('\r', '').replace('\t', '').replace('\\n', '').replace('\\r', '').replace('\\t', '').replace('\\', '')
    return text

def simple_fix_text(df, url_pattern):
    df = confirm_URL(df, url_pattern)
    df['post'] = df['post'].apply(simple_capitalize_text)
    return df

In [10]:
import torch
import torch.nn as nn
from transformers.trainer_utils import set_seed
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, AutoModel, AdamW
import numpy as np
import pandas as pd
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from pprint import pprint
from datasets import Dataset
from typing import Union
from transformers import BatchEncoding, EarlyStoppingCallback
from collections import Counter
import os
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import random
import json

def set_random_seed(seed: int = 42):
    set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Settings for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_random_seed(42)

def report_memory():
    print(f"Allocated: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MiB")
    print(f"Cached: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MiB")

def cleanup_gpu_memory():
    """
    Function to empty GPU cache, reset CUDA memory, and display memory usage.
    """
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()

    print("After cleanup:")
    report_memory()

def make_vector_data(model, tokenizer, df, device, dir, date, kind):
    if not os.path.exists(dir):
        os.makedirs(dir)

    # Execute 10 rows at a time
    num_rows_per_df = 10
    dfs = [df.iloc[i:i + num_rows_per_df] for i in range(0, len(df), num_rows_per_df)]

    print('Start Reasoning')

    # List to store data frames after processing
    processed_dfs = []
    submit_dfs = []

    max_length = tokenizer.model_max_length

    print('model_max_length:', max_length)

    for i, df_part in enumerate(dfs):
        set_random_seed(42)

        df_part = df_part.reset_index(drop=True)

        data_list = df_part['post'].values.tolist()
        inputs = tokenizer(data_list, return_tensors='pt', max_length=4096, truncation=True, padding='longest')

        # Transfers input data to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
            # 4 vector extraction near the final layer
            final_layer_vectors = torch.cat([outputs["hidden_states"][-1*i][:,0] for i in range(1, 4+1)], dim=1)
            print(f"Shape of final_layer_vectors_{kind}: {final_layer_vectors.shape}")

        # mean_vectors = final_layer_vectors.mean(dim=1).cpu().numpy()
        final_layer_vectors = final_layer_vectors.cpu().numpy()
        df_vec = pd.DataFrame(final_layer_vectors).reset_index(drop=True)

        if kind == 'test':
            pass
        else:
            df_vec['post_risk'] = df_part['post_risk']
            print('')
            print('Completion of vector extraction for training and evaluation data')

        ################################################################################################
        del data_list, inputs, final_layer_vectors
        cleanup_gpu_memory()
        ################################################################################################

        # Obtaining prediction result
        logits = outputs.logits
        pred = F.softmax(logits, dim=-1)
        df_pred = pd.DataFrame(pred.cpu().numpy(), columns=['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']).reset_index(drop=True)

        result_df = pd.DataFrame(pred.cpu().numpy().argmax(axis=1), columns=['suicide risk']).reset_index(drop=True)

        df_merged = pd.concat([result_df, df_pred], axis=1)
        df_merged['post'] = df_part['post']
        df_merged = pd.concat([df_merged, df_vec], axis=1)
        submit_dfs.append(df_merged)

        ################################################################################################
        del logits, outputs, pred, df_pred, result_df, df_part, df_merged, df_vec
        cleanup_gpu_memory()
        ################################################################################################

    df_submit = pd.concat(submit_dfs, ignore_index=True)
    df_submit.to_csv(f'{dir}/submission_mental_LongFormer_{kind}_{date}.csv', float_format='%.30f')

    ################################################################################################
    del submit_dfs, df_submit, dfs
    cleanup_gpu_memory()
    ################################################################################################


In [5]:
# !pip install typing

In [11]:
from sklearn.model_selection import train_test_split
import pandas as pd

set_random_seed(42)

# Code Execution Date
save_date = 'final'

date_complex = f'{save_date}_complex'
date_simple = f'{save_date}_simple'

num_labels = 4

dir_complex = f"results/{save_date}/{date_complex}"
dir_simple = f"results/{save_date}/{date_simple}"

set_random_seed(42)

df_test = pd.read_excel('./test_100_label_competition.xlsx') # setting your test data
df_test_complex = preprosessing_text(df_test, url_pattern)
df_test_simple = simple_fix_text(df_test, url_pattern)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Completion of data preparation')

cv_list1 = [2, 5]
cv_list2 = [4]

# Complex
for cv in cv_list1:

    model_dir_cv = f'./gdrive/MyDrive/IEEE Bigdata 2024/1mukumuku/mukumuku/models/model_mental_LongFormer_Smoothing_FocalLoss_ClassBalancedLoss_20240816_cv_{cv}_complex'
    date_cv = f'{save_date}_{cv}_complex'
    dir_cv = f"results/{save_date}/{date_cv}"

    ## -- Prediction -- ##
    model = (AutoModelForSequenceClassification
          .from_pretrained(model_dir_cv, num_labels=num_labels)
          .to(device))
    tokenizer = AutoTokenizer.from_pretrained(model_dir_cv)
    model.eval()

    dir_cv = f"results/{save_date}/{date_cv}"

    make_vector_data(model, tokenizer, df_test_complex, device, dir_cv, date_cv, 'test')

    ################################################################################################
    del model, tokenizer
    cleanup_gpu_memory()
    ################################################################################################

# Simple
for cv in cv_list1:

    model_dir_cv = f'./gdrive/MyDrive/IEEE Bigdata 2024/1mukumuku/mukumuku/models/model_mental_LongFormer_Smoothing_FocalLoss_ClassBalancedLoss_20240816_cv_{cv}_simple'
    date_cv = f'{save_date}_{cv}_simple'
    dir_cv = f"results/{save_date}/{date_cv}"

    ## -- Prediction -- ##
    model = (AutoModelForSequenceClassification
          .from_pretrained(model_dir_cv, num_labels=num_labels)
          .to(device))
    tokenizer = AutoTokenizer.from_pretrained(model_dir_cv)
    model.eval()

    dir_cv = f"results/{save_date}/{date_cv}"

    make_vector_data(model, tokenizer, df_test_simple, device, dir_cv, date_cv, 'test')

    ################################################################################################
    del model, tokenizer
    cleanup_gpu_memory()
    ################################################################################################

# simple
for cv in cv_list2:

    model_dir_cv = f'./gdrive/MyDrive/IEEE Bigdata 2024/1mukumuku/mukumuku/models/model_mental_LongFormer_Smoothing_FocalLoss_ClassBalancedLoss_20240829_cv_{cv}_simple'
    date_cv = f'{save_date}_{cv}_simple'
    dir_cv = f"results/{save_date}/{date_cv}"

    ## -- Prediction -- ##
    model = (AutoModelForSequenceClassification
          .from_pretrained(model_dir_cv, num_labels=num_labels)
          .to(device))
    tokenizer = AutoTokenizer.from_pretrained(model_dir_cv)
    model.eval()

    dir_cv = f"results/{save_date}/{date_cv}"

    make_vector_data(model, tokenizer, df_test_simple, device, dir_cv, date_cv, 'test')

    ################################################################################################
    del model, tokenizer
    cleanup_gpu_memory()
    ################################################################################################

Completion of data preparation
Start Reasoning
model_max_length: 4096
Shape of final_layer_vectors_test: torch.Size([10, 3072])


RuntimeError: invalid argument to reset_peak_memory_stats

In [None]:
# Complex
for cv in cv_list1:

    sub_dir = f'submits/{save_date}'

    print(sub_dir)
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)

    date_cv = f'{save_date}_{cv}_complex'
    dir_cv = f"results/{save_date}/{date_cv}"

    ## -- Summarize -- ##
    df_sub1 = pd.read_csv(f'{dir_cv}/submission_mental_LongFormer_test_{date_cv}.csv')

    df_sub1 = df_sub1[['suicide risk', 'Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']]
    df_sub1.reset_index(drop=False, inplace=True)
    df_sub1.rename(columns={'index': 'index'}, inplace=True)
    df_sub1['probability_distribution'] = df_sub1[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']].values.tolist()
    df_sub1 = df_sub1[['index', 'suicide risk', 'probability_distribution']]

    re_conversion_dict = {
        0: 'indicator',
        1: 'ideation',
        2: 'behavior',
        3: 'attempt'
    }

    df_sub1['suicide risk'] = df_sub1['suicide risk'].map(re_conversion_dict)
    print(df_sub1.shape)
    display(df_sub1)

    df_sub1.to_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_model_{date_cv}.xlsx', float_format='%.30f', index=False)

# Simple
for cv in cv_list2:

    sub_dir = f'submits/{save_date}'

    print(sub_dir)
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)

    date_cv = f'{save_date}_{cv}_simple'
    dir_cv = f"results/{save_date}/{date_cv}"

    ## -- Summarize -- ##
    df_sub1 = pd.read_csv(f'{dir_cv}/submission_mental_LongFormer_test_{date_cv}.csv')

    df_sub1 = df_sub1[['suicide risk', 'Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']]
    df_sub1.reset_index(drop=False, inplace=True)
    df_sub1.rename(columns={'index': 'index'}, inplace=True)
    df_sub1['probability_distribution'] = df_sub1[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']].values.tolist()
    df_sub1 = df_sub1[['index', 'suicide risk', 'probability_distribution']]

    re_conversion_dict = {
        0: 'indicator',
        1: 'ideation',
        2: 'behavior',
        3: 'attempt'
    }

    df_sub1['suicide risk'] = df_sub1['suicide risk'].map(re_conversion_dict)
    print(df_sub1.shape)
    display(df_sub1)

    df_sub1.to_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_model_{date_cv}.xlsx', float_format='%.30f', index=False)

submits/final
(100, 3)


Unnamed: 0,index,suicide risk,probability_distribution
0,0,indicator,"[0.7350791096687316, 0.2629762887954711, 0.001..."
1,1,attempt,"[0.0007071977597661, 0.001842537545599, 0.2968..."
2,2,indicator,"[0.7391923666000366, 0.2587279379367828, 0.001..."
3,3,behavior,"[0.0008905654540285, 0.1904413998126983, 0.625..."
4,4,indicator,"[0.7346532940864562, 0.2614104151725769, 0.002..."
...,...,...,...
95,95,attempt,"[0.0007090165745466, 0.0017409325810149, 0.302..."
96,96,attempt,"[0.0006822702125646, 0.0017445855773985, 0.309..."
97,97,indicator,"[0.5374879240989685, 0.4417671561241149, 0.020..."
98,98,ideation,"[0.1815739870071411, 0.6152970790863037, 0.202..."


submits/final
(100, 3)


Unnamed: 0,index,suicide risk,probability_distribution
0,0,indicator,"[0.707390308380127, 0.2907892167568206, 0.0011..."
1,1,attempt,"[0.0019300078274682, 0.0024791646283119, 0.323..."
2,2,indicator,"[0.7122471928596495, 0.2860381007194519, 0.001..."
3,3,behavior,"[0.0010001907357946, 0.2002476602792739, 0.607..."
4,4,indicator,"[0.7102662324905396, 0.2877483069896697, 0.001..."
...,...,...,...
95,95,attempt,"[0.0008872742764651, 0.0020625968463718, 0.325..."
96,96,attempt,"[0.0010700814891606, 0.0020848689600825, 0.323..."
97,97,ideation,"[0.2280506491661071, 0.6068233251571655, 0.164..."
98,98,ideation,"[0.1970385462045669, 0.6153063178062438, 0.186..."


submits/final
(100, 3)


Unnamed: 0,index,suicide risk,probability_distribution
0,0,indicator,"[0.645256519317627, 0.3469415605068206, 0.0070..."
1,1,attempt,"[0.0039697233587503, 0.0050305677577853, 0.265..."
2,2,indicator,"[0.6604900956153869, 0.3342711627483367, 0.004..."
3,3,behavior,"[0.0032813425641506, 0.2248504757881164, 0.631..."
4,4,indicator,"[0.6896944046020507, 0.302069067955017, 0.0049..."
...,...,...,...
95,95,attempt,"[0.0029338798485696, 0.0056213098578155, 0.272..."
96,96,attempt,"[0.0075995665974915, 0.0057050455361604, 0.270..."
97,97,ideation,"[0.1935986280441284, 0.5960878133773803, 0.208..."
98,98,ideation,"[0.1679004281759262, 0.6012853980064392, 0.229..."


# TabPFN

In [None]:
from tabpfn.scripts.transformer_prediction_interface import TabPFNClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import torch
from sklearn.model_selection import train_test_split, StratifiedKFold

set_random_seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"

use_col_train = ['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3', 'post_risk']
use_col_test = ['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def prediction_TabPFN_simple(use_col_train, use_col_test, date, sub_dir, n):

    df_train_vector_simple = pd.read_csv(f'data/20240817_2_simple/submission_mental_LongFormer_train_20240817_2_simple.csv', index_col=0)
    df_test_vector_simple = pd.read_csv(f'results/{save_date}/{save_date}_{n}_simple/submission_mental_LongFormer_test_{save_date}_{n}_simple.csv', index_col=0)

    ## -- Summarize -- ##
    df_train_vector = df_train_vector_simple[use_col_train].reset_index(drop=True)
    df_test_vector = df_test_vector_simple[use_col_test].reset_index(drop=True)

    data_set = df_train_vector.drop('post_risk', axis=1).reset_index(drop=True)
    target_set = df_train_vector['post_risk'].reset_index(drop=True)

    cvs = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    bst_scr = 0

    cv_results = []

    for fold, (train_index, test_index) in enumerate(skf.split(data_set, target_set)):

        cvs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        bst_scr = 0

        print(f'Fold {fold + 1}')

        X_train, X_valid = data_set.iloc[train_index], data_set.iloc[test_index]
        y_train, y_valid = target_set.iloc[train_index], target_set.iloc[test_index]

        for cv in cvs:
            TabPFN_classifier = TabPFNClassifier(device = device, N_ensemble_configurations=cv)
            TabPFN_classifier.fit(X_train, y_train)
            y_pred = TabPFN_classifier.predict(X_valid)
            F1 = f1_score(y_valid, y_pred, average='weighted')

            if bst_scr < F1:
                use_cv = cv
                bst_scr = F1

        print('Best N_ensemble_configurations:', use_cv)
        print('Best F1:', bst_scr)

        # Creating Submission Files
        X_train = df_train_vector.drop('post_risk', axis=1)
        y_train = df_train_vector['post_risk']
        X_test = df_test_vector

        TabPFN_classifier = TabPFNClassifier(device = device, N_ensemble_configurations=use_cv)
        TabPFN_classifier.fit(X_train, y_train)

        y_pred_proba = TabPFN_classifier.predict_proba(X_test)
        y_pred = TabPFN_classifier.predict(X_test)

        df_sub = pd.DataFrame(y_pred_proba, columns=[f'Pred_class_{i}' for i in range(y_pred_proba.shape[1])])
        cv_results.append(df_sub)

    # Combine the results of each CV
    all_cv_results = pd.concat(cv_results, axis=0)

    # Calculate the average of the results
    average_result = all_cv_results.groupby(all_cv_results.index).mean()

    conditions = [
        (average_result['Pred_class_0'] > average_result[['Pred_class_1', 'Pred_class_2', 'Pred_class_3']].max(axis=1)),
        (average_result['Pred_class_1'] > average_result[['Pred_class_0', 'Pred_class_2', 'Pred_class_3']].max(axis=1)),
        (average_result['Pred_class_2'] > average_result[['Pred_class_0', 'Pred_class_1', 'Pred_class_3']].max(axis=1)),
        (average_result['Pred_class_3'] > average_result[['Pred_class_0', 'Pred_class_1', 'Pred_class_2']].max(axis=1))
    ]
    choices = ['indicator', 'ideation', 'behavior', 'attempt']

    average_result['suicide risk'] = np.select(conditions, choices, default='unknown')

    average_result = average_result[['suicide risk', 'Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']]
    average_result.reset_index(drop=False, inplace=True)
    average_result.rename(columns={'index': 'index'}, inplace=True)
    average_result['probability_distribution'] = average_result[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']].values.tolist()
    average_result = average_result[['index', 'suicide risk', 'probability_distribution']]

    print(average_result.shape)
    display(average_result)

    average_result.to_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_TabPFN_{date}.xlsx', float_format='%.30f', index=False)

date = f'{save_date}_2_simple'
prediction_TabPFN_simple(use_col_train, use_col_test, date, sub_dir, 2)

Fold 1


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)


Best N_ensemble_configurations: 6
Best F1: 0.9032678491957531


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)


Fold 2


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)


Best N_ensemble_configurations: 1
Best F1: 0.9108436435359514
Fold 3


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)


Best N_ensemble_configurations: 1
Best F1: 0.9080839548924655
Fold 4


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return 

Best N_ensemble_configurations: 5
Best F1: 0.9228830067539746


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)


Fold 5


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)
  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)


Best N_ensemble_configurations: 1
Best F1: 0.9520859937861826
(100, 3)


  with torch.cuda.amp.autocast(enabled=fp16_inference):
  return fn(*args, **kwargs)


Unnamed: 0,index,suicide risk,probability_distribution
0,0,indicator,"[0.9799255132675171, 0.01884634420275688, 0.00..."
1,1,attempt,"[0.0021039159037172794, 0.00040922313928604126..."
2,2,indicator,"[0.9827612042427063, 0.01612253487110138, 0.00..."
3,3,behavior,"[0.0008793595479801297, 0.00705487746745348, 0..."
4,4,indicator,"[0.9832169413566589, 0.01566879265010357, 0.00..."
...,...,...,...
95,95,attempt,"[0.0021962837781757116, 0.00032488530268892646..."
96,96,attempt,"[0.002221649279817939, 0.00046222168020904064,..."
97,97,ideation,"[0.001271538552828133, 0.9947392344474792, 0.0..."
98,98,ideation,"[0.0009504712070338428, 0.9970412254333496, 0...."


# Final result

In [None]:
import os
import ast
import numpy as np

df1 = pd.read_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_model_{save_date}_2_complex.xlsx') # 0.7326
df2 = pd.read_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_model_{save_date}_5_complex.xlsx') # 0.7349
df3 = pd.read_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_TabPFN_{save_date}_2_simple.xlsx') # 0.7359
df4 = pd.read_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_model_{save_date}_4_simple.xlsx') # 0.7326

df1.set_index('index', inplace=True)
df2.set_index('index', inplace=True)
df3.set_index('index', inplace=True)
df4.set_index('index', inplace=True)

df1['probability_distribution'] = df1['probability_distribution'].apply(ast.literal_eval)
df2['probability_distribution'] = df2['probability_distribution'].apply(ast.literal_eval)
df3['probability_distribution'] = df3['probability_distribution'].apply(ast.literal_eval)
df4['probability_distribution'] = df4['probability_distribution'].apply(ast.literal_eval)

df1[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']] = pd.DataFrame(df1['probability_distribution'].tolist(), index=df1.index)
df2[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']] = pd.DataFrame(df2['probability_distribution'].tolist(), index=df2.index)
df3[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']] = pd.DataFrame(df3['probability_distribution'].tolist(), index=df3.index)
df4[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']] = pd.DataFrame(df4['probability_distribution'].tolist(), index=df4.index)

In [None]:
df_avg = (df1[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']] + df2[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']]
         + df3[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']] + df4[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']]) / 4


# Store the name of the class with the highest predicted probability in the SUICIDE RISK column
conditions = [
    (df_avg['Pred_class_0'] > df_avg[['Pred_class_1', 'Pred_class_2', 'Pred_class_3']].max(axis=1)),
    (df_avg['Pred_class_1'] > df_avg[['Pred_class_0', 'Pred_class_2', 'Pred_class_3']].max(axis=1)),
    (df_avg['Pred_class_2'] > df_avg[['Pred_class_0', 'Pred_class_1', 'Pred_class_3']].max(axis=1)),
    (df_avg['Pred_class_3'] > df_avg[['Pred_class_0', 'Pred_class_1', 'Pred_class_2']].max(axis=1))
]
choices = ['indicator', 'ideation', 'behavior', 'attempt']

df_avg['suicide risk'] = np.select(conditions, choices, default='unknown')

df_avg = df_avg[['suicide risk', 'Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']]
df_avg.reset_index(drop=False, inplace=True)
df_avg.rename(columns={'index': 'index'}, inplace=True)
df_avg['probability_distribution'] = df_avg[['Pred_class_0', 'Pred_class_1', 'Pred_class_2', 'Pred_class_3']].values.tolist()
df_avg = df_avg[['index', 'suicide risk', 'probability_distribution']]

df_avg.to_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_CV_simple_complex_final.xlsx', float_format='%.30f', index=False)
display(df_avg)

Unnamed: 0,index,suicide risk,probability_distribution
0,0,indicator,"[0.7669128626585007, 0.2298883525654673, 0.002..."
1,1,attempt,"[0.0021777112124254697, 0.00244037326774556, 0..."
2,2,indicator,"[0.7736727148294449, 0.2237899340689182, 0.001..."
3,3,behavior,"[0.0015128645754884573, 0.15564860333688552, 0..."
4,4,indicator,"[0.7794577181339264, 0.21672414569184179, 0.00..."
...,...,...,...
95,95,attempt,"[0.001681613619439253, 0.0024374311469727815, ..."
96,96,attempt,"[0.00289339189475866, 0.00249918043846261, 0.2..."
97,97,ideation,"[0.24010218496550803, 0.659854382276535, 0.099..."
98,98,ideation,"[0.13686585814866703, 0.7072325050830841, 0.15..."


# Reproducibility Confirmation

In [None]:
df_confirm = pd.read_excel('data/confirm_data/mukumuku_submission_mental_LongFormer_CV_simple_complex_top4.xlsx')

df_result = pd.read_excel(f'{sub_dir}/mukumuku_submission_mental_LongFormer_CV_simple_complex_final.xlsx')

different_rows = df_confirm[df_confirm['suicide risk'] != df_result['suicide risk']]

# Reproducibility Confirmation
if different_rows.empty:
    print("Success!!")
else:
    print(different_rows)

Success!!
