# Setup

In [None]:
!pip install praw openai transformers numpy requests nlpaug nltk>=3.4.5

In [None]:
import praw
import pandas as pd

# Create Dataset

In [None]:
# Read-only Reddit client. Credentials are taken from environment variables so
# real secrets never have to be pasted into (and saved with) the notebook; the
# empty-string fallbacks keep the original placeholder behaviour.
import os

reddit_read_only = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),  # your client id
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),  # your client secret
    user_agent=os.environ.get("REDDIT_USER_AGENT", ""),  # your user agent
)

In [None]:
# Quick connectivity check: print the titles of five "hot" posts from
# r/offmychest, each followed by an empty line.
subreddit = reddit_read_only.subreddit("offmychest")

for hot_post in subreddit.hot(limit=5):
    print(hot_post.title, end="\n\n")

In [None]:
# Fetch up to 1000 "top" posts of the subreddit selected above. No time_filter
# is passed to top(), so PRAW's default window applies (all-time according to
# the PRAW docs — TODO confirm) rather than only the current month.
posts = subreddit.top(limit=1000)

# Output column name -> attribute of the PRAW submission that fills it.
column_to_attr = {
    "Title": "title",
    "Post Text": "selftext",
    "ID": "id",
    "Score": "score",
    "Total Comments": "num_comments",
    "Post URL": "url",
}
posts_dict = {column: [] for column in column_to_attr}

for post in posts:
    for column, attr in column_to_attr.items():
        posts_dict[column].append(getattr(post, attr))

top_posts = pd.DataFrame(posts_dict)
top_posts

In [None]:
# Count, per column, the scraped posts whose body is at least 200 characters.
lengths = top_posts["Post Text"].str.len()
is_long = lengths.ge(200)
long_posts = top_posts.loc[is_long]
long_posts.count()

In [None]:
def get_posts_from_subreddit(subreddits, limit):
    """Collect the "top" posts of several subreddits into column lists.

    Args:
        subreddits: Iterable of subreddit names (without the "r/" prefix).
        limit: Maximum number of posts requested from each subreddit.

    Returns:
        A dict with the keys "id", "title", "text" and "subreddit", each
        mapping to a list with one entry per fetched post.

    Uses the module-level ``reddit_read_only`` PRAW client.
    """
    columns = ("id", "title", "text", "subreddit")
    posts_dict = {column: [] for column in columns}

    for subreddit_name in subreddits:
        listing = reddit_read_only.subreddit(subreddit_name).top(limit=limit)

        for post in listing:
            record = (post.id, post.title, post.selftext, subreddit_name)
            for column, value in zip(columns, record):
                posts_dict[column].append(value)

    return posts_dict


In [None]:
# Subreddits to scrape and the per-subreddit cap on "top" posts.
subreddits = [
    "offmychest",
    "DecidingToBeBetter",
    "TrueOffMyChest",
    "mentalhealth",
    "relationships",
]
limit = 1000

reddit_posts = get_posts_from_subreddit(subreddits=subreddits, limit=limit)

In [None]:
# Turn the column -> list mapping returned by the scraper into a DataFrame
# and display it.
reddit_posts_df = pd.DataFrame.from_dict(reddit_posts)
reddit_posts_df

In [None]:
# Persist the raw scrape as a tab-separated file. to_csv does not create
# missing parent directories, so make sure "data/" exists first.
import os

os.makedirs("data", exist_ok=True)
reddit_posts_df.to_csv("data/raw_data.csv", sep='\t', encoding='utf-8', index=False, header=True)

# Data Preprocessing

In [None]:
# Reload the raw scrape from the tab-separated file written earlier, so the
# preprocessing steps can run without re-querying Reddit.
reddit_posts_df = pd.read_table("data/raw_data.csv", sep='\t')
reddit_posts_df

In [None]:
import re

# Patterns are compiled once at definition time and reused for every post.
_URL_RE = re.compile(r"http\S+")
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_for_baseline(text):
    """Normalise a post for the bag-of-words baseline.

    Lower-cases ``text``, removes every token starting with "http", drops all
    characters other than a-z and whitespace, and collapses whitespace runs
    into single spaces (with no leading/trailing whitespace).

    Args:
        text: The raw post text (a ``str``).

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    without_urls = _URL_RE.sub("", lowered)
    letters_only = _NON_LETTER_RE.sub("", without_urls)
    return _WHITESPACE_RE.sub(" ", letters_only).strip()

In [None]:
# Build the cleaned frame from the raw scrape:
#   * drop posts without a body,
#   * drop bodies that are only a moderation placeholder such as "[removed]" /
#     "[deleted]" (anchored match: a substring test on "removed|deleted" would
#     also discard genuine posts that merely contain those words),
#   * keep only bodies of at least 200 characters.
post_text = reddit_posts_df["text"]
is_placeholder = post_text.str.match(
    r"^\s*\[\s*(?:removed|deleted)[^\]]*\]\s*$", case=False, na=False
)
keep_rows = post_text.notnull() & ~is_placeholder & (post_text.str.len() >= 200)

# Explicit copy: a new column is added below, and writing into a filtered
# slice of reddit_posts_df would raise pandas' SettingWithCopyWarning.
cleaned_posts_df = reddit_posts_df[keep_rows].copy()

# Title and body together form the text that gets labelled; a missing title
# is treated as empty so full_text never becomes NaN.
cleaned_posts_df["full_text"] = cleaned_posts_df["title"].fillna("") + " " + cleaned_posts_df["text"]

In [None]:
def heuristic_label(text):
    """Assign a cognitive-distortion label by simple keyword spotting.

    The text is lower-cased and typographic apostrophes (’) are normalised to
    straight ones ('), so "it's my fault" matches regardless of which
    apostrophe the author typed. Rules are checked in order and the first
    rule with a keyword occurring anywhere in the text (substring match) wins.

    Args:
        text: The post text (a ``str``).

    Returns:
        One of "catastrophizing", "overgeneralization", "mind reading",
        "personalization", "emotional reasoning", or "none" when no keyword
        is found.
    """
    text = text.lower().replace("’", "'")

    # (label, keywords) pairs in priority order.
    rules = (
        ("catastrophizing", ("everything is ruined", "worst thing ever", "disaster")),
        ("overgeneralization", ("always", "never", "everyone", "no one")),
        ("mind reading", ("they think", "they must believe", "they want")),
        ("personalization", ("it's my fault", "i caused", "because of me")),
        ("emotional reasoning", ("i feel like", "i know it's true because i feel it")),
    )
    for label, keywords in rules:
        if any(kw in text for kw in keywords):
            return label
    return "none"

# Run the keyword heuristic over every post's combined title + body.
cleaned_posts_df['label'] = cleaned_posts_df['full_text'].map(heuristic_label)

In [None]:
# Posts the keyword heuristic could not label ("none"). Take an explicit copy:
# a "label" column is written into this frame further down, and assigning into
# a filtered slice of cleaned_posts_df raises pandas' SettingWithCopyWarning.
unlabeled_posts_df = cleaned_posts_df[cleaned_posts_df['label'] == "none"].copy()
unlabeled_posts_df

In [None]:
# from openai import OpenAI
# API_KEY = ""

# client = OpenAI(
#     # This is the default and can be omitted
#     api_key=API_KEY,
# )

# def get_label_with_zero_shot_gpt(text):
#   prompt = f"""
#   You are a CBT-trained therapist. Read the journal entry below and label it with one cognitive distortion from: Catastrophizing, Overgeneralization, Mind Reading, Personalization, Emotional Reasoning.
#   Return the response in one word if it fits none of the above labels return the word none.
#   Below is an additional description of each of these cognitive distortion labels.
#   Catastrophizing - When individuals consistently imagine the worst possible outcome in a situation, even when it's unlikely or improbable. This can lead to heightened anxiety, fear, and a sense of being overwhelmed. It's essentially a form of negative thinking where minor issues are blown out of proportion, and the focus is solely on negative possibilities.
#   Overgeneralization - When major conclusions are drawn based on limited information, or some large group is said to have same behavior or property. For example: “one nurse was rude to me, this means all medical staff must be rude.” or “last time I was in the pool I almost drowned, I am a terrible swimmer and should not go into the water again”.
#   Mind Reading - Any evidence of the speaker suspecting what others are thinking or what are the motivations behind their actions. Statements like “they won’t understand”, “they dislike me” suggest mind reading distortion. However, “she said she dislikes me” is not a distortion, but “I think she dislikes me since she ignored me” is again mind reading distortion (since it is based on assumption that you know why someone behaved in a certain way).
#   Personalization - Personalizing or taking up the blame for a situation which is not directly related to the speaker. This could also be assigning the blame to someone who was not responsible for the situation that in reality involved many factors and was out of your/the person’s control. The first entry in the sample is a good example for this.
#   Emotional Reasoning - Basically, this distortion can be summed up as - “If I feel that way, it must be true.” Whatever a person is feeling is believed to be true automatically and unconditionally. One of the most common representation of this is some variation of – ‘I feel like a failure so I must be a failure’. It does not always have to be about the speaker themselves, “I feel like he is not being honest with me, he must be hiding something” is also an example of emotional reasoning.

#   Text: "{text}"
#   Label:"""

#   response = client.chat.completions.create(
#         model="gpt-4",
#         messages=[{"role": "user", "content": prompt}],
#         temperature=0
#     )

#   return response.choices[0].message.content.strip().lower()

In [None]:
from transformers import pipeline
import torch

# Run on the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also works on machines without CUDA.
bart_device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli", device=bart_device)

# Distortion name -> full-sentence hypothesis handed to the zero-shot model
distortions = {
    "Catastrophizing": "This post contains catastrophizing, where the speaker imagines the worst possible outcome, even when unlikely.",
    "Overgeneralization": "This post contains overgeneralization, drawing broad conclusions from limited events or examples.",
    "Mind Reading": "This post involves mind reading, where the speaker assumes others' thoughts or motives without proof.",
    "Personalization": "This post contains personalization, where the speaker blames themselves for events outside their control.",
    "Emotional Reasoning": "This post shows emotional reasoning, treating emotions as facts (e.g., 'I feel it, so it must be true')."
}

# Parallel lists: hypotheses[i] is the sentence describing labels[i]
hypotheses = list(distortions.values())
labels = list(distortions.keys())


def get_label_with_bart(text):
    """Label ``text`` with the best-scoring distortion from the zero-shot model.

    Uses the module-level ``classifier`` pipeline with the module-level
    ``hypotheses`` as candidate labels and maps the winning hypothesis back to
    its short name through the parallel ``labels`` list.

    Args:
        text: The post text to classify.

    Returns:
        The lower-cased distortion name (e.g. "mind reading"). A label is
        always returned; there is no "none" outcome.
    """
    # The candidates are already complete sentences, so use them verbatim as
    # the NLI hypothesis instead of wrapping them in the pipeline's default
    # "This example is {}." template.
    result = classifier(text, hypotheses, hypothesis_template="{}", multi_label=False)
    best_hypothesis = result['labels'][0]  # first entry is read as the top-ranked candidate
    return labels[hypotheses.index(best_hypothesis)].lower()

In [None]:
# Pull the first heuristically-unlabelled post out as a plain dict to use as a
# sample input for the labelling functions.
first_unlabeled = unlabeled_posts_df.iloc[0]
data = first_unlabeled.to_dict()
data

In [None]:
# get_label_with_zero_shot_gpt(data["full_text"])

In [None]:
# Smoke-test the BART labeller on the single sample post held in `data`.
get_label_with_bart(data["full_text"])

In [None]:
# Label every remaining post with the zero-shot BART classifier. assign()
# returns a new frame, which avoids writing a column into what may be a
# filtered slice of cleaned_posts_df (SettingWithCopyWarning).
unlabeled_posts_df = unlabeled_posts_df.assign(
    label=unlabeled_posts_df["full_text"].apply(get_label_with_bart)
)

In [None]:
# Sanity check: rows still labelled "none" after the BART pass.
# get_label_with_bart always returns one of the five distortion names, so this
# selection is expected to be empty.
unlabeled_posts_df[unlabeled_posts_df['label'] == "none"]

In [None]:
# Display the posts together with their model-assigned labels.
unlabeled_posts_df

In [None]:
# Posts that already received a label from the keyword heuristic.
has_heuristic_label = cleaned_posts_df['label'].ne("none")
labeled_posts_df = cleaned_posts_df.loc[has_heuristic_label]
labeled_posts_df

In [None]:
# Stack the heuristic-labelled and model-labelled posts into one frame with a
# fresh 0..n-1 index, then drop the raw columns that full_text supersedes.
label_sources = [labeled_posts_df, unlabeled_posts_df]
redundant_columns = ["text", "subreddit", "title"]
all_labeled_df = pd.concat(label_sources, ignore_index=True).drop(redundant_columns, axis=1)
all_labeled_df

In [None]:
# Independent copy for the baseline pipeline, so cleaning its text does not
# alter all_labeled_df.
baseline_posts_df = all_labeled_df.copy(deep=True)

In [None]:
# Replace each post's text with its cleaned form (lower-case letters and
# single spaces only).
baseline_posts_df['full_text'] = baseline_posts_df['full_text'].map(clean_for_baseline)

In [None]:
# Display the cleaned, labelled baseline data.
baseline_posts_df

In [None]:
# Persist the cleaned, labelled data as a tab-separated file. to_csv does not
# create missing parent directories, so make sure "data/" exists first.
import os

os.makedirs("data", exist_ok=True)
baseline_posts_df.to_csv("data/clean_baseline_data.csv", sep='\t', encoding='utf-8', index=False, header=True)

# Data Analysis

In [None]:
# Load the cleaned, labelled posts from the tab-separated file written by the
# preprocessing section.
dataset = pd.read_table("data/clean_baseline_data.csv", sep='\t')
dataset

In [None]:
# Number of posts per label.
dataset.groupby('label').size()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many posts carry each label.
label_counts = dataset.groupby('label').size()

fig, ax = plt.subplots()
label_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Labels')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
# Slant the class names so longer labels stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn import metrics

# Features are the cleaned post texts, targets the distortion labels. A text
# that was cleaned down to an empty string is read back from the CSV as NaN,
# which TfidfVectorizer cannot process, so map those back to "".
X = dataset['full_text'].fillna("")
y = dataset['label']

# Encode the string labels as integers; le.classes_ keeps the reverse mapping.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and testing sets. Stratify so both splits keep
# the overall label proportions (class counts appear uneven — several classes
# are oversampled later in this notebook).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize the text data; the vocabulary is fitted on the training split only
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# max_iter is raised above the library default to give the solver room to
# converge on the 5000-dimensional TF-IDF features.
model = LogisticRegression(C=1.0, max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)

# Report with class names instead of encoded integers; passing `labels`
# keeps the rows aligned with le.classes_ even if a class is absent from the
# test split.
class_ids = list(range(len(le.classes_)))
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=le.classes_, zero_division=0))

print(metrics.confusion_matrix(y_test, y_pred, labels=class_ids))
print(metrics.accuracy_score(y_test, y_pred))

# Data Augmentation

In [None]:
# Peek at the cleaned text of the first post in the dataset.
dataset.iloc[0].to_dict()["full_text"]

In [None]:
import nlpaug.augmenter.word as naw
import nltk
# Fetch NLTK's English averaged-perceptron POS tagger data (presumably needed
# by the nlpaug word augmenters used below — TODO confirm).
nltk.download('averaged_perceptron_tagger_eng')

def augment_text(df, aug, times, augmented_data):
    """Append ``times`` augmented variants of every row of ``df`` to ``augmented_data``.

    Args:
        df: DataFrame with "full_text" and "label" columns and, optionally,
            an "id" column.
        aug: Augmenter object exposing ``augment(text)``; the result may be a
            list of strings or a single string.
        times: Number of augmented variants to generate per row.
        augmented_data: Dict with "id", "full_text" and "label" lists; it is
            extended in place.

    Returns:
        None. Rows whose text is missing/blank, and augmentations that come
        back empty, are skipped.
    """
    for _, row in df.iterrows():
        text = row["full_text"]
        # NaN (e.g. an empty CSV field) or blank text cannot be augmented.
        if not isinstance(text, str) or not text.strip():
            continue

        label = row["label"]
        # Keep the id of the source post so augmented rows can be traced back
        # to (and grouped with) their original; "1" remains the fallback.
        source_id = row.get("id", "1")

        for _ in range(times):
            result = aug.augment(text)
            if not result:
                continue
            # A bare string must not be indexed: result[0] would be its first
            # character rather than the augmented text.
            augmented_text = result if isinstance(result, str) else result[0]
            augmented_data["id"].append(source_id)
            augmented_data["full_text"].append(augmented_text)
            augmented_data["label"].append(label)


# Accumulator shared by all augmentation cells: one list per output column.
augmented_data = {column: [] for column in ("id", "full_text", "label")}

In [None]:
# Pass 1: WordNet synonym replacement, three variants per post, applied to
# the three classes selected for oversampling.
aug = naw.SynonymAug(aug_src='wordnet')

catastrophizing = dataset.loc[dataset["label"].eq("catastrophizing")]
emotional_reasoning = dataset.loc[dataset["label"].eq("emotional reasoning")]
mind_reading = dataset.loc[dataset["label"].eq("mind reading")]

for class_posts in (catastrophizing, emotional_reasoning, mind_reading):
    augment_text(class_posts, aug, 3, augmented_data)

In [None]:
# Pass 2: random word swapping, three variants per post, applied to the same
# three classes.
aug = naw.RandomWordAug(action="swap")

catastrophizing = dataset.loc[dataset["label"].eq("catastrophizing")]
emotional_reasoning = dataset.loc[dataset["label"].eq("emotional reasoning")]
mind_reading = dataset.loc[dataset["label"].eq("mind reading")]

for class_posts in (catastrophizing, emotional_reasoning, mind_reading):
    augment_text(class_posts, aug, 3, augmented_data)

In [None]:
# Pass 3: RandomWordAug with its default action (no action is passed;
# presumably word deletion per the nlpaug docs — TODO confirm), three variants
# per post, applied to the same three classes.
aug = naw.RandomWordAug()

catastrophizing = dataset.loc[dataset["label"].eq("catastrophizing")]
emotional_reasoning = dataset.loc[dataset["label"].eq("emotional reasoning")]
mind_reading = dataset.loc[dataset["label"].eq("mind reading")]

for class_posts in (catastrophizing, emotional_reasoning, mind_reading):
    augment_text(class_posts, aug, 3, augmented_data)

In [None]:
# Total number of augmented samples collected so far. Re-running an
# augmentation cell appends again, so this grows with every re-run.
len(augmented_data["full_text"])

In [None]:
# Materialise the accumulated augmentation lists as a DataFrame and show it.
augmented_data_df = pd.DataFrame.from_dict(augmented_data)
augmented_data_df

In [None]:
# Training pool = augmented samples stacked on top of the original posts
# (row indices are kept as they are, not renumbered).
stacked_frames = (augmented_data_df, dataset)
more_data = pd.concat(stacked_frames)
more_data

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the label counts after augmentation.
label_counts = more_data.groupby('label').size()

fig, ax = plt.subplots()
label_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Labels')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
# Slant the class names so longer labels stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn import svm


# Features are the (original + augmented) post texts, targets the labels. A
# text that is empty in the CSV comes back as NaN, which TfidfVectorizer
# cannot process, so map those back to "".
X = more_data['full_text'].fillna("")
y = more_data['label']

# Encode the string labels as integers; le.classes_ keeps the reverse mapping.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and testing sets, stratified so both splits
# keep the overall label proportions.
# NOTE(review): more_data mixes original posts with augmented variants of the
# same posts, so a random split can place a post in one split and its variants
# in the other. Test scores are therefore likely optimistic; consider holding
# out the test posts before augmenting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)

# Vectorize the text data; the vocabulary is fitted on the training split only
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# probability=True additionally enables predict_proba on the fitted model.
model = svm.SVC(decision_function_shape='ovo', probability=True)
# model = svm.LinearSVC()
model.fit(X_train_tfidf, y_train)

# Evaluate the model
y_pred = model.predict(X_test_tfidf)

# Report with class names instead of encoded integers; passing `labels`
# keeps the rows aligned with le.classes_ even if a class is absent from the
# test split.
class_ids = list(range(len(le.classes_)))
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=le.classes_, zero_division=0))

print(metrics.confusion_matrix(y_test, y_pred, labels=class_ids))
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
# Class names in encoder order: position i is the label encoded as integer i.
le.classes_

In [None]:
from joblib import dump, load

# Save both the model and vectorizer. joblib.dump does not create missing
# parent directories, so make sure "models/" exists first.
import os

os.makedirs("models", exist_ok=True)
# NOTE(review): the file is called "logistic_model" although the most recently
# defined `model` in this notebook is an SVC; the name is kept so existing
# loaders keep working.
dump(model, "models/logistic_model.pkl")
dump(tfidf, "models/tfidf_vectorizer.pkl")
# The model predicts encoded integers; persist the encoder as well so they can
# be mapped back to label names at inference time.
dump(le, "models/label_encoder.pkl")