## RoBERTa introduction and Credits


Twitter-roBERTa-base for Sentiment Analysis

This is a roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis with the TweetEval benchmark. This model is suitable for English (for a similar multilingual model, see XLM-T).

Reference Paper: TweetEval (Findings of EMNLP 2020).
Git Repo: Tweeteval official repository.
Labels: 0 -> Negative; 1 -> Neutral; 2 -> Positive


reference paper: https://aclanthology.org/2020.findings-emnlp.148/


The code used in this notebook has been adapted from the following page: https://huggingface.co/blog/bert-101#7-how-to-get-started-using-bert


```
# from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)


# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
res = []
for text in texts:
  try:
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

  # # TF
  # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
  # model.save_pretrained(MODEL)

  # text = "Good night 😊"
  # encoded_input = tokenizer(text, return_tensors='tf')
  # output = model(encoded_input)
  # scores = output[0][0].numpy()
  # scores = softmax(scores)
    d = {"text": text}
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        d[f"{l}"] = f'{np.round(float(s),4)}'
    res.append(d)
  except:
    d = {"text": text, "labels":"NA"}
    res.append(d)

```



## Functions to run



In [None]:
# Preprocessing functions
def preprocess(text):
  """Replace every space-separated token that starts with 'http' by the placeholder 'http'.

  The text is split on single spaces and re-joined with single spaces, so
  runs of spaces, tabs and newlines are left exactly as they were.
  """
  return " ".join(
      'http' if token.startswith('http') else token
      for token in text.split(" ")
  )

import re
def preprocess1(text):
  """Remove URLs and every character that is not an ASCII letter, digit or whitespace.

  Args:
    text: the raw text. Missing values (``None`` or a float NaN, which is how
      pandas represents an empty cell) are accepted and yield an empty string;
      any other non-string value is converted with ``str()`` first.

  Returns:
    The cleaned string. Whitespace is kept untouched, so removing a URL that
    sat between two spaces leaves both spaces in place.
  """
  # `text != text` is only true for NaN; without this guard re.sub raises TypeError on empty cells.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):
    text = str(text)
  without_urls = re.sub(r'http[s]?://\S+', '', text)
  return re.sub(r'[^a-zA-Z0-9\s]', '', without_urls)

In [None]:
#Truncating excessive length of posts function
def truncate_doc(doc, max_len=350):
  """Return the first ``max_len`` whitespace-separated words of ``doc``.

  The kept words are re-joined with single spaces, so the original spacing
  and line breaks are normalised even when nothing is cut off.
  """
  return " ".join(doc.split()[:max_len])

#defining the sentiment analysis function
# (model name, task) -> (model, labels). Filled lazily so repeated calls to
# sentiment_analysis() do not reload the weights or re-download the label mapping.
_SENTIMENT_RESOURCES = {}

# Token budget used when a text has to be truncated for the model.
# NOTE(review): 512 is the usual limit for RoBERTa-base checkpoints -- confirm for other MODEL values.
_MAX_TOKENS = 512


def _load_sentiment_resources(model_name, task_name):
  """Return ``(model, labels)`` for a Hub model and TweetEval task, loading each pair only once."""
  from transformers import AutoModelForSequenceClassification
  import csv
  import urllib.request

  key = (model_name, task_name)
  if key not in _SENTIMENT_RESOURCES:
    mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task_name}/mapping.txt"
    with urllib.request.urlopen(mapping_link) as f:
      rows = f.read().decode('utf-8').split("\n")
    # mapping.txt is tab-separated "<id>\t<label>"; blank lines are skipped.
    labels = [row[1] for row in csv.reader(rows, delimiter='\t') if len(row) > 1]
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    _SENTIMENT_RESOURCES[key] = (model, labels)
  return _SENTIMENT_RESOURCES[key]


def _score_text(text, model, labels, **tokenizer_kwargs):
  """Classify one text and return ``{"text": text, <label>: <probability>, ...}``.

  Probabilities are stored as strings rounded to 4 decimals and inserted from
  the most to the least likely label. Extra keyword arguments are forwarded to
  the global ``tokenizer``.
  """
  import numpy as np
  import torch
  from scipy.special import softmax

  encoded_input = tokenizer(text, return_tensors='pt', **tokenizer_kwargs)
  with torch.no_grad():  # inference only: no need to track gradients
    output = model(**encoded_input)
  scores = softmax(output[0][0].detach().numpy())
  d = {"text": text}
  for idx in np.argsort(scores)[::-1]:
    d[f"{labels[idx]}"] = f'{np.round(float(scores[idx]),4)}'
  return d


def sentiment_analysis(texts):
  """Score every text in ``texts`` with the sentiment model named by the global ``MODEL``.

  Relies on the notebook globals ``MODEL``, ``task`` and ``tokenizer`` and on
  ``truncate_doc``. The model is no longer written to disk with
  ``save_pretrained(MODEL)``: that created a local directory named like the Hub
  id, which later ``from_pretrained(MODEL)`` calls would pick up instead of the Hub.

  Args:
    texts: iterable of strings to classify.

  Returns:
    A list with one dict per input text, in order. A scored text yields
    ``{"text": ..., <label>: <probability string>, ...}``. If the first attempt
    fails, the text is shortened with ``truncate_doc`` and tokenised with
    truncation to ``_MAX_TOKENS`` tokens; if that fails too, the entry is
    ``{"text": ..., "labels": "Error length"}``.
  """
  model, labels = _load_sentiment_resources(MODEL, task)
  res = []
  for text in texts:
    try:
      res.append(_score_text(text, model, labels))
    except Exception:
      # Most likely the text is longer than the model accepts: shorten and retry.
      try:
        text = truncate_doc(text)
        res.append(_score_text(text, model, labels, truncation=True, max_length=_MAX_TOKENS))
      except Exception:
        res.append({"text": text, "labels": "Error length"})

  return res


In [None]:
# Setup cell: imports plus the globals `task`, `MODEL` and `tokenizer` that `sentiment_analysis` reads.
# Run this only once in the entire notebook; running it twice is reported to raise an error.
# If that happens, interrupt/restart the runtime and rerun everything from the top.
# NOTE(review): the failure is presumably caused by a local directory named like MODEL
# (e.g. one written by `save_pretrained(MODEL)`) being picked up by `from_pretrained` -- confirm.
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# TweetEval task name; it selects both the model checkpoint and the label-mapping file.
task='sentiment'
# Hugging Face Hub id of the checkpoint.
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer matching MODEL; shared by every call to `sentiment_analysis`.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

## **Objective of the notebook**: Analyze the sentiment of the post and of the answer/reply that follows the comment
(we will use only the posts which have comment and reply)

## Depression comments and replies

In [None]:
#load data
import pandas as pd
df1 = pd.read_csv("depression_comments_replies.csv")

In [None]:
#cleaning
df1['Post'] = df1['title'] + ' ' + df1['text']
df1['Post_cleaned'] = df1.Post.apply(preprocess1)
df1['Comment_cleaned'] = df1.Comment.apply(preprocess1)
df1['Answer_cleaned'] = df1.Answer.apply(preprocess1)

In [None]:
depression_pca = df1[['Post','Post_cleaned','Comment','Comment_cleaned','Answer','Answer_cleaned']]
depression_pca.head()

Unnamed: 0,Post,Post_cleaned,Comment,Comment_cleaned,Answer,Answer_cleaned
0,If 10 years ago someone told me that in 10 yea...,If 10 years ago someone told me that in 10 yea...,aimless walks can be good to clear you head tho,aimless walks can be good to clear you head tho,"Yup, that's why I do them. Especially since I ...",Yup thats why I do them Especially since I don...
1,If 10 years ago someone told me that in 10 yea...,If 10 years ago someone told me that in 10 yea...,I feel you on this. I was always the most ambi...,I feel you on this I was always the most ambit...,Yup. My parents always did everything for me a...,Yup My parents always did everything for me as...
2,If 10 years ago someone told me that in 10 yea...,If 10 years ago someone told me that in 10 yea...,"Ah, the classic I'm so bored and lonely ill ju...",Ah the classic Im so bored and lonely ill just...,"Well, after about a hundred of these walks you...",Well after about a hundred of these walks you ...
3,I cried in front of my family today. They ende...,I cried in front of my family today They ended...,That honestly sucks! I hate when parents do th...,That honestly sucks I hate when parents do tha...,No I’m not okay 😭,No Im not okay
4,I cried in front of my family today. They ende...,I cried in front of my family today They ended...,***I suffered from depression. It went away wh...,I suffered from depression It went away when I...,I can move away but I’m just afraid that I won...,I can move away but Im just afraid that I wont...


In [None]:
texts_p = [t for t in depression_pca['Post_cleaned']]
texts_c = [t for t in depression_pca['Comment_cleaned']]
texts_a = [t for t in depression_pca['Answer_cleaned']]

In [None]:
import pandas as pd

res_p = sentiment_analysis(texts_p)
res_c = sentiment_analysis(texts_c)
res_a = sentiment_analysis(texts_a)
sentiment_depression_pp = pd.DataFrame(res_p)
sentiment_depression_c= pd.DataFrame(res_c)
sentiment_depression_a = pd.DataFrame(res_a)

In [None]:
sentiment_depression_pp.to_csv("sentiment_depression_pp")
sentiment_depression_c.to_csv("sentiment_depression_c")
sentiment_depression_a.to_csv("sentiment_depression_a")

total dataset creation

In [None]:
sentiment_depression_pp.rename(columns={'text': 'post','neutral':'neutral_p', 'negative':'negative_p', 'positive':'positive_p' }, inplace=True)
sentiment_depression_c.rename(columns={'text': 'comment', 'neutral':'neutral_c', 'negative':'negative_c', 'positive':'positive_c'}, inplace=True)
sentiment_depression_a.rename(columns={'text': 'answer','neutral':'neutral_a', 'negative':'negative_a', 'positive':'positive_a'}, inplace=True)
concatenated_df = pd.concat([sentiment_depression_pp, sentiment_depression_c, sentiment_depression_a], axis=1)
concatenated_df.to_csv("sentiment_depression_pca_scores")

sentiment difference

In [None]:
sentiment_df = pd.read_csv("sentiment_depression_pca_scores")
sentiment_df.head()
sentiment_df["neutral_diff"] = sentiment_df['neutral_p'] - sentiment_df['neutral_a']
sentiment_df["positive_diff"] = sentiment_df['positive_p'] - sentiment_df['positive_a']
sentiment_df["negative_diff"] = sentiment_df['negative_p'] - sentiment_df['negative_a']

In [None]:
avg_diff_neutral = sentiment_df['neutral_diff'].sum()/(len(sentiment_df))
print(avg_diff_neutral)
#the replies are more likely to be labeled neutral than the posts by 0.08

avg_diff_negative = sentiment_df['negative_diff'].sum()/(len(sentiment_df))
print(avg_diff_negative)
#the replies are less likely to be labeled negative than the posts by 0.301

avg_diff_positive = sentiment_df['positive_diff'].sum()/(len(sentiment_df))
print(avg_diff_positive)
#the replies are more likely to be labeled positive than the posts by 0.216

#-0.08590868326334733
#0.3014790641871626
#-0.21557225554889023

-0.08590868326334733
0.3014790641871626
-0.21557225554889023


In [None]:
sentiment_df.to_csv("sentiment_depression.csv")

## Anxiety Comments and Replies

In [None]:
#load data
import pandas as pd
df1 = pd.read_csv("anxiety_comments_replies.csv")

In [None]:
#cleaning
df1['Post'] = df1['title'] + ' ' + df1['text']
df1['Post_cleaned'] = df1.Post.apply(preprocess1)
df1['Comment'] = df1['Comment'].astype(str)
df1['Comment_cleaned'] = df1.Comment.apply(preprocess1)
df1['Answer_cleaned'] = df1.Answer.apply(preprocess1)

In [None]:
anxiety_pca = df1[['Post','Post_cleaned','Comment','Comment_cleaned','Answer','Answer_cleaned']]
anxiety_pca.head()

Unnamed: 0,Post,Post_cleaned,Comment,Comment_cleaned,Answer,Answer_cleaned
0,"Professor here, if you have missed class, plea...",Professor here if you have missed class please...,This is honestly so helpful. Thank you,This is honestly so helpful Thank you,No problem. I suffer from anxiety as well and ...,No problem I suffer from anxiety as well and \...
1,"Professor here, if you have missed class, plea...",Professor here if you have missed class please...,So funny because I am also a professor and I a...,So funny because I am also a professor and I a...,I kept reading the posts and thinking a differ...,I kept reading the posts and thinking a differ...
2,"Professor here, if you have missed class, plea...",Professor here if you have missed class please...,I've had professors straight up make fun of me...,Ive had professors straight up make fun of me ...,That’s some bullshit.,Thats some bullshit
3,"Professor here, if you have missed class, plea...",Professor here if you have missed class please...,I wish you were my professor. I get so anxious...,I wish you were my professor I get so anxious ...,Even if we are super caring people we rarely n...,Even if we are super caring people we rarely n...
4,"Professor here, if you have missed class, plea...",Professor here if you have missed class please...,It's great that there are teachers/professors ...,Its great that there are teachersprofessors li...,That’s absurd. I’m sorry that happened.,Thats absurd Im sorry that happened


In [None]:
texts_p = [t for t in anxiety_pca['Post_cleaned']]
texts_c = [t for t in anxiety_pca['Comment_cleaned']]
texts_a = [t for t in anxiety_pca['Answer_cleaned']]

In [None]:
import pandas as pd

res_p = sentiment_analysis(texts_p)
res_c = sentiment_analysis(texts_c)
res_a = sentiment_analysis(texts_a)
sentiment_anxiety_pp = pd.DataFrame(res_p)
sentiment_anxiety_c= pd.DataFrame(res_c)
sentiment_anxiety_a = pd.DataFrame(res_a)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
sentiment_anxiety_pp.to_csv("sentiment_anxiety_pp")
sentiment_anxiety_c.to_csv("sentiment_anxiety_c")
sentiment_anxiety_a.to_csv("sentiment_anxiety_a")

Total dataset creation

In [None]:
sentiment_anxiety_pp.rename(columns={'text': 'post','neutral':'neutral_p', 'negative':'negative_p', 'positive':'positive_p' }, inplace=True)
sentiment_anxiety_c.rename(columns={'text': 'comment', 'neutral':'neutral_c', 'negative':'negative_c', 'positive':'positive_c'}, inplace=True)
sentiment_anxiety_a.rename(columns={'text': 'answer','neutral':'neutral_a', 'negative':'negative_a', 'positive':'positive_a'}, inplace=True)
concatenated_df = pd.concat([sentiment_anxiety_pp, sentiment_anxiety_c, sentiment_anxiety_a], axis=1)
concatenated_df.to_csv("sentiment_anxiety_pca_scores")

Sentiment difference

In [None]:
sentiment_df = pd.read_csv("sentiment_anxiety_pca_scores")
sentiment_df.head()
sentiment_df["neutral_diff"] = sentiment_df['neutral_p'] - sentiment_df['neutral_a']
sentiment_df["positive_diff"] = sentiment_df['positive_p'] - sentiment_df['positive_a']
sentiment_df["negative_diff"] = sentiment_df['negative_p'] - sentiment_df['negative_a']

In [None]:
avg_diff_neutral = sentiment_df['neutral_diff'].sum()/(len(sentiment_df))
print(avg_diff_neutral)
#the replies are more likely to be labelled neutral than the posts by 0.066

avg_diff_negative = sentiment_df['negative_diff'].sum()/(len(sentiment_df))
print(avg_diff_negative)
#the replies are less likely to be labelled negative than the posts by 0.149

avg_diff_positive = sentiment_df['positive_diff'].sum()/(len(sentiment_df))
print(avg_diff_positive)
#the replies are more likely to be labelled positive than the posts by 0.08

#-0.06572979836847775
#0.14931693089118053
#-0.08359141142065568

-0.06572979836847775
0.14931693089118053
-0.08359141142065568


In [None]:
sentiment_df.to_csv("sentiment_anxiety.csv")

## Anger Comments and Replies

In [None]:
#load data
import pandas as pd
df1 = pd.read_csv("anger_comments_replies.csv")

In [None]:
#cleaning
df1['Post'] = df1['title'] + ' ' + df1['text']
df1['Post_cleaned'] = df1.Post.apply(preprocess1)
df1['Comment_cleaned'] = df1.Comment.apply(preprocess1)
df1['Answer_cleaned'] = df1.Answer.apply(preprocess1)

In [None]:
anger_pca = df1[['Post','Post_cleaned','Comment','Comment_cleaned','Answer','Answer_cleaned']]
anger_pca.head()

Unnamed: 0,Post,Post_cleaned,Comment,Comment_cleaned,Answer,Answer_cleaned
0,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,You are on Reddit for the wrong reason.,You are on Reddit for the wrong reason,Thats fair,Thats fair
1,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,well hey. Happy cake day.,well hey Happy cake day,Its a bit late but thanks :&gt;,Its a bit late but thanks gt
2,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,Happy late cake day! Hope your day is going well,Happy late cake day Hope your day is going well,Thanks mate,Thanks mate
3,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,Happy cake day!! I'm just one person but I hop...,Happy cake day Im just one person but I hope i...,Thanks. Really.\n\nEdit: Why the fuck are peop...,Thanks Really\n\nEdit Why the fuck are people ...
4,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,Happy cake day! Sorry about the missing upvotes.,Happy cake day Sorry about the missing upvotes,Who in the name of fuck downvoted this comment,Who in the name of fuck downvoted this comment


In [None]:
texts_p = [t for t in anger_pca['Post_cleaned']]
texts_c = [t for t in anger_pca['Comment_cleaned']]
texts_a = [t for t in anger_pca['Answer_cleaned']]

In [None]:
import pandas as pd

res_p = sentiment_analysis(texts_p)
res_c = sentiment_analysis(texts_c)
res_a = sentiment_analysis(texts_a)
sentiment_anger_pp = pd.DataFrame(res_p)
sentiment_anger_c= pd.DataFrame(res_c)
sentiment_anger_a = pd.DataFrame(res_a)

In [None]:
sentiment_anger_pp.to_csv("sentiment_anger_pp")
sentiment_anger_c.to_csv("sentiment_anger_c")
sentiment_anger_a.to_csv("sentiment_anger_a")

Total dataset creation

In [None]:
sentiment_anger_pp.rename(columns={'text': 'post','neutral':'neutral_p', 'negative':'negative_p', 'positive':'positive_p' }, inplace=True)
sentiment_anger_c.rename(columns={'text': 'comment', 'neutral':'neutral_c', 'negative':'negative_c', 'positive':'positive_c'}, inplace=True)
sentiment_anger_a.rename(columns={'text': 'answer','neutral':'neutral_a', 'negative':'negative_a', 'positive':'positive_a'}, inplace=True)
concatenated_df = pd.concat([sentiment_anger_pp, sentiment_anger_c, sentiment_anger_a], axis=1)
concatenated_df.to_csv("sentiment_anger_pca_scores")

Sentiment difference

In [None]:
concatenated_df = pd.read_csv("sentiment_anger_pca_scores")
concatenated_df["neutral_diff"] = concatenated_df['neutral_p'] - concatenated_df['neutral_a']
concatenated_df["positive_diff"] = concatenated_df['positive_p'] - concatenated_df['positive_a']
concatenated_df["negative_diff"] = concatenated_df['negative_p'] - concatenated_df['negative_a']

In [None]:
avg_diff_neutral = concatenated_df['neutral_diff'].sum()/(len(concatenated_df))
print(avg_diff_neutral)
#the replies are more neutral than the posts by 0.169

avg_diff_negative = concatenated_df['negative_diff'].sum()/(len(concatenated_df))
print(avg_diff_negative)
#the replies are less negative than the posts by 0.429

avg_diff_positive = concatenated_df['positive_diff'].sum()/(len(concatenated_df))
print(avg_diff_positive)
#the replies are more positive than the posts by 0.260

#-0.1688781884057971
#0.42887855072463765
#-0.2600025362318841

-0.1688781884057971
0.42887855072463765
-0.2600025362318841


In [None]:
# The anger scores and *_diff columns were built in `concatenated_df`;
# `sentiment_df` still holds the frame of an earlier section at this point.
concatenated_df.to_csv("sentiment_anger.csv")

## Relationship comments and replies

In [None]:
#load data
import pandas as pd
df1 = pd.read_csv("relationship_comments_replies.csv")

In [None]:
#cleaning
df1['Post'] = df1['title'] + ' ' + df1['text']
df1['Post_cleaned'] = df1.Post.apply(preprocess1)
df1['Comment_cleaned'] = df1.Comment.apply(preprocess1)
df1['Answer_cleaned'] = df1.Answer.apply(preprocess1)

In [None]:
relationship_pca = df1[['Post','Post_cleaned','Comment','Comment_cleaned','Answer','Answer_cleaned']]
relationship_pca.head()

Unnamed: 0,Post,Post_cleaned,Comment,Comment_cleaned,Answer,Answer_cleaned
0,To whoever needs to hear this. He made you fee...,To whoever needs to hear this He made you feel...,thank you for this,thank you for this,"Just remember, you are more than what one pers...",Just remember you are more than what one perso...
1,To whoever needs to hear this. He made you fee...,To whoever needs to hear this He made you feel...,This is so true. Why do his actions of treatin...,This is so true Why do his actions of treating...,i’m sorry it took me so long to see this. he m...,im sorry it took me so long to see this he mak...
2,To whoever needs to hear this. He made you fee...,To whoever needs to hear this He made you feel...,needed this. thanks OP,needed this thanks OP,You are worth more than even you know. Never l...,You are worth more than even you know Never le...
3,To whoever needs to hear this. He made you fee...,To whoever needs to hear this He made you feel...,"I needed to see this, even if i had to change ...",I needed to see this even if i had to change i...,i only said he because i felt if i made it gen...,i only said he because i felt if i made it gen...
4,To whoever needs to hear this. He made you fee...,To whoever needs to hear this He made you feel...,:( I can’t escape the thoughts that came after...,I cant escape the thoughts that came afterwar...,just remember you already know you can do it w...,just remember you already know you can do it w...


In [None]:
texts_p = [t for t in relationship_pca['Post_cleaned']]
texts_c = [t for t in relationship_pca['Comment_cleaned']]
texts_a = [t for t in relationship_pca['Answer_cleaned']]

In [None]:
import pandas as pd

res_p = sentiment_analysis(texts_p)
res_c = sentiment_analysis(texts_c)
res_a = sentiment_analysis(texts_a)
sentiment_relationship_pp = pd.DataFrame(res_p)
sentiment_relationship_c= pd.DataFrame(res_c)
sentiment_relationship_a = pd.DataFrame(res_a)

In [None]:
sentiment_relationship_pp.to_csv("sentiment_relationship_pp")
sentiment_relationship_c.to_csv("sentiment_relationship_c")
sentiment_relationship_a.to_csv("sentiment_relationship_a")

Total dataset creation

In [None]:
sentiment_relationship_pp.rename(columns={'text': 'post','neutral':'neutral_p', 'negative':'negative_p', 'positive':'positive_p' }, inplace=True)
sentiment_relationship_c.rename(columns={'text': 'comment', 'neutral':'neutral_c', 'negative':'negative_c', 'positive':'positive_c'}, inplace=True)
sentiment_relationship_a.rename(columns={'text': 'answer','neutral':'neutral_a', 'negative':'negative_a', 'positive':'positive_a'}, inplace=True)
concatenated_df = pd.concat([sentiment_relationship_pp, sentiment_relationship_c, sentiment_relationship_a], axis=1)
concatenated_df.to_csv("sentiment_relationship_pca_scores")

Sentiment difference

In [None]:
concatenated_df = pd.read_csv("sentiment_relationship_pca_scores")
concatenated_df["neutral_diff"] = concatenated_df['neutral_p'] - concatenated_df['neutral_a']
concatenated_df["positive_diff"] = concatenated_df['positive_p'] - concatenated_df['positive_a']
concatenated_df["negative_diff"] = concatenated_df['negative_p'] - concatenated_df['negative_a']

In [None]:
avg_diff_neutral = concatenated_df['neutral_diff'].sum()/(len(concatenated_df))
print(avg_diff_neutral)
#the replies are more likely to be labelled neutral than the posts by 0.026

avg_diff_negative = concatenated_df['negative_diff'].sum()/(len(concatenated_df))
print(avg_diff_negative)
#the replies are less likely to be labelled negative than the posts by 0.283

avg_diff_positive = concatenated_df['positive_diff'].sum()/(len(concatenated_df))
print(avg_diff_positive)
#the replies are more likely to be labelled positive than the posts by 0.257

-0.025946361746361748
0.282808316008316
-0.25686819126819127


In [None]:
# The relationship scores and *_diff columns were built in `concatenated_df`;
# `sentiment_df` still holds the frame of an earlier section at this point.
concatenated_df.to_csv("sentiment_relationship.csv")

## Stress comments and replies

In [None]:
#load data
import pandas as pd
df1 = pd.read_csv("stress_comments_replies.csv")

In [None]:
#cleaning
df1['Post'] = df1['title'] + ' ' + df1['text']
df1['Post_cleaned'] = df1.Post.apply(preprocess1)
df1['Comment_cleaned'] = df1.Comment.apply(preprocess1)
df1['Answer_cleaned'] = df1.Answer.apply(preprocess1)

In [None]:
stress_pca = df1[['Post','Post_cleaned','Comment','Comment_cleaned','Answer','Answer_cleaned']]
stress_pca.head()

Unnamed: 0,Post,Post_cleaned,Comment,Comment_cleaned,Answer,Answer_cleaned
0,"To everyone with stress, here you go a online ...",To everyone with stress here you go a online b...,"This made me smile, thanks :)",This made me smile thanks,Np,Np
1,"To everyone with stress, here you go a online ...",To everyone with stress here you go a online b...,"To everyone with stress, here you go a online ...",To everyone with stress here you go a online b...,&gt;!5 e !&lt;,gt5 e lt
2,Does anyone else feel CONSTANTLY stressed? My ...,Does anyone else feel CONSTANTLY stressed My w...,It sounds like you want to achieve a state whe...,It sounds like you want to achieve a state whe...,That actually sounds pretty fitting! Thank you...,That actually sounds pretty fitting Thank you ...
3,Does anyone else feel CONSTANTLY stressed? My ...,Does anyone else feel CONSTANTLY stressed My w...,yes ALL the time. its super draining imo :( \n...,yes ALL the time its super draining imo \ndis...,It’s SO refreshing to hear someone else feels ...,Its SO refreshing to hear someone else feels t...
4,Does anyone else feel CONSTANTLY stressed? My ...,Does anyone else feel CONSTANTLY stressed My w...,What are your thoughts about relaxing? Do you ...,What are your thoughts about relaxing Do you f...,Yes!! I definitely was told growing up that re...,Yes I definitely was told growing up that rela...


In [None]:
texts_p = [t for t in stress_pca['Post_cleaned']]
texts_c = [t for t in stress_pca['Comment_cleaned']]
texts_a = [t for t in stress_pca['Answer_cleaned']]

In [None]:
import pandas as pd

res_p = sentiment_analysis(texts_p)
res_c = sentiment_analysis(texts_c)
res_a = sentiment_analysis(texts_a)
sentiment_stress_pp = pd.DataFrame(res_p)
sentiment_stress_c= pd.DataFrame(res_c)
sentiment_stress_a = pd.DataFrame(res_a)

In [None]:
sentiment_stress_pp.to_csv("sentiment_stress_pp")
sentiment_stress_c.to_csv("sentiment_stress_c")
sentiment_stress_a.to_csv("sentiment_stress_a")

Total dataset creation

In [None]:
sentiment_stress_pp.rename(columns={'text': 'post','neutral':'neutral_p', 'negative':'negative_p', 'positive':'positive_p' }, inplace=True)
sentiment_stress_c.rename(columns={'text': 'comment', 'neutral':'neutral_c', 'negative':'negative_c', 'positive':'positive_c'}, inplace=True)
sentiment_stress_a.rename(columns={'text': 'answer','neutral':'neutral_a', 'negative':'negative_a', 'positive':'positive_a'}, inplace=True)
concatenated_df = pd.concat([sentiment_stress_pp, sentiment_stress_c, sentiment_stress_a], axis=1)
concatenated_df.to_csv("sentiment_stress_pca_scores")

Sentiment difference

In [None]:
concatenated_df = pd.read_csv("sentiment_stress_pca_scores")
concatenated_df["neutral_diff"] = concatenated_df['neutral_p'] - concatenated_df['neutral_a']
concatenated_df["positive_diff"] = concatenated_df['positive_p'] - concatenated_df['positive_a']
concatenated_df["negative_diff"] = concatenated_df['negative_p'] - concatenated_df['negative_a']

In [None]:
avg_diff_neutral = concatenated_df['neutral_diff'].sum()/(len(concatenated_df))
print(avg_diff_neutral)
#the replies are more likely to be labelled neutral than the posts by 0.105

avg_diff_negative = concatenated_df['negative_diff'].sum()/(len(concatenated_df))
print(avg_diff_negative)
#the replies are less likely to be labelled negative than the posts by 0.413

avg_diff_positive = concatenated_df['positive_diff'].sum()/(len(concatenated_df))
print(avg_diff_positive)
#the replies are more likely to be labelled positive than the posts by 0.308

-0.1051151497005988
0.41336610778443117
-0.3082520958083832


In [None]:
# The stress scores and *_diff columns were built in `concatenated_df`;
# `sentiment_df` still holds the frame of an earlier section at this point.
concatenated_df.to_csv("sentiment_stress.csv")

## Analysis all files comments and replies

In [None]:
#import all files and merge the dataset
import pandas as pd
f0 = pd.read_csv("anger_comments_replies.csv")
f0['group'] = 'anger'
f1 = pd.read_csv("anxiety_comments_replies.csv")
f1['group'] = 'anxiety'
f2 = pd.read_csv("depression_comments_replies.csv")
f2['group'] = 'depression'
f3 = pd.read_csv("relationship_comments_replies.csv")
f3['group'] = 'relationship'
f4 = pd.read_csv("stress_comments_replies.csv")
f4['group'] = 'stress'
df1 = pd.concat([f0, f1, f2, f3, f4], axis=0)
df1.head()
#print(len(f0), len(f1), len(f2), len(f3), len(f4), len(f0)+len(f1)+len(f2)+len(f3)+len(f4),len(file_tot))

Unnamed: 0,title,text,author,Comment,Answer,group
0,Not a single person has upvoted anything I've ...,I know this is such a stupid pet peeve but still.,[deleted],You are on Reddit for the wrong reason.,Thats fair,anger
1,Not a single person has upvoted anything I've ...,I know this is such a stupid pet peeve but still.,[deleted],well hey. Happy cake day.,Its a bit late but thanks :&gt;,anger
2,Not a single person has upvoted anything I've ...,I know this is such a stupid pet peeve but still.,[deleted],Happy late cake day! Hope your day is going well,Thanks mate,anger
3,Not a single person has upvoted anything I've ...,I know this is such a stupid pet peeve but still.,[deleted],Happy cake day!! I'm just one person but I hop...,Thanks. Really.\n\nEdit: Why the fuck are peop...,anger
4,Not a single person has upvoted anything I've ...,I know this is such a stupid pet peeve but still.,[deleted],Happy cake day! Sorry about the missing upvotes.,Who in the name of fuck downvoted this comment,anger


In [None]:
df1['Post'] = df1['title'] + ' ' + df1['text']
df1['Post_cleaned'] = df1.Post.apply(preprocess1)
df1['Answer_cleaned'] = df1.Answer.apply(preprocess1)
df1['Comment'] = df1['Comment'].astype(str)
df1['Comment_cleaned'] = df1.Comment.apply(preprocess1)

In [None]:
all_pca = df1[['Post','Post_cleaned','Comment','Comment_cleaned','Answer','Answer_cleaned']]
all_pca.head()

Unnamed: 0,Post,Post_cleaned,Comment,Comment_cleaned,Answer,Answer_cleaned
0,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,You are on Reddit for the wrong reason.,You are on Reddit for the wrong reason,Thats fair,Thats fair
1,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,well hey. Happy cake day.,well hey Happy cake day,Its a bit late but thanks :&gt;,Its a bit late but thanks gt
2,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,Happy late cake day! Hope your day is going well,Happy late cake day Hope your day is going well,Thanks mate,Thanks mate
3,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,Happy cake day!! I'm just one person but I hop...,Happy cake day Im just one person but I hope i...,Thanks. Really.\n\nEdit: Why the fuck are peop...,Thanks Really\n\nEdit Why the fuck are people ...
4,Not a single person has upvoted anything I've ...,Not a single person has upvoted anything Ive p...,Happy cake day! Sorry about the missing upvotes.,Happy cake day Sorry about the missing upvotes,Who in the name of fuck downvoted this comment,Who in the name of fuck downvoted this comment


In [None]:
texts_p = [t for t in all_pca['Post_cleaned']]
texts_c = [t for t in all_pca['Comment_cleaned']]
texts_a = [t for t in all_pca['Answer_cleaned']]

In [None]:
#sentiment
import pandas as pd

res_p = sentiment_analysis(texts_p)
res_c = sentiment_analysis(texts_c)
res_a = sentiment_analysis(texts_a)
sentiment_all_pp = pd.DataFrame(res_p)
sentiment_all_c= pd.DataFrame(res_c)
sentiment_all_a = pd.DataFrame(res_a)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
sentiment_all_pp.to_csv("sentiment_all_pp")
sentiment_all_c.to_csv("sentiment_all_c")
sentiment_all_a.to_csv("sentiment_all_a")

total dataset creation

In [None]:
sentiment_all_pp.rename(columns={'text': 'post','neutral':'neutral_p', 'negative':'negative_p', 'positive':'positive_p' }, inplace=True)
sentiment_all_c.rename(columns={'text': 'comment', 'neutral':'neutral_c', 'negative':'negative_c', 'positive':'positive_c'}, inplace=True)
sentiment_all_a.rename(columns={'text': 'answer','neutral':'neutral_a', 'negative':'negative_a', 'positive':'positive_a'}, inplace=True)
concatenated_df = pd.concat([sentiment_all_pp, sentiment_all_c, sentiment_all_a], axis=1)
concatenated_df.to_csv("sentiment_all_pca_scores")

Sentiment difference

In [None]:
sentiment_df = pd.read_csv("sentiment_all_pca_scores")
sentiment_df.head()
sentiment_df["neutral_diff"] = sentiment_df['neutral_p'] - sentiment_df['neutral_a']
sentiment_df["positive_diff"] = sentiment_df['positive_p'] - sentiment_df['positive_a']
sentiment_df["negative_diff"] = sentiment_df['negative_p'] - sentiment_df['negative_a']

In [None]:
avg_diff_neutral = sentiment_df['neutral_diff'].sum()/(len(sentiment_df))
print(avg_diff_neutral)
#the replies are more likely to be labelled neutral than the posts by 0.085

avg_diff_negative = sentiment_df['negative_diff'].sum()/(len(sentiment_df))
print(avg_diff_negative)
#the replies are less likely to be labelled negative than the posts by 0.263

avg_diff_positive = sentiment_df['positive_diff'].sum()/(len(sentiment_df))
print(avg_diff_positive)
#the replies are more likely to be labelled positive than the posts by 0.178

#-0.08510781025395305
#0.26345107211308105
#-0.17834614877815047

-0.08510781025395305
0.26345107211308105
-0.17834614877815047


In [None]:
sentiment_df.to_csv("sentiment_all.csv")

In [None]:
import pandas as pd
sentiment_df = pd.read_csv("sentiment_all.csv")

In [None]:
sentiment_df['max_p'] = sentiment_df[['positive_p','negative_p','neutral_p']].idxmax(axis=1)
sentiment_df['max_c'] = sentiment_df[['positive_c','negative_c','neutral_c']].idxmax(axis=1)
sentiment_df['max_a'] = sentiment_df[['positive_a','negative_a','neutral_a']].idxmax(axis=1)
sentiment_df.to_csv("dataset_r_sentiment.csv")

In [None]:
import pandas as pd
sentiment_df = pd.read_csv("dataset_r_sentiment.csv")
#realized graph on R studio

In [None]:
#CHANGE ANALYSIS BY TYPE OF COMMENT
sentiment_df_pos = sentiment_df[sentiment_df.max_c == "positive_c"]
sentiment_df_neg = sentiment_df[sentiment_df.max_c == "negative_c"]
sentiment_df_neu = sentiment_df[sentiment_df.max_c == "neutral_c"]

sentiment_df_pos.to_csv("sentiment_df_pos.csv")
sentiment_df_neg.to_csv("sentiment_df_neg.csv")
sentiment_df_neu.to_csv("sentiment_df_neu.csv")

In [None]:
#POSITIVE COMMENTS ONLY
sentiment_df = sentiment_df_pos
avg_diff_neutral = sentiment_df['neutral_diff'].sum()/(len(sentiment_df))
print(avg_diff_neutral)

avg_diff_negative = sentiment_df['negative_diff'].sum()/(len(sentiment_df))
print(avg_diff_negative)

avg_diff_positive = sentiment_df['positive_diff'].sum()/(len(sentiment_df))
print(avg_diff_positive)

#-0.02854088678583947
#0.24572868909919382
#-0.21719258675078865

-0.02854088678583947
0.24572868909919382
-0.21719258675078865


In [None]:
#NEGATIVE COMMENTS ONLY
sentiment_df = sentiment_df_neg
avg_diff_neutral = sentiment_df['neutral_diff'].sum()/(len(sentiment_df))
print(avg_diff_neutral)

avg_diff_negative = sentiment_df['negative_diff'].sum()/(len(sentiment_df))
print(avg_diff_negative)

avg_diff_positive = sentiment_df['positive_diff'].sum()/(len(sentiment_df))
print(avg_diff_positive)

#-0.11158477479966841
#0.2422783227410887
#-0.13069570323293725

-0.11158477479966841
0.2422783227410887
-0.13069570323293725


In [None]:
#NEUTRAL COMMENTS ONLY
sentiment_df = sentiment_df_neu
avg_diff_neutral = sentiment_df['neutral_diff'].sum()/(len(sentiment_df))
print(avg_diff_neutral)

avg_diff_negative = sentiment_df['negative_diff'].sum()/(len(sentiment_df))
print(avg_diff_negative)

avg_diff_positive = sentiment_df['positive_diff'].sum()/(len(sentiment_df))
print(avg_diff_positive)

#-0.12008933617701945
#0.3313358304452146
#-0.21124790722474007

-0.12008933617701945
0.3313358304452146
-0.21124790722474007
