In [None]:
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm.notebook import tqdm
from datetime import datetime
import glob
import re
from collections import Counter

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation

In [None]:
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'wordnet' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
# Kept as the final expression of the cell so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load both tweet datasets from CSV files in the working directory
# (anxiety first, then depression).
anxiety, depression = (
    pd.read_csv(csv_path)
    for csv_path in ('Copy of anxiety.csv', 'Copy of depression_dataset.csv')
)

In [None]:
# Drop rows without tweet text: the cleaning step runs regex substitutions on
# 'rawContent', and re.sub raises TypeError when handed NaN instead of a string.
# The same guard is applied to both corpora so they go through an identical pipeline.
# .copy() turns each filtered result into an independent frame, so adding the
# 'tweetCleaned' column later does not raise pandas' SettingWithCopyWarning.
anxiety = anxiety[anxiety['rawContent'].notna()].copy()
depression = depression[depression['rawContent'].notna()].copy()

In [None]:
def pre_processing(text):
    """Clean one raw tweet and return the normalized text.

    Steps, in order:
      1. Remove http/https hyperlinks.
      2. Remove @mentions (letters, digits and underscores after the '@').
      3. Replace every run of characters other than ASCII letters, digits and
         '#' with a single space. Newlines fall into this class, so words on
         adjacent lines stay separated instead of being glued together.
      4. Strip leading and trailing spaces.

    Parameters
    ----------
    text : str
        Raw tweet content.

    Returns
    -------
    str
        Cleaned text containing only letters, digits, '#' and single spaces.

    Raises
    ------
    TypeError
        If `text` is not a string (for example NaN from a missing value).
    """
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/\S+', '', text)

    # Remove mentions (@). Underscores are part of the handle; without '_' in the
    # class, "@some_user" would leave "user" behind as a spurious token.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)

    # Everything except letters, numbers, and hashtags is replaced with a space.
    # This also turns newlines into word separators; deleting them outright would
    # merge the last word of one line with the first word of the next.
    text = re.sub(r"[^A-Za-z0-9#]+", ' ', text)

    # After the substitution above, whitespace only occurs as single spaces, so
    # trimming both ends is all that is left to do.
    return text.strip()

# Store the cleaned version of every raw tweet in a new 'tweetCleaned' column,
# depression corpus first, then anxiety.
for corpus_frame in (depression, anxiety):
    corpus_frame['tweetCleaned'] = corpus_frame['rawContent'].apply(pre_processing)

In [None]:
# Tokenization and Lowercasing
text1 = depression['tweetCleaned']
text2 = anxiety['tweetCleaned']

# One token list per cleaned tweet.
d_tokens_list = list(map(word_tokenize, text1))
a_tokens_list = list(map(word_tokenize, text2))

# Same nested structure, with every token lowercased.
d_lc_tokens_list = [[word.lower() for word in tweet_tokens] for tweet_tokens in d_tokens_list]
a_lc_tokens_list = [[word.lower() for word in tweet_tokens] for tweet_tokens in a_tokens_list]


In [None]:
stop_words = set(stopwords.words('english'))

# Add the punctuation in the stop words set.
# (update() with a string adds each punctuation character as its own entry.)
stop_words.update(punctuation)
stop_words.update(["...", "..", "e.g"])

# Build an independent set per corpus. Plain assignment would only create a
# second name for the same set object, so adding "depression" to one and
# "anxiety" to the other would end up removing both words from both corpora.
depression_stop_words = stop_words | {"depression"}
anxiety_stop_words = stop_words | {"anxiety"}


def _drop_stop_words_and_digits(token_lists, excluded):
    """Return one string per document with excluded tokens and digits removed.

    Parameters
    ----------
    token_lists : iterable of list of str
        Lowercased tokens, one list per document.
    excluded : set of str
        Tokens to discard.

    Returns
    -------
    list of str
        The surviving tokens of each document joined with single spaces, after
        which every run of digits is deleted from the joined string.
    """
    joined = (
        ' '.join(token for token in tokens if token not in excluded)
        for tokens in token_lists
    )
    # Remove Numbers
    return [re.sub(r'\d+', '', sentence) for sentence in joined]


depression_filtered_sentence = _drop_stop_words_and_digits(d_lc_tokens_list, depression_stop_words)
anxiety_filtered_sentence = _drop_stop_words_and_digits(a_lc_tokens_list, anxiety_stop_words)


In [None]:
# Stemming or Lemmatization
#
# Reduces every word of the filtered sentences either to its Snowball stem or to
# its WordNet lemma, depending on the `stemming` flag. Whichever branch runs, the
# result is available as d_stemmed_tokens_list / a_stemmed_tokens_list
# (one token list per document) for the cells that follow.

stemming = False


def _print_token_preview(title, token_lists, limit=10):
    """Print `title`, then the first `limit` documents, one per line.

    Tokens of a document are printed separated by single spaces.
    """
    print(title)
    for doc_tokens in token_lists[:limit]:
        for tok in doc_tokens:
            print(tok, end=" ")
        print(" ")


if stemming:
    stemmer = SnowballStemmer("english", ignore_stopwords=True)

    # Remove certain stemmed words - placeholder!!!
    words_to_remove = []

    d_stemmed_tokens_list = [
        [stem for stem in (stemmer.stem(word) for word in sentence.split())
         if stem not in words_to_remove]
        for sentence in depression_filtered_sentence
    ]
    a_stemmed_tokens_list = [
        [stem for stem in (stemmer.stem(word) for word in sentence.split())
         if stem not in words_to_remove]
        for sentence in anxiety_filtered_sentence
    ]

    # Number of distinct tokens per corpus; each line names its corpus so the
    # two counts can be told apart in the output.
    duniques = np.unique([tok for doc in d_stemmed_tokens_list for tok in doc])
    print("Number of tokens after stemming depression: {}\n".format(len(duniques)))

    auniques = np.unique([tok for doc in a_stemmed_tokens_list for tok in doc])
    print("Number of tokens after stemming anxiety: {}\n".format(len(auniques)))

    _print_token_preview('After stemming depression:', d_stemmed_tokens_list)
    _print_token_preview('After stemming anxiety:', a_stemmed_tokens_list)

else:
    lemmatizer = WordNetLemmatizer()

    d_lemmatized_tokens_list = [
        [lemmatizer.lemmatize(word) for word in sentence.split()]
        for sentence in depression_filtered_sentence
    ]
    a_lemmatized_tokens_list = [
        [lemmatizer.lemmatize(word) for word in sentence.split()]
        for sentence in anxiety_filtered_sentence
    ]

    # Number of distinct tokens per corpus
    duniques = np.unique([tok for doc in d_lemmatized_tokens_list for tok in doc])
    print("Number of tokens after lemmatization depression: {}\n".format(len(duniques)))

    # This count belongs to the anxiety corpus and is labelled as such.
    auniques = np.unique([tok for doc in a_lemmatized_tokens_list for tok in doc])
    print("Number of tokens after lemmatization anxiety: {}\n".format(len(auniques)))

    _print_token_preview('After lemmatization depression:', d_lemmatized_tokens_list)
    _print_token_preview('After lemmatization anxiety:', a_lemmatized_tokens_list)

    # Set stemmed to lemmatized tokens to continue working with the same list
    d_stemmed_tokens_list = d_lemmatized_tokens_list
    a_stemmed_tokens_list = a_lemmatized_tokens_list

Number of tokens after lemmatization depression: 24629

Number of tokens after lemmatization depression: 36075

After lemmatization depression:
forced making decision move residential care bullying stopped age diagnosed developed autistic trates  
watching season dexter  
pretty sure maybe lazy idk  
strive like half friend happy like seeing happy half trying top help  
adhd benrey adhd w psychotic feature w psychotic feature r said  
pretty sure past day feeling pretty good feel like actually better hard explain like feeling happier expressive always background  
bad friend better understanding know need take manic episode amp move quickly try fix  
mood stable c ee  
dad told attitude ruined easter dinner vibe sorry randy  
said understand father deciding amp getting used fact time go husband called hour saw grandfather father passed point amp get treatment  
After lemmatization anxiety:
would go  
know pain ptsd due medical problem life  
might autism get help instead ignored  
like

In [None]:
# Calculate Tfidf values

# Re-join each depression document's tokens into one string for the vectorizer.
d_list_tfidf = [" ".join(stemmed_list) for stemmed_list in d_stemmed_tokens_list]
vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.5)
X = vectorizer.fit_transform(d_list_tfidf)

# Print the top tfidf values for the first document
feature_names = vectorizer.get_feature_names_out()
feature_index = X[0, :].nonzero()[1]
tfidf_scores = zip(feature_index, [X[0, x] for x in feature_index])

# Highest score first; each entry is a (feature column, score) pair.
sorted_tfidf_scores = sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)

# At most ten terms are shown.
for column, score in sorted_tfidf_scores[:10]:
    print(feature_names[column], score)

autistic 0.4357963955724096
stopped 0.43381869492313113
move 0.4264762121313035
age 0.3935155037947275
making 0.3525004858776144
care 0.31837688103805917
diagnosed 0.24397950605968452


In [None]:
# Create unigrams and bigrams per document
# Unigrams are the depression token lists themselves (same list objects, new
# outer list); bigrams are the consecutive token pairs of each document.
unigrams = list(d_stemmed_tokens_list)
bigrams = [list(ngrams(doc_tokens, 2)) for doc_tokens in d_stemmed_tokens_list]

In [None]:
# Calculate Tfidf values

# Re-join each anxiety document's tokens into one string for the vectorizer.
# NOTE(review): `vectorizer` and `X` are rebound here, so the values fitted on
# the depression corpus are no longer reachable under these names.
a_list_tfidf = [" ".join(stemmed_list) for stemmed_list in a_stemmed_tokens_list]
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(a_list_tfidf)

# Print the top tfidf values for the first document
feature_names = vectorizer.get_feature_names_out()
feature_index = X[0, :].nonzero()[1]
tfidf_scores = zip(feature_index, [X[0, x] for x in feature_index])

# Highest score first; each entry is a (feature column, score) pair.
sorted_tfidf_scores = sorted(tfidf_scores, key=lambda pair: pair[1], reverse=True)

# At most ten terms are shown.
for column, score in sorted_tfidf_scores[:10]:
    print(feature_names[column], score)

would 0.7430273991075669
go 0.6692609985464898
