#### Project Objectives
What is the relationship between the usage of social networks and mental health? (+/-) <br>
What are the topics related to mental health and social networks?<br>
Can we predict some mental health metric based on the usage of social networks?

#### Similar works
A work that is similar to mine is the [second place in the arquivo.pt award in 2022](ref:https://sobre.arquivo.pt/en/meet-the-winners-of-the-arquivo-pt-award-2022/).<br>
The referred work also explores the mental health subject, and goes through a topic modelling proccess.

#### Data aquisition
Firstly request all the articles regarding mental health, then, filter them by social networks terms. <br>After that,place all the relevant data in a dataframe structure and persist it in a file.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import concurrent.futures
import time

In [None]:
# Max items the api can retrieve.
API_MAX_ITEMS = str(500)

# Terms related with mental health.
# ref: https://www.psicanaliseclinica.com/doencas-mentais/
mental_health_terms = ['"saude mental"', 'saude mental', '"saúde mental"', 'saúde mental', 'depressão', 'ansiedade', 'burnout', 'anorexia', 'bulimia', 'esquizofrenia']
# Terms related with social networks.
social_net_terms = ['redes sociais', 'rede social', 'Facebook', 'Instagram', 'Twitter', 'LinkedIn', 'Tik-tok']

# Years to be analyzed.
years = range(2004, 2021)

In [3]:
def make_request(url):
    response = requests.get(url)
    content_type = response.headers.get("content-type")
    response_content = ''
    
    if response.status_code == 200:        
        if 'application/json' in content_type:
            response_content = response.json()
        elif 'text/plain' or 'text/html' in content_type:
            response_content = response.text

    return response.status_code, response_content

### Web Scrapping functions

In [4]:
# For Observador news site articles
def obs_extract_text_from_class(url):
    classes = ['longform_content', 'content', 'article-body-content', 'article-body', 'amp-wp-article-content']
    code, text = make_request(url)
    if code != 200:
        code, text = make_request(url)
        
    # Parse the HTML content
    soup = BeautifulSoup(text, "html.parser")
    full_text = ''
    flag = -1
    # Find all elements with the specified class
    for c in classes:
        #print('Searching for class ', c)
        if flag == 1:
            #print('Exiting for loop.')
            break
        # Find the specific class
        divs = soup.find_all('div', class_ = c)
        #print(divs)
        if divs:
            flag = 1
            #print('Found class: ', c)
            for div in divs:
                p_tags = div.find_all('p')
                for p in p_tags:
                    full_text += p.text        

    # Strip trailing whitespace
    full_text = full_text.strip()
    # Remove \r\n and extra spaces
    clean_text = " ".join(full_text.split())

    return clean_text

In [5]:
# For Publico news site articles
def extract_text_from_class(url, class_name):
    code, text = make_request(url)
    if code != 200:
        code, text = make_request(url)
        
    # Parse the HTML content
    soup = BeautifulSoup(text, "html.parser")
    full_text = ''

    # Find all elements with the specified class
    if class_name == 'p':
        elements = soup.find_all('p')
    else:
        # Find the specific class
        divs = soup.find_all('div', class_=class_name)
        for div in divs:
            p_tags = div.find_all('p')
            for p in p_tags:
                full_text += p.text        
    
    if class_name == 'p':
        # Concatenate all text from the elements into a single string
        full_text = " ".join(element.get_text(strip=True) for element in elements)
    
    # Strip trailing whitespace
    full_text = full_text.strip()
    # Remove \r\n and extra spaces
    clean_text = " ".join(full_text.split())

    return clean_text

### Getting articles from Observador

In [7]:
obs_substrs = ['/Facebook','Partilhe os factos Partilhar: Incorporar:', '©', '(.)']

In [8]:
def process_start_end_date(quarter, year):
    start = '0' + str(int(quarter)+1)
    end = '0' + str(int(quarter)+3)
    
    if len(start) > 2 :
        start = start[-2:]
        end = end[-2:]
    
    start = str(year) + start + '01'
    end = str(year) + end + '31'

    return start, end

def find_substrings(text, substrings):
    found_term = -1
    for term in substrings: 
        # find returns -1 when it doesnt find the term. 
        term_index = text.find(term)
        if term_index > 0: 
            #print(f'Row:{i} - found term: {term}')
            found_term = term_index
            break
    return found_term

In [9]:
# Initialize the lists
news_site = []
tstamp = []
title = []
text = []
quarters = ['00', '03', '06', '09']
linkToNoFrame = []
linkToArchive = []

def process_url(site, term, start, end, retries=3, delay=15):
    url = f'https://arquivo.pt/textsearch?q={term}&from={start}&to={end}&siteSearch={site}&dedupValue=1&maxItems={API_MAX_ITEMS}&fields=linkToArchive,linkToNoFrame,tstamp,title'
    #print(f"Processing URL: {url}")  # Debugging statement
    for attempt in range(retries):
        time.sleep(1)  # Wait before making the request
        response_code, response_json = make_request(url)
        if response_code == 200:
            #print(f"Response received for URL: {url}")  # Debugging statement
            results = []
            for item in response_json['response_items']:
                if item['linkToNoFrame'].find('/programas/') > -1 or item['title'].find('Tudo sobre:') > -1:
                    continue
                full_text = obs_extract_text_from_class(item['linkToNoFrame'])
                if full_text and find_substrings(full_text, mental_health_terms) != -1 and item['title'] not in title:
                    results.append({
                        'site': site.replace('www.', ''),
                        'tstamp': item['tstamp'],
                        'title': item['title'],
                        'text': full_text,
                        'linkToNoFrame': item['linkToNoFrame'],
                        'linkToArchive': item['linkToArchive']
                    })
            return results
        else:
            print(f"Failed to get response for URL: {url} with status code: {response_code}, attempt {attempt + 1}/{retries}")  # Debugging statement
            time.sleep(delay)  # Wait before retrying
    return []

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for site in ['www.observador.pt', 'observador.pt']:
        for t in mental_health_terms:
            term = t.replace(' ', '%20')
            for y in range(2014, 2021):
                for i in range(len(quarters)):
                    start, end = process_start_end_date(quarters[i], y)
                    futures.append(executor.submit(process_url, site, term, start, end))
    
    for future in concurrent.futures.as_completed(futures):
        results = future.result()  # Get the results from the future
        # if results:
        #     print(f"Results received: {results}")  # Debugging statement
        for result in results:
            news_site.append(result['site'])
            tstamp.append(result['tstamp'])
            title.append(result['title'])
            text.append(result['text'])
            linkToNoFrame.append(result['linkToNoFrame'])
            linkToArchive.append(result['linkToArchive'])

print("Processing complete.")
print(f"Total articles found: {len(title)}")

  k = self.parse_starttag(i)


Processing complete.
Total articles found: 2991


In [10]:
data = {
    'news_site': news_site,
    'tstamp': tstamp,
    'title': title,
    'text': text,
    'linkToNoFrame': linkToNoFrame,
    'linkToArchive': linkToArchive,
}

obs_df = pd.DataFrame.from_dict(data)
obs_df.describe(include='all')

Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
count,2991,2991,2991,2991,2991,2991
unique,1,2988,2760,2729,2988,2988
top,observador.pt,20191101182217,Depressão. Perceberam que a tristeza que senti...,Dias antes das eleições presidenciais nos EUA ...,https://arquivo.pt/noFrame/replay/201911011822...,https://arquivo.pt/wayback/20191101182217/http...
freq,2991,2,5,4,2,2


### Preprocessing 

In [11]:
def remove_substrings(content, substrs):
    for s in substrs:
        content = content.replace(s, '.')
    content = content.replace('..', '.').strip()
    return content

def remove_by_regex(content, regex, replaced_by):
    matches = re.findall(regex, content)
    for matched in matches:
        content = content.replace(matched, replaced_by)
    return content

def fix_sentence_with_regex(content, regex, replaced_by):
    matches = re.findall(regex, content)
    for matched in matches:
        content = content.replace(matched, replaced_by + matched[-1])
    return content

In [29]:
obs_df_preprocessed = obs_df.copy()
for i in range(obs_df.shape[0]):
    text = obs_df.loc[i, 'text']
    text = re.sub(r'http\S+', '', text)
    text = remove_substrings(text, obs_substrs)
    text = remove_by_regex(text, r'[A-ZÀ-Ý]+(?:\s[A-ZÀ-Ý]+)*/OBSERVADOR', '.')
    text = remove_by_regex(text, r'[A-Za-zÀ-Ýà-ý]+(?:\s[A-Za-zÀ-Ýà-ý]+)*/Observador', '.')
    text = text.replace('facebook', 'Facebook').replace('instagram', 'Instagram').replace('twitter', 'Twitter').replace('linkedin', 'LinkedIn').replace('tik-tok', 'Tik-tok')
    
    for term in mental_health_terms + social_net_terms:
        pattern = rf'{term}[a-zA-Z]'
        text = fix_sentence_with_regex(text, pattern, f'{term}.')

    found_term = find_substrings(text, social_net_terms)
    # if no term was found the variable found_term < 0
    if found_term < 0:
        obs_df_preprocessed = obs_df_preprocessed.drop(i, axis=0)
    else:
        obs_df_preprocessed.loc[i, 'text'] = text

obs_df_preprocessed = obs_df_preprocessed.reset_index(drop=True)

In [30]:
obs_df_preprocessed.describe(include='all')

Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
count,670,670,670,670,670,670
unique,1,669,625,623,669,669
top,observador.pt,20200319034042,"Da esperança à ansiedade. Como olham bascos, g...",O surto de Covid-19 decretou o fecho de escola...,https://arquivo.pt/noFrame/replay/202003190340...,https://arquivo.pt/wayback/20200319034042/http...
freq,670,2,3,3,2,2


In [14]:
# for i in range(obs_df_preprocessed.shape[0]):
#     print(obs_df_preprocessed.loc[i, 'text'], '\n')

### Getting articles from Publico

In [15]:
# Cleaning lists
news_site = []
tstamp = []
title = []
text = []
quarters = ['00', '03', '06', '09']
linkToNoFrame = []
linkToArchive = []

def process_url(site, term, start, end, retries=3, delay=15):
    url = f'https://arquivo.pt/textsearch?q={term}&from={start}&to={end}&siteSearch={site}&dedupValue=1&maxItems={API_MAX_ITEMS}&fields=linkToArchive,linkToNoFrame,tstamp,title'
    #print(f"Processing URL: {url}")  # Debugging statement
    for attempt in range(retries):
        time.sleep(1)  # Wait before making the request
        response_code, response_json = make_request(url)
        if response_code == 200:
            #print(f"Response received for URL: {url}")  # Debugging statement
            results = []
            if response_code == 200:
                for item in response_json['response_items']:
                    year = int(item['tstamp'][:4])
                    if year < 2005:
                        class_name = "p"
                    elif year >= 2005 and year < 2009:
                        class_name = "texto"
                    elif year >= 2009 and year < 2013:
                        class_name = "noticia"
                    elif year >= 2013 and year < 2017:
                        class_name = "entry-body"
                    else:
                        class_name = "story__body"
                        
                    full_text = extract_text_from_class(item['linkToNoFrame'], class_name)
                    if full_text and find_substrings(full_text, mental_health_terms) != -1 and item['title'] not in title:
                        results.append({
                            'site': site.replace('www.', ''),
                            'tstamp': item['tstamp'],
                            'title': item['title'],
                            'text': full_text,
                            'linkToNoFrame': item['linkToNoFrame'],
                            'linkToArchive': item['linkToArchive']
                        })
            return results
        else:
            print(f"Failed to get response for URL: {url} with status code: {response_code}, attempt {attempt + 1}/{retries}")  # Debugging statement
            time.sleep(delay)  # Wait before retrying
    return []

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for site in ['www.publico.pt', 'publico.pt']:
        for t in mental_health_terms:
            term = t.replace(' ', '%20')
            for y in years:
                for i in range(len(quarters)):
                    start, end = process_start_end_date(quarters[i], y)
                    futures.append(executor.submit(process_url, site, term, start, end))
    
    for future in concurrent.futures.as_completed(futures):
        results = future.result()  # Get the results from the future
        # if results:
        #     print(f"Results received: {results}")  # Debugging statement
        for result in results:
            news_site.append(result['site'])
            tstamp.append(result['tstamp'])
            title.append(result['title'])
            text.append(result['text'])
            linkToNoFrame.append(result['linkToNoFrame'])
            linkToArchive.append(result['linkToArchive'])

print("Processing complete.")
print(f"Total articles found: {len(title)}")

  k = self.parse_starttag(i)


Failed to get response for URL: https://arquivo.pt/textsearch?q=burnout&from=20150401&to=20150631&siteSearch=www.publico.pt&dedupValue=1&maxItems=500&fields=linkToArchive,linkToNoFrame,tstamp,title with status code: 429, attempt 1/3
Failed to get response for URL: https://arquivo.pt/textsearch?q=burnout&from=20150701&to=20150931&siteSearch=www.publico.pt&dedupValue=1&maxItems=500&fields=linkToArchive,linkToNoFrame,tstamp,title with status code: 429, attempt 1/3
Failed to get response for URL: https://arquivo.pt/textsearch?q=burnout&from=20151001&to=20151231&siteSearch=www.publico.pt&dedupValue=1&maxItems=500&fields=linkToArchive,linkToNoFrame,tstamp,title with status code: 429, attempt 1/3
Failed to get response for URL: https://arquivo.pt/textsearch?q=burnout&from=20160101&to=20160331&siteSearch=www.publico.pt&dedupValue=1&maxItems=500&fields=linkToArchive,linkToNoFrame,tstamp,title with status code: 429, attempt 1/3
Failed to get response for URL: https://arquivo.pt/textsearch?q=burn

In [16]:
# Creating the DataFrame
data = {
    'news_site': news_site,
    'tstamp': tstamp,
    'title': title,
    'text': text,
    'linkToNoFrame': linkToNoFrame,
    'linkToArchive': linkToArchive,
}

publico_df = pd.DataFrame.from_dict(data)
publico_df.describe(include='all')

Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
count,277,277,277,277,277,277
unique,1,277,263,264,277,277
top,publico.pt,20110705105955,Sobreviver à ansiedade | REVISTA_2 | PÚBLICO,Sob o mote “A Depressão dói. Mas pode deixar d...,https://arquivo.pt/noFrame/replay/201107051059...,https://arquivo.pt/wayback/20110705105955/http...
freq,277,1,3,3,1,1


### Preprocessing articles

#### Functions

In [17]:
def remove_after_substrings(content, substrs):
    for s in substrs:
        index = content.find(s)
        if index != -1:
            content = content[:index].strip() + '.'
            content = content.replace('..', '.')
    return content

def remove_photo_quote(content):
    regex = r'Foto[A-Z]{1}'
    matches = re.findall(regex, content)
    for matched in matches:
        content = content.replace(matched, matched[-1])
    return content

In [None]:
substrs = ['/Facebook','PUBA carregar...', 'PUB', 'Subscrever×', 'voltar ao índice', 'O melhor do Público no emailSubscreva gratuitamente as newsletters e receba o melhor da actualidade e os trabalhos mais profundos do Público.', 'Continuar a ler', 'A carregar', 'i-albumgrafia', 'i-album', 'Fotografia', 'Partilhar citaçãoPartilhar no FacebookPartilhar no Twitter']
after_substrs = ['Mais populares', 'Ler mais', '©', 'Notícia publicada', 'Notícia actualizada', 'Texto editado por']

def preprocess_data(text):
    text = re.sub(r'http\S+', '', text)
    text = remove_after_substrings(text, after_substrs)
    text = remove_substrings(text, substrs)
    text = remove_photo_quote(text)
    text = text.replace('facebook', 'Facebook').replace('instagram', 'Instagram').replace('twitter', 'Twitter').replace('linkedin', 'LinkedIn').replace('tik-tok', 'Tik-tok')
    for term in mental_health_terms + social_net_terms:
        pattern = rf'{term}[a-zA-Z]'
        text = fix_sentence_with_regex(text, pattern, f'{term}.')
    return text

#### Preprocessing

In [19]:
publico_df_preprocessed = publico_df.copy()

for i in range(publico_df.shape[0]):
    text = preprocess_data(publico_df.loc[i, 'text'])
    found_term = find_substrings(text, social_net_terms)
    # if no term was found the variable found_term < 0
    if found_term < 0:
        publico_df_preprocessed = publico_df_preprocessed.drop(i, axis=0)
    else:
        publico_df_preprocessed.loc[i, 'text'] = text

publico_df_preprocessed = publico_df_preprocessed.reset_index(drop=True)

In [20]:
print(publico_df_preprocessed.shape)
publico_df_preprocessed.head()

(35, 6)


Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
0,publico.pt,20190410191430,"Harry, Meghan e Oprah juntos para uma série so...",Harry e Meghan anunciaram nesta quarta-feira o...,https://arquivo.pt/noFrame/replay/201904101914...,https://arquivo.pt/wayback/20190410191430/http...
1,publico.pt,20110629190213,Programa do XIX Governo Constitucional - Polít...,- Garantindo aos ex‐combatentes a manutenção d...,https://arquivo.pt/noFrame/replay/201106291902...,https://arquivo.pt/wayback/20110629190213/http...
2,publico.pt,20180803172219,"Óbito | Morreu Rick Genest, o modelo conhecido...","Aos 32 anos, Rick Genest (1985-2018) foi encon...",https://arquivo.pt/noFrame/replay/201808031722...,https://arquivo.pt/wayback/20180803172219/http...
3,publico.pt,20181113184825,"Incêndios | Miley Cyrus, Neil Young entre os q...",Os cantores Miley Cyrus e Neil Young estão ent...,https://arquivo.pt/noFrame/replay/201811131848...,https://arquivo.pt/wayback/20181113184825/http...
4,publico.pt,20181203192427,Empreendedorismo | (Mulheres + “Startups”) x S...,Se abrir uma empresa de tecnologia é uma taref...,https://arquivo.pt/noFrame/replay/201812031924...,https://arquivo.pt/wayback/20181203192427/http...


In [21]:
# for i in range(publico_df_preprocessed.shape[0]):
#     print(publico_df_preprocessed.loc[i, 'text'], '\n')

### Persist the data

In [31]:
df = pd.concat([publico_df, obs_df], axis=0, ignore_index=True) 
df.describe(include='all')

Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
count,3268,3268,3268,3268,3268,3268
unique,2,3265,3022,2993,3265,3265
top,observador.pt,20200319034042,Depressão. Perceberam que a tristeza que senti...,Dias antes das eleições presidenciais nos EUA ...,https://arquivo.pt/noFrame/replay/202003190340...,https://arquivo.pt/wayback/20200319034042/http...
freq,2991,2,5,4,2,2


In [32]:
df = df.drop_duplicates(subset=['text'],keep=False)
df = df.drop_duplicates(subset=['title'],keep=False).reset_index(drop=True)
df.describe(include='all')

Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
count,2681,2681,2681,2681,2681,2681
unique,2,2681,2681,2681,2681,2681
top,observador.pt,20110705105955,Rede de cuidados continuados de saúde mental a...,As unidades e equipas de cuidados continuados ...,https://arquivo.pt/noFrame/replay/201107051059...,https://arquivo.pt/wayback/20110705105955/http...
freq,2435,1,1,1,1,1


In [33]:
df_preprocessed = pd.concat([publico_df_preprocessed, obs_df_preprocessed], axis=0, ignore_index=True) 
df_preprocessed.describe(include='all')

Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
count,705,705,705,705,705,705
unique,2,704,657,656,704,704
top,observador.pt,20200319034042,"Da esperança à ansiedade. Como olham bascos, g...",O surto de Covid-19 decretou o fecho de escola...,https://arquivo.pt/noFrame/replay/202003190340...,https://arquivo.pt/wayback/20200319034042/http...
freq,670,2,3,3,2,2


In [34]:
df_preprocessed = df_preprocessed.drop_duplicates(subset=['text'],keep=False)
df_preprocessed = df_preprocessed.drop_duplicates(subset=['title'],keep=False).reset_index(drop=True)
df_preprocessed.describe(include='all')

Unnamed: 0,news_site,tstamp,title,text,linkToNoFrame,linkToArchive
count,589,589,589,589,589,589
unique,2,589,589,589,589,589
top,observador.pt,20190410191430,"Harry, Meghan e Oprah juntos para uma série so...",Harry e Meghan anunciaram nesta quarta-feira o...,https://arquivo.pt/noFrame/replay/201904101914...,https://arquivo.pt/wayback/20190410191430/http...
freq,562,1,1,1,1,1


In [35]:
# Persist into a csv file - a column for journal name, publication date, news title, article text, linkToNoFrame, linkToArchive
df.to_csv('raw_data.csv')
df_preprocessed.to_csv('preprocessed_data.csv')