## Get Info

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


def clean_text(text):
    """Reduce ``text`` to lowercase ASCII words separated by single spaces.

    Every character that is not ``a-z``/``A-Z`` becomes a space, the result
    is lowercased, runs of separators are collapsed to one space and the
    ends are trimmed.
    """
    # Keep ASCII letters only; anything else turns into a space.
    lowered = re.sub('[^a-zA-Z]', ' ', text).lower()
    # The string now holds only letters and spaces, so this tag pattern
    # (which needs '&' and ';') has nothing left to match.
    untagged = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", lowered)
    # Collapse each run of digits / non-word characters into a single space.
    squeezed = re.sub("(\\d|\\W)+", " ", untagged)
    return squeezed.strip()


def clean_digits(text):
    """Return the second whitespace-separated token of ``text`` as an int.

    Commas in that token (thousands separators) are dropped before parsing.
    Raises ``IndexError`` when ``text`` has fewer than two tokens and
    ``ValueError`` when the token is not an integer.
    """
    tokens = text.split()
    number_token = tokens[1]
    return int(number_token.replace(',', ''))


def get_data(url, timeout=30):
    """Download ``url`` and return its body parsed with 'html.parser'.

    Parameters
    ----------
    url : str
        Address to fetch with ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection blocks the notebook forever.

    Returns
    -------
    BeautifulSoup
        Parsed document. The HTTP status code is not inspected, so error
        pages are parsed like any other response.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def get_div_list(soup):
    """Return the hrefs of the ``subNavOne`` ... ``subNavNine`` menu entries.

    For each of the nine class names the first matching ``<li>`` is looked
    up in ``soup`` and the ``href`` of its first ``<a>`` is collected, in
    order.
    """
    ordinals = ('One', 'Two', 'Three', 'Four', 'Five',
                'Six', 'Seven', 'Eight', 'Nine')
    return [
        soup.find('li', {'class': 'subNav' + ordinal}).find('a')['href']
        for ordinal in ordinals
    ]


def get_all_topics(div_list):
    """Return ``(title, link)`` pairs for the topics of every section page.

    Each URL in ``div_list`` is fetched, every ``<li class="cat-item">`` on
    it contributes the cleaned text and href of its first ``<a>``, and the
    first entry of each page is dropped before the pairs are accumulated.
    """
    topic_pairs = []
    for section_url in div_list:
        section_page = get_data(section_url)
        section_pairs = []
        for entry in section_page.find_all('li', {'class': 'cat-item'}):
            anchor = entry.find('a')
            title = clean_text(anchor.contents[0])
            section_pairs.append((title, anchor['href']))
        # The first cat-item of every section page is skipped.
        topic_pairs.extend(section_pairs[1:])
    return topic_pairs


def get_topic_detail(topic_link, max_pages=15):
    """Return the cleaned article titles listed under a topic.

    Pages ``topic_link/page/1`` ... ``topic_link/page/max_pages`` are
    fetched in turn. A page's titles are kept only if the whole page was
    processed without error.

    Parameters
    ----------
    topic_link : str
        Base URL of the topic listing.
    max_pages : int, optional
        Number of listing pages to try (default 15).

    Returns
    -------
    list of str
    """
    titles = []
    for page_number in range(1, max_pages + 1):
        page_url = topic_link + f'/page/{page_number}'
        try:
            page = get_data(page_url)
            page_titles = [
                clean_text(block.find('a').contents[0])
                for block in page.find_all('div', {'class': 'your-stories__text'})
            ]
        except Exception as exc:
            # Best effort: skip the page but say why. Only Exception is
            # caught so Ctrl-C (KeyboardInterrupt) still stops the scrape.
            print(f'Skipping {page_url}: {exc!r}')
            continue
        titles.extend(page_titles)
    return titles


def get_topic_info(topic_link, max_pages=15):
    """Return article titles and page-view counts for a topic.

    For each listing page ``topic_link/page/N`` the titles come from the
    ``your-stories__text`` blocks and the view counts from visiting the
    article linked in each ``your-stories__content`` block. A page
    contributes to the result only if it was processed without error.

    Parameters
    ----------
    topic_link : str
        Base URL of the topic listing.
    max_pages : int, optional
        Number of listing pages to try (default 15).

    Returns
    -------
    tuple of (list of str, list of int)
        ``(titles, views)``.
    """
    views_marker = re.compile("Page Views")
    titles, views = [], []
    for page_number in range(1, max_pages + 1):
        page_url = topic_link + f'/page/{page_number}'
        try:
            page = get_data(page_url)
            page_titles = [
                clean_text(block.find('a').contents[0])
                for block in page.find_all('div', {'class': 'your-stories__text'})
            ]
            page_views = []
            for block in page.find_all('div', {'class': 'your-stories__content'}):
                article = get_data(block.find('a')['href'])
                counter = article.find("span", title=views_marker).text
                page_views.append(clean_digits(counter))
        except Exception as exc:
            # Best effort: skip the page but say why. Only Exception is
            # caught so Ctrl-C (KeyboardInterrupt) still stops the scrape.
            print(f'Skipping {page_url}: {exc!r}')
            continue
        # NOTE(review): titles and views come from two different selectors;
        # nothing here checks that they yield the same number of items.
        titles.extend(page_titles)
        views.extend(page_views)
    return titles, views

In [2]:
url = 'https://www.themix.org.uk/'

# Read the section links from the home page's sub-navigation.
soup = get_data(url)
div_list = get_div_list(soup)
# The Money page is laid out differently from the other section pages, so
# point that entry at its '/benefits' sub-page to make it work.
div_list[5] += '/benefits'
print(div_list)

['https://www.themix.org.uk/sex-and-relationships', 'https://www.themix.org.uk/your-body', 'https://www.themix.org.uk/mental-health', 'https://www.themix.org.uk/drink-and-drugs', 'https://www.themix.org.uk/housing', 'https://www.themix.org.uk/money/benefits', 'https://www.themix.org.uk/work-and-study', 'https://www.themix.org.uk/crime-and-safety', 'https://www.themix.org.uk/travel-and-lifestyle']


In [143]:
# Scrape the topic index once and split its (title, link) pairs into two
# parallel tuples, instead of crawling every section page twice.
all_topic_titles, all_topic_links = zip(*get_all_topics(div_list))

In [53]:
# Map every cleaned topic title to the article titles found under it.
# Topics whose cleaned titles collide overwrite one another.
topic_catalog = {}

for topic_title, topic_link in get_all_topics(div_list):
    article_titles = get_topic_detail(topic_link)
    topic_catalog[topic_title] = article_titles

topic_catalog

{'abortion': ['what are the new abortion laws in northern ireland',
  'abortion the law',
  'dealing with an abortion',
  'the abortion procedure',
  'abortion in ireland',
  'recovering after an abortion'],
 'consent': ['too wasted for sex'],
 'family life': ['how i recovered from abuse',
  'grief and bereavement',
  'christmas your relationship survival guide',
  'ambassador voices leaving your echo chamber',
  'interview what s it like talking to your family about tech use',
  'things you wish your parents knew about the way you use tech',
  'young dads caring for your mental health',
  'my mum was domestically abused',
  'how to recognise emotional abuse in relationships',
  'exams and the pressure to do well',
  'how to cope when mother s day or father s day is difficult',
  'my sister took her own life',
  '',
  'surviving valentine s day',
  'your first christmas with their family',
  'when a family member has dementia',
  'expert chat relationship problems at christmas',
  'chi

In [54]:
import json

# Persist the topic -> article-titles mapping as JSON text.
with open('article.txt', 'w') as out_file:
    json.dump(topic_catalog, out_file)

### Create content and view dataframe

In [None]:
url = 'https://www.themix.org.uk/'

# Read the section links from the home page's sub-navigation.
soup = get_data(url)
div_list = get_div_list(soup)
# The Money page is laid out differently from the other section pages, so
# point that entry at its '/benefits' sub-page to make it work.
div_list[5] += '/benefits'
# Keep only the link half of the (title, link) pairs.
topic_pairs = get_all_topics(div_list)
all_topic_links = list(zip(*topic_pairs))[1]

# title_list = []
# view_list = []

# for i in range(len(all_topic_links)):
#     topic_link = all_topic_links[i]
#     title_list.append(get_topic_info(topic_link)[0])
#     view_list.append(get_topic_info(topic_link)[1])
    
# title_list

In [45]:
def get_topic_view_df(all_topic_links):
    """Build a DataFrame of article titles and view counts for all topics.

    Each topic link is scraped exactly once with ``get_topic_info`` and its
    titles and views are appended to the running lists, so both columns
    always come from the same crawl.

    Parameters
    ----------
    all_topic_links : iterable of str
        Topic listing URLs.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``views``.
    """
    title_list = []
    view_list = []

    for topic_link in all_topic_links:
        titles, views = get_topic_info(topic_link)
        title_list.extend(titles)
        view_list.extend(views)

    return pd.DataFrame({'title': title_list, 'views': view_list})

In [46]:
url = 'https://www.themix.org.uk/'

# Read the section links from the home page's sub-navigation.
soup = get_data(url)
div_list = get_div_list(soup)
# The Money page is laid out differently from the other section pages, so
# point that entry at its '/benefits' sub-page to make it work.
div_list[5] += '/benefits'
# Keep only the link half of the (title, link) pairs.
topic_pairs = get_all_topics(div_list)
all_topic_links = list(zip(*topic_pairs))[1]

# Scrape every topic, save the title/views table and display it.
article_df = get_topic_view_df(all_topic_links)
article_df.to_csv('article_content.csv')
article_df

Unnamed: 0,title,views
0,what are the new abortion laws in northern ire...,2194
1,abortion the law,13017
2,dealing with an abortion,10711
3,the abortion procedure,19274
4,abortion in ireland,6662
...,...,...
2001,amsterdam drug laws,8333
2002,travel insurance,3143
2003,ethical travel,2473
2004,travelling with a disability,2556


## Get Support

In [47]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml
import re


def clean_text(text):
    """Reduce ``text`` to lowercase ASCII words separated by single spaces.

    Non-letters become spaces, the text is lowercased, separator runs are
    collapsed to one space and surrounding whitespace is removed.
    """
    # Keep ASCII letters only; anything else turns into a space.
    lowered = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Only letters and spaces remain here, so this tag pattern (which needs
    # '&' and ';') has nothing left to match.
    untagged = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", lowered)
    # Collapse each run of digits / non-word characters into a single space.
    squeezed = re.sub("(\\d|\\W)+", " ", untagged)
    return squeezed.strip()


def get_data(url, timeout=30):
    """Download ``url`` and return its body parsed with 'html.parser'.

    ``timeout`` (seconds, default 30) is forwarded to ``requests.get`` so a
    stalled connection raises instead of blocking the notebook forever.
    The HTTP status code is not inspected.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def get_categories(url):
    """Return the category names and links listed on the page at ``url``.

    Every ``<h3 class="CategoryNameHeading">`` contributes the cleaned text
    and the href of its first ``<a>``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(categories, cat_links)`` in page order.
    """
    page = get_data(url)
    categories, cat_links = [], []
    for heading in page.find_all('h3', {'class': 'CategoryNameHeading'}):
        anchor = heading.find('a')
        categories.append(clean_text(anchor.contents[0]))
        cat_links.append(anchor['href'])
    return categories, cat_links

In [48]:
forum_url = 'https://community.themix.org.uk/'
# Scraped board names plus one manually added "recent discussion" entry.
discuss_board = get_categories(forum_url)[0] + ["recent discussion"]
discuss_board

['start here',
 'introduce yourself',
 'help desk',
 'group chat announcements',
 'anything goes',
 'politics and debate',
 'travel and free time',
 'it s exam time',
 'health and wellbeing',
 'coronavirus covid',
 'sex relationships',
 'gender sexuality',
 'drink drugs',
 'home law money',
 'care experienced zone',
 'work volunteering',
 'student life',
 'change the world',
 'represent young people',
 'fix the mix',
 'articles',
 'recent discussion']

In [49]:
# Hand-maintained labels for the group chat services.
group_chat = [
    'group chat',
    'community quiz',
    'watch club',
    'young carer',
    'general chat',
    'support chat',
    'support circle',
    'expert chat',
]

In [50]:
# Hand-maintained labels for the ways of contacting the team.
speak_to_team = [
    'speak to our team',
    'helpline',
    'email us',
    'one to one chat',
    'counselling',
    'crisis messenger',
]

In [51]:
# Single hand-maintained label for the local service finder.
find_local_service = [
    'find local service',
]

In [52]:
# Group the four support label lists under their section names.
support_dict = {
    'discuss board': discuss_board,
    'group chat': group_chat,
    'speak to team': speak_to_team,
    'find local service': find_local_service,
}

support_dict

{'discuss board': ['start here',
  'introduce yourself',
  'help desk',
  'group chat announcements',
  'anything goes',
  'politics and debate',
  'travel and free time',
  'it s exam time',
  'health and wellbeing',
  'coronavirus covid',
  'sex relationships',
  'gender sexuality',
  'drink drugs',
  'home law money',
  'care experienced zone',
  'work volunteering',
  'student life',
  'change the world',
  'represent young people',
  'fix the mix',
  'articles',
  'recent discussion'],
 'group chat': ['group chat',
  'community quiz',
  'watch club',
  'young carer',
  'general chat',
  'support chat',
  'support circle'],
 'speak to team': ['speak to our team',
  'helpline',
  'email us',
  'one to one chat',
  'counselling',
  'crisis messenger'],
 'find local service': ['find local service']}

In [53]:
import json

# Persist the support sections as JSON text.
with open('support.txt', 'w') as out_file:
    json.dump(support_dict, out_file)

## Apps and Tools | Get Involved

In [54]:
# Hand-maintained labels for the apps and tools section.
app_tool = [
    'apps and tools',
    'stressheads',
    'home truths',
    'motimator',
    'our apps',
    'is my relationship healthy',
    'define me',
]

In [55]:
# Hand-maintained labels for volunteering and get-involved roles.
volunteer = [
    'volunteering',
    'helpline volunteering',
    'get involved',
    'jobs',
    'support the mix s loneliness campaign',
    'young ambassador',
    'youth representative',
    'online volunteer counsellor',
    'group chat moderator',
    'support chat moderator',
    'general chat moderator',
    'digital connector',
    'boards moderator',
    'relationship squad',
    'helpline call taker volunteer',
    'skills donation',
    'service innovators committee member',
]

In [56]:
# Hand-maintained labels for the courses / skills section.
skill_up = [
    'courses',
    'boost your skills',
    'problem solving essential skills for problem',
    'course problem',
    'course brand you',
    'money works',
    'consent be better at sex',
    'conflict resolution your relationships course',
    'anxiety depression course',
]

In [57]:
import json

# Wrap the apps-and-tools labels under one key and save them as JSON text.
app_dict = {'apps and tools': app_tool}

with open('app.txt', 'w') as out_file:
    json.dump(app_dict, out_file)

In [58]:
import json

# Wrap the volunteering labels under one key and save them as JSON text.
volunteer_dict = {'volunteer': volunteer}

with open('volunteer.txt', 'w') as out_file:
    json.dump(volunteer_dict, out_file)

In [59]:
import json

# Wrap the skill-up labels under one key and save them as JSON text.
skill_dict = {'skill up': skill_up}

with open('skill.txt', 'w') as out_file:
    json.dump(skill_dict, out_file)

## News and Research

In [60]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml
import re


def clean_text(text):
    """Reduce ``text`` to lowercase ASCII words separated by single spaces.

    Non-letters become spaces, the text is lowercased, separator runs are
    collapsed to one space and whitespace at the beginning and end is
    removed.
    """
    # Keep ASCII letters only; anything else turns into a space.
    lowered = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Only letters and spaces remain here, so this tag pattern (which needs
    # '&' and ';') has nothing left to match.
    untagged = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", lowered)
    # Collapse each run of digits / non-word characters into a single space.
    squeezed = re.sub("(\\d|\\W)+", " ", untagged)
    return squeezed.strip()

def clean_digits(text):
    """Return the second whitespace-separated token of ``text`` as an int.

    Commas in that token are removed before parsing. Raises ``IndexError``
    for fewer than two tokens and ``ValueError`` for a non-integer token.
    """
    tokens = text.split()
    number_token = tokens[1]
    return int(number_token.replace(',', ''))


def get_data(url, timeout=30):
    """Download ``url`` and return its body parsed with 'html.parser'.

    ``timeout`` (seconds, default 30) is forwarded to ``requests.get`` so a
    stalled connection raises instead of blocking the notebook forever.
    The HTTP status code is not inspected.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def get_sub_list(soup):
    """Return the hrefs of the ``subNavOne`` ... ``subNavSeven`` menu entries.

    For each of the seven class names the first matching ``<li>`` is looked
    up in ``soup`` and the ``href`` of its first ``<a>`` is collected, in
    order.
    """
    ordinals = ('One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven')
    return [
        soup.find('li', {'class': 'subNav' + ordinal}).find('a')['href']
        for ordinal in ordinals
    ]





def get_name_and_link(news_url):
    """Return the names and links of seven fixed menu items on ``news_url``.

    Each hard-coded ``menu-item-*`` class is looked up as an ``<li>`` and
    the cleaned text and href of its first ``<a>`` are collected.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(names, links)`` in the order of the class list.
    """
    page = get_data(news_url)
    menu_classes = (
        'menu-item-17440', 'menu-item-17854', 'menu-item-19129',
        'menu-item-19297', 'menu-item-18049', 'menu-item-17767',
        'menu-item-17439',
    )

    names, links = [], []
    for menu_class in menu_classes:
        anchor = page.find('li', {'class': menu_class}).find('a')
        names.append(clean_text(anchor.text))
        links.append(anchor['href'])
    return names, links



In [61]:
# Landing page of the News section; its menu items are read below.
news_url = 'https://www.themix.org.uk/news-and-research/news'
# Display the (names, links) pair for the fixed sub-section menu items.
get_name_and_link(news_url)

(['news',
  'blogs',
  'case studies',
  'research',
  'the mix newsletter',
  'resources',
  'heads together'],
 ['https://www.themix.org.uk/news-and-research/news',
  'https://www.themix.org.uk/news-and-research/blogs',
  'https://www.themix.org.uk/news-and-research/case-studies',
  'https://www.themix.org.uk/news-and-research/research',
  'https://www.themix.org.uk/news-and-research/the-mix-newsletter',
  'https://www.themix.org.uk/news-and-research/resources',
  'https://www.themix.org.uk/news-and-research/heads-together'])

### News info

In [62]:
def get_news_info(news_main_link, max_pages=15):
    """Return titles and page-view counts of the news articles.

    For each listing page ``news_main_link/page/N`` every
    ``your-stories__content`` block is followed to its article, where the
    ``<h1 class="label-news">`` text and the "Page Views" span are read.
    A page contributes only if it was processed without error.

    Parameters
    ----------
    news_main_link : str
        Base URL of the news listing.
    max_pages : int, optional
        Number of listing pages to try (default 15).

    Returns
    -------
    tuple of (list of str, list of int)
        ``(titles, views)``, index-aligned.
    """
    views_marker = re.compile("Page Views")
    all_titles, all_views = [], []
    for page_number in range(1, max_pages + 1):
        page_url = news_main_link + f'/page/{page_number}'
        page_titles, page_views = [], []
        try:
            listing = get_data(page_url)
            for card in listing.find_all('div', {'class': 'your-stories__content'}):
                article = get_data(card.find('a')['href'])
                heading = article.find('h1', {'class': 'label-news'}).text
                counter = article.find("span", title=views_marker).text
                page_titles.append(clean_text(heading))
                page_views.append(clean_digits(counter))
        except Exception as exc:
            # Best effort: skip the page but say why. Only Exception is
            # caught so Ctrl-C (KeyboardInterrupt) still stops the scrape.
            print(f'Skipping {page_url}: {exc!r}')
            continue
        all_titles.extend(page_titles)
        all_views.extend(page_views)
    return all_titles, all_views

In [63]:
news_main_link = 'https://www.themix.org.uk/news-and-research/news'
# Scrape once and unpack: two separate crawls would double the requests and
# could return lists of different lengths for the two columns.
news_titles, news_views = get_news_info(news_main_link)
news_df = pd.DataFrame({'title': news_titles, 'views': news_views})
news_df

Unnamed: 0,title,views
0,the mix supporters are invited to take part in...,1262
1,young people suffer from skin hunger as loneli...,111
2,self harm among young people has got worse dur...,175
3,the mix has partnered with hollister co in aid...,302
4,the mix s new winter campaign with schuh will ...,359
...,...,...
112,do it transfer documentation,7185
113,how young people use mobile phones to seek help,7593
114,mobile focus groups round two,6673
115,focus on mobile,7374


### Blog info

In [64]:
def get_blog_info(blog_main_link, max_pages=15):
    """Return titles and page-view counts of the blog posts.

    For each listing page ``blog_main_link/page/N`` every
    ``masonry-boxes__item`` block is followed to its post, where the
    ``<h1 class="label-blog">`` text and the "Page Views" span are read.
    A page contributes only if it was processed without error.

    Parameters
    ----------
    blog_main_link : str
        Base URL of the blog listing.
    max_pages : int, optional
        Number of listing pages to try (default 15).

    Returns
    -------
    tuple of (list of str, list of int)
        ``(titles, views)``, index-aligned.
    """
    views_marker = re.compile("Page Views")
    all_titles, all_views = [], []
    for page_number in range(1, max_pages + 1):
        page_url = blog_main_link + f'/page/{page_number}'
        page_titles, page_views = [], []
        try:
            listing = get_data(page_url)
            for card in listing.find_all('div', {'class': 'masonry-boxes__item'}):
                post = get_data(card.find('a')['href'])
                heading = post.find('h1', {'class': 'label-blog'}).text
                counter = post.find("span", title=views_marker).text
                page_titles.append(clean_text(heading))
                page_views.append(clean_digits(counter))
        except Exception as exc:
            # Best effort: skip the page but say why. Only Exception is
            # caught so Ctrl-C (KeyboardInterrupt) still stops the scrape.
            print(f'Skipping {page_url}: {exc!r}')
            continue
        all_titles.extend(page_titles)
        all_views.extend(page_views)
    return all_titles, all_views

In [65]:
blog_main_link = 'https://www.themix.org.uk/news-and-research/blogs'
# Scrape once and unpack: two separate crawls would double the requests and
# could return lists of different lengths for the two columns.
blog_titles, blog_views = get_blog_info(blog_main_link)
blog_df = pd.DataFrame({'title': blog_titles, 'views': blog_views})
blog_df

Unnamed: 0,title,views
0,volunteering the mix,3720
1,youth voice steering group,3752
2,my summerunfiltered at the mix,2320
3,a community coming of age,2201
4,how to overcome digital challenges facing the ...,12220
...,...,...
92,insights from mobile support research,6283
93,co creation sessions to shape ambitious new st...,7368
94,exploration into young people s behaviour and ...,6714
95,recommended reading resources exploring how yo...,7080


### Case Study info

In [66]:
def get_cs_info(cs_main_link, max_pages=15):
    """Return titles and page-view counts of the case studies.

    For each listing page ``cs_main_link/page/N`` every
    ``masonry-boxes__item`` block is followed to its page, where the
    ``<h1 class="label-case_study">`` text and the "Page Views" span are
    read. A page contributes only if it was processed without error.

    Parameters
    ----------
    cs_main_link : str
        Base URL of the case-study listing.
    max_pages : int, optional
        Number of listing pages to try (default 15).

    Returns
    -------
    tuple of (list of str, list of int)
        ``(titles, views)``, index-aligned.
    """
    views_marker = re.compile("Page Views")
    all_titles, all_views = [], []
    for page_number in range(1, max_pages + 1):
        page_url = cs_main_link + f'/page/{page_number}'
        page_titles, page_views = [], []
        try:
            listing = get_data(page_url)
            for card in listing.find_all('div', {'class': 'masonry-boxes__item'}):
                study = get_data(card.find('a')['href'])
                heading = study.find('h1', {'class': 'label-case_study'}).text
                counter = study.find("span", title=views_marker).text
                page_titles.append(clean_text(heading))
                page_views.append(clean_digits(counter))
        except Exception as exc:
            # Best effort: skip the page but say why. Only Exception is
            # caught so Ctrl-C (KeyboardInterrupt) still stops the scrape.
            print(f'Skipping {page_url}: {exc!r}')
            continue
        all_titles.extend(page_titles)
        all_views.extend(page_views)
    return all_titles, all_views

In [67]:
cs_main_link = 'https://www.themix.org.uk/news-and-research/case-studies'
# Scrape once and unpack: two separate crawls would double the requests and
# could return lists of different lengths for the two columns.
cs_titles, cs_views = get_cs_info(cs_main_link)
case_study_df = pd.DataFrame({'title': cs_titles, 'views': cs_views})
case_study_df

Unnamed: 0,title,views
0,harry twohig making a positive difference by v...,5420
1,emily found support at the mix,5064
2,glen wiseman sexual health expert and voluntee...,5211
3,elijah hall supporting young people at the mix,4987
4,thomas and james running the london marathon i...,1791
5,jane the mix s chat expert for under s,7396
6,mike from community member to support expert,6746
7,alice matharu proud to be a digital volunteer ...,6006
8,rhys managing his own health by supporting others,5859
9,jacob people found support at the mix and is n...,6498


### Research info

In [68]:
def get_research_title(research_link):
    """Return the cleaned text of every research item on ``research_link``.

    The page is fetched once and each ``<div class="campaigns__item-content">``
    contributes its text, normalised with ``clean_text``, in page order.
    """
    page = get_data(research_link)
    return [
        clean_text(card.text)
        for card in page.find_all('div', {'class': 'campaigns__item-content'})
    ]

In [69]:
# Listing page of the Research sub-section.
research_link = 'https://www.themix.org.uk/news-and-research/research'

# Display the cleaned research titles found on that page.
get_research_title(research_link)

['quarterly data trends impacting young people during lockdown april june',
 'young people and loneliness during the pandemic',
 'young people and self harm during the pandemic',
 'young people and self harm',
 'delivering digital mental health services that work',
 'youth employability pinning down the future of digital badges',
 'the role of digital badges for young people',
 'connected generation report',
 'connecting the dots',
 'hidden homelessness in young people',
 'supporting young people through mobile technology',
 'the role of online offline support for young people who self harm']

In [70]:
# Collect the titles of every News & Research sub-section. News, blog and
# case-study titles are taken from the DataFrames built in the cells above
# rather than crawling those three sections again, so the JSON and the
# DataFrames describe the same crawl.
news_dict = {
    'news': news_df['title'].tolist(),
    'blog': blog_df['title'].tolist(),
    'case study': case_study_df['title'].tolist(),
    'research': get_research_title(research_link),
}

In [71]:
# Display the collected section -> titles mapping.
news_dict

{'news': ['the mix supporters are invited to take part in captaintom to celebrate captain tom s legacy',
  'young people suffer from skin hunger as loneliness increases for under s during the pandemic',
  'self harm among young people has got worse during lockdown',
  'the mix has partnered with hollister co in aid of world teen mental wellness day',
  'the mix s new winter campaign with schuh will support young people with family relationships over christmas',
  'brand new research on bullying shows young men are more likely to be currently bullied',
  'the mix represents young people in public health england phe s new every mind matters campaign',
  'the mix partners with facebook to help families talk about tech use',
  'the mix has launched a brand new coronavirus information hub for young people',
  'the mix and gymshark are launching a partnership to support young people s mental health',
  'the mix celebrates the kindness of young people for mental health awareness week',
  'the

In [72]:
import json

# Persist the News & Research titles as JSON text.
with open('news.txt', 'w') as out_file:
    json.dump(news_dict, out_file)

## Your Voice

In [73]:
def clean_text(text):
    """Reduce ``text`` to lowercase ASCII words separated by single spaces.

    Non-letters become spaces, the text is lowercased, separator runs are
    collapsed to one space and surrounding whitespace is removed.
    """
    # Keep ASCII letters only; anything else turns into a space.
    lowered = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Only letters and spaces remain here, so this tag pattern (which needs
    # '&' and ';') has nothing left to match.
    untagged = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", lowered)
    # Collapse each run of digits / non-word characters into a single space.
    squeezed = re.sub("(\\d|\\W)+", " ", untagged)
    return squeezed.strip()


def clean_digits(text):
    """Return the second whitespace-separated token of ``text`` as an int.

    Commas in that token are removed before parsing. Raises ``IndexError``
    for fewer than two tokens and ``ValueError`` for a non-integer token.
    """
    tokens = text.split()
    number_token = tokens[1]
    return int(number_token.replace(',', ''))


def get_data(url, timeout=30):
    """Download ``url`` and return its body parsed with 'html.parser'.

    ``timeout`` (seconds, default 30) is forwarded to ``requests.get`` so a
    stalled connection raises instead of blocking the notebook forever.
    The HTTP status code is not inspected.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def get_story_info(story_url, max_pages=30):
    """Return titles and page-view counts of the submitted stories.

    For each listing page ``story_url/page/N`` every
    ``your-stories__content`` block supplies the title (link text inside
    its ``flag__body`` div) and a link to the story, whose "Page Views"
    span is read. A page contributes only if it was processed without
    error.

    Parameters
    ----------
    story_url : str
        Base URL of the story listing.
    max_pages : int, optional
        Number of listing pages to try (default 30).

    Returns
    -------
    tuple of (list of str, list of int)
        ``(titles, views)``, index-aligned.
    """
    views_marker = re.compile("Page Views")
    all_titles, all_views = [], []
    for page_number in range(1, max_pages + 1):
        page_url = story_url + f'/page/{page_number}'
        page_titles, page_views = [], []
        try:
            listing = get_data(page_url)
            for card in listing.find_all('div', {'class': 'your-stories__content'}):
                story_link = card.find('a')['href']
                raw_title = card.find(
                    'div', {'class': 'flag__body'}).find('a').text
                title = clean_text(raw_title)
                story = get_data(story_link)
                counter = story.find("span", title=views_marker).text
                page_titles.append(title)
                page_views.append(clean_digits(counter))
        except Exception as exc:
            # Best effort: skip the page but say why. Only Exception is
            # caught so Ctrl-C (KeyboardInterrupt) still stops the scrape.
            print(f'Skipping {page_url}: {exc!r}')
            continue
        all_titles.extend(page_titles)
        all_views.extend(page_views)
    return all_titles, all_views


In [74]:
# Listing page of the user-submitted stories.
story_url = 'https://www.themix.org.uk/your-voices/submissions'
# Display the (titles, views) pair.
# NOTE(review): this runs a full crawl just for display; later cells call
# get_story_info(story_url) again instead of reusing this result.
get_story_info(story_url)

(['isolation',
  'how my disability made me love myself',
  'i love me because',
  'a different perspective',
  'let s talk about men',
  'is the internet bad',
  'men are still human',
  'what your parents might think',
  'it has to get better',
  'i missed my mum and dad',
  'university',
  'moving day yourhometruth',
  'joining the army',
  'young clueless and living alone',
  'sex change',
  'little games blast worries away',
  'whether i have recovered or not',
  'have i recovered',
  'why i love myself',
  'career advice for digital',
  'canada',
  'how your surroundings can help you recover',
  'eli',
  'relaxing after exams',
  'greater recognition for deaf young people campaign',
  'my boyfriend stops me from giving up on myself',
  'schizophrenia health storylines',
  'why i love myself',
  'if i had a whole day to relax',
  'giving up caring what others thought',
  'financial independence understanding financial jargon',
  'getting your finances in line',
  'the government s

In [75]:
# Keep only the titles (first element of the scraper's result) under one key.
story_dict = {'story': get_story_info(story_url)[0]}

In [76]:
import json

# Persist the story titles as JSON text.
with open('story.txt', 'w') as out_file:
    json.dump(story_dict, out_file)

In [77]:
# Scrape once and unpack: two separate crawls would double the requests and
# could return lists of different lengths for the two columns.
story_titles, story_views = get_story_info(story_url)
story_df = pd.DataFrame({'title': story_titles, 'views': story_views})
story_df

Unnamed: 0,title,views
0,isolation,19787
1,how my disability made me love myself,14184
2,i love me because,12088
3,a different perspective,13170
4,let s talk about men,20216
...,...,...
312,aimie s happy box,3655
313,our relationship isn t defined by my bpd,44
314,my happy box hann,95
315,sadness,2966


In [78]:
# Stack the five title/views tables row-wise and save the combined table.
# NOTE(review): each frame keeps its own row labels, so the index written to
# the CSV restarts for every source table - confirm that is intended.
content_frames = [article_df, story_df, news_df, case_study_df, blog_df]
content_df = pd.concat(content_frames, axis=0)
content_df.to_csv('content.csv')