# Web Scraping
#### Project: Extract feedback suggestions from Mutual website
#### Source: feedback.mutual.app
#### Monetization: Non-commercial use
#### Author: Pedro Sanhueza

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [2]:
# Scrape every page of the feedback board (takes roughly 12 seconds).
#
# Results are left in three module-level lists:
#   rows        - one dict per suggestion (page, id, title, author, dates, ...)
#   rows_votes  - one dict per suggestion holding its vote count (as a string)
#   rows_merged - rows and rows_votes combined position by position


def _expand_thousands(text):
    """Expand a trailing 'K' suffix into a plain integer string.

    '2K' -> '2000' and '1.2K' -> '1200'. Text that does not end in 'K' is
    returned unchanged. If the numeric part cannot be parsed, fall back to
    the plain textual substitution of 'K' by '000'.
    """
    if not text.endswith('K'):
        return text
    try:
        return str(int(round(float(text[:-1]) * 1000)))
    except ValueError:
        return text.replace('K', '000')


page = 1
rows = []
rows_votes = []
rows_merged = []
url_base = 'feedback.mutual.app'

while page:
    # Only the page number changes between requests.
    url = f'https://{url_base}/?page={page}&order=popular&filter=all#controls'
    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # List of dicts: multiple attributes, one 'sInfo' div per suggestion.
    info = soup.find_all('div', {'class': 'sInfo'})
    for attribute in info:
        link = attribute.find('a')['href']
        link_parts = link.split('/')

        # The creation date is the text between the parentheses of the
        # first <span>.
        byline = attribute.find_all('span')[0].get_text(strip=True)
        start = byline.find('(') + 1
        end = byline.find(')')

        row = {}
        row['page'] = page
        row['id'] = link_parts[2]
        row['suggestion'] = link_parts[-1].replace('-', ' ').capitalize()
        row['author'] = attribute.find_all('strong')[0].get_text()
        # An apostrophe is replaced by '20' -- presumably turning a
        # two-digit year such as '21 into 2021; verify against the site.
        row['date_created'] = byline[start:end].replace('\'', '20')
        row['suggestion_link'] = f'https://{url_base}{link}'

        # Optional element: the key is simply left out when it is absent.
        last_comment = attribute.find('span', {'class': 'sLastComment'})
        if last_comment is not None:
            # [9:] skips a fixed 9-character prefix of the element text.
            row['last_upvoted'] = last_comment.get_text(strip=True)[9:]

        # [10:] skips a fixed 10-character prefix of the label text.
        row['comments'] = attribute.find_all('span', {'class': 'sLabel'})[0].get_text(strip=True)[10:]

        # Optional element: the key is simply left out when it is absent.
        labels = attribute.find('div', {'class': 'sLabels'})
        if labels is not None:
            row['status'] = labels.get_text(strip=True).replace('Pinned', '')

        rows.append(row)

    # List of dicts: 'votes' attribute, one 'sNumbers' div per suggestion.
    votes = soup.find_all('div', {'class': 'sNumbers'})
    for attribute in votes:
        row = {}
        # [:-5] drops the last five characters of the link text --
        # presumably the word "votes"; verify against the site.
        vote_text = attribute.find('a').get_text(strip=True)[:-5]
        row['votes'] = _expand_thousands(vote_text)
        rows_votes.append(row)

    # Stop when the last <li> of the page carries no link (or when the page
    # has no <li> at all); otherwise move on to the next page.
    list_items = soup.find_all('li')
    if not list_items or list_items[-1].find('a') is None:
        page = False
    else:
        page += 1

# Merge both lists of dicts into a new list. This relies on the 'sInfo' and
# 'sNumbers' blocks appearing in the same order and number on every page.
for idx, row in enumerate(rows):
    rows_merged.append({**row, **rows_votes[idx]})

# ************* DESCRIPTION DICTIONARY (disabled) *************
# Fetches every suggestion page to collect its description. Note that
# rows_description is not defined anywhere in this cell.
# for attribute in info:
#     url = f'https://feedback.mutual.app'+attribute.find('a')['href']
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     row = {}
#     p = soup.find('div',{'class':'suggestionDescription'}).get_text().strip()
#     row['Description'] = p.encode('ascii','replace').decode().replace('???','\'')
#     rows_description.append(row)


In [None]:
# Turn the scraped rows into a DataFrame and archive it as a timestamped CSV.
data = pd.DataFrame(rows_merged)

# Local time formatted as day-month-year plus HHMMSS, used in the file name.
date = datetime.now().strftime("%d-%m-%Y %H%M%S")
filePath = "".join(
    ("../Mutual - Historical Data/MutualFeedback ", date, " - Webscrape.csv")
)

# index=True writes the row index as the first CSV column.
data.to_csv(filePath, index=True)

# Last expression of the cell: display the frame.
data

In [None]:
# Limit DataFrame display to 10 rows.
pd.set_option('display.max_rows', 10)

In [3]:
# Rebuild the working DataFrame from the merged scrape results.
data = pd.DataFrame(data=rows_merged)

In [None]:
# Show the rows whose `votes` string contains a literal dot. A plain
# (non-regex) match avoids the invalid '\.' escape sequence that a
# non-raw string literal would need.
data[data.votes.str.contains('.', regex=False)]

In [None]:
# Inspect the raw `votes` value of the fourth row (position 3).
data['votes'].iloc[3]

In [13]:
# Change dtype: the scraped counts are strings, convert them to floats.
# (astype returns a new object by default, so no copy flag is needed.)
data = data.astype({'votes': 'float', 'comments': 'float'})

# '%b' is the documented, portable directive for the abbreviated month name;
# '%h' only works where the platform C library happens to support it.
_current_month = datetime.now().strftime('%b')

# Substrings that mark a relative creation date instead of a calendar date.
_RELATIVE_DATE_MARKERS = ('days', 'yesterday', 'today')


def _created_month(date_created):
    """Return the abbreviated month name for a `date_created` string.

    Calendar dates such as '07 Jul' or '02 Mar, 2021' yield their second
    space-separated token with any comma removed. Relative dates (containing
    'days', 'yesterday' or 'today') yield the current month, which is an
    approximation for entries created shortly before a month boundary.
    """
    if any(marker in date_created for marker in _RELATIVE_DATE_MARKERS):
        return _current_month
    return date_created.split(' ')[1].replace(',', '')


# Add column with months.
data['date_created_month'] = data['date_created'].apply(_created_month)

In [6]:
# Remove the display row limit so every row is shown.
pd.set_option('display.max_rows', None)

In [8]:
# Count suggestions per creation month. Raises KeyError unless the
# 'date_created_month' column has already been added to `data`.
data.loc[:, 'date_created_month'].value_counts()

KeyError: 'date_created_month'

In [9]:
# Count how many suggestions share each raw `date_created` string.
data.loc[:, 'date_created'].value_counts()

07 Jul          37
02 Mar, 2021    15
09 Jul          14
22 Feb, 2021     7
19 Jun, 2021     6
20 Aug           5
28 Jul           4
24 Sep           4
01 Sep           4
02 Jul, 2021     4
29 Jul           4
04 Apr, 2021     4
08 Jul           4
05 Jul           4
06 Jul, 2021     4
01 Mar, 2021     4
08 Jun, 2021     4
17 Jul, 2021     3
16 Mar, 2021     3
11 Jun           3
21 Mar, 2021     3
27 Jul           3
26 Jun, 2021     3
29 Jun, 2021     3
26 Aug           3
22 Nov           3
21 Aug           3
25 Mar, 2021     3
12 Jul           3
03 Aug           3
18 Mar, 2021     3
03 Oct           3
30 Mar, 2021     3
14 Jun           3
19 Aug           3
19 Jul           3
15 Mar, 2021     3
04 Dec, 2021     3
01 Jul           3
25 Aug           3
23 Jul           3
30 Jul           2
31 Aug, 2021     2
16 Jun           2
12 Nov, 2021     2
25 May, 2021     2
31 May, 2021     2
11 Oct, 2021     2
09 Aug, 2021     2
02 Dec, 2021     2
20 May, 2021     2
19 May, 2021     2
28 Jul, 2021

# Exploratory Data Analysis

In [None]:
import plotly.express as px

In [None]:
# Bar chart: number of suggestions created in each month.
#
# The column names are pinned explicitly ('index' = month,
# 'date_created_month' = count) because the names produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
data_month = (
    data['date_created_month']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='date_created_month')
)
# Calendar order for the x axis (otherwise bars follow the count order).
order = {'index':['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']}

fig = px.bar(
    data_month,
    x='index',
    y='date_created_month',
    color='date_created_month',
    # color_continuous_scale=px.colors.sequential.Viridis,
    # NOTE(review): the scale is picked by list position, so the colours
    # depend on the order named_colorscales() returns in the installed
    # plotly version. '_r' reverses the scale.
    color_continuous_scale=str(px.colors.named_colorscales()[15])+'_r',
    category_orders=order,
    width=1200,
    height=400,
    text_auto=True,
    labels={"index":"Month","date_created_month":"Amount of Feedback"},
    title="Feedback per Month")

fig.update_traces(textfont_size=15)

fig.show()

In [None]:

# Bar chart: number of suggestions per status label.
#
# The column names are pinned explicitly ('index' = status label,
# 'status' = count) because the names produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
data_status = (
    data['status']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='status')
)
# Not used by the chart below; kept in case another cell reads it.
x_axis = data_status['index']

fig = px.bar(
    data_status,
    x='index',
    y='status',
    color='status',
    # color_continuous_scale=px.colors.sequential.Viridis,
    # NOTE(review): the scale is picked by list position, so the colours
    # depend on the order named_colorscales() returns in the installed
    # plotly version. '_r' reverses the scale.
    color_continuous_scale=str(px.colors.named_colorscales()[27])+'_r',
    width=1200,
    height=400,
    text_auto=True,
    title="Status Amount",
    labels={"index":"Category","status":"Status"},
)

# Order the bars from the most to the least frequent status.
fig.update_layout(xaxis={'categoryorder':'total descending'})

fig.update_traces(textfont_size=16)

# Render as a static PNG image instead of an interactive figure.
fig.show('png')

In [None]:
# Top 10 (by vote count) features already implemented.
# Sort first and slice afterwards: slicing before sorting would only reorder
# the first ten 'Done' rows instead of selecting the ten most voted ones.
# The ordering is numeric only once `votes` has been converted to float.
data[data.status=='Done'].sort_values('votes', ascending=False)[:10]

In [None]:
# Ten most voted suggestions whose status is not 'Done'; the first column
# is dropped from the displayed result.
(
    data.loc[data['status'] != 'Done']
    .sort_values(by='votes', ascending=False)
    .head(10)
    .iloc[:, 1:]
)

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
import altair as alt
from vega_datasets import data

source = data.wheat()

base = alt.Chart(source).encode(x='year:O')

bar = base.mark_bar().encode(y='wheat:Q')

line =  base.mark_line(color='red').encode(
    y='wages:Q'
)

(bar + line).properties(width=600)

In [None]:
data.wheat()