# Web Scrapping
#### Project: Extract feedback suggestions from Mutual website
#### Source: feedback.mutual.app
#### Monetization: Non-commercial use
#### Author: Pedro Sanhueza

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [2]:
# 12 seconds

page = 1
rows = []
rows_votes = []
rows_merged = []
url_base = 'feedback.mutual.app'

while page:
    url = f'https://{url_base}/?page={page}&order=popular&filter=all#controls' # url for each page. only change the page number
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # List of dict: mulitple attributes
    info = soup.find_all('div',{'class':'sInfo'})
    for attribute in info:
        row = {}
        row['page'] = page
        row['id'] = attribute.find('a')['href'].split('/')[2]
        row['suggestion'] = attribute.find('a')['href'].split('/')[-1].replace('-',' ').capitalize()
        row['author'] = attribute.find_all('strong')[0].get_text()
        start = attribute.find_all('span')[0].get_text(strip=True).find('(')+1
        end = attribute.find_all('span')[0].get_text(strip=True).find(')')
        row['date_created'] = attribute.find_all('span')[0].get_text(strip=True)[start:end].replace('\'','20')
        link = attribute.find('a')['href']
        row['suggestion_link'] = f'https://feedback.mutual.app{link}'
        try:
            row['last_upvoted'] = attribute.find_all('span',{'class':'sLastComment'})[0].get_text(strip=True)[9:]
        except:
            pass
        row['comments'] = attribute.find_all('span',{'class':'sLabel'})[0].get_text(strip=True)[10:]
        try:
            row['status'] = attribute.find_all('div',{'class':'sLabels'})[0].get_text(strip=True).replace('Pinned','')
        except:
            pass
        rows.append(row)

    # List of dict: 'votes' attribute
    votes= soup.find_all('div',{'class':'sNumbers'})
    for attribute in votes:
        row = {}
        row['votes'] = attribute.find('a').get_text(strip=True)[:-5].replace('K','000')
        rows_votes.append(row)

    
    # if there is not a next page, stop loop
    page+=1
    if soup.find_all('li')[-1].find('a') == None:
        page = False

# Merge both list of dict into new list
for idx, row in enumerate(rows):
    rows_merged.append({**row,**rows_votes[idx]})
    
    # # ************* DESCRIPTION DICTIONARARY *************
    # for attribute in info:
    #     url = f'https://feedback.mutual.app'+attribute.find('a')['href']
    #     response = requests.get(url)
    #     soup = BeautifulSoup(response.text, 'html.parser')
    #     row = {}
    #     p = soup.find('div',{'class':'suggestionDescription'}).get_text().strip()
    #     row['Description'] = p.encode('ascii','replace').decode().replace('???','\'')
    #     rows_description.append(row)


In [None]:
data = pd.DataFrame(rows_merged)

date = datetime.now().strftime("%d-%m-%Y %H%M%S") # get local time as string

filePath = "../Mutual - Historical Data/MutualFeedback " + date + " - Webscrape.csv"

data.to_csv(filePath, index=True) # save to file path

data

In [None]:
pd.options.display.max_rows = 10

In [3]:
data = pd.DataFrame(rows_merged)

In [None]:
pd.DataFrame(rows)

In [4]:
data

Unnamed: 0,page,id,suggestion,author,date_created,suggestion_link,last_upvoted,comments,status,votes
0,1,161005,See who i have liked,Moses,"19 Feb, 2021",https://feedback.mutual.app/suggestions/161005...,today,84,Feedback Needed,1.3000
1,1,170983,Watch an ad to see a person who likes you,Anonymous,"19 Mar, 2021",https://feedback.mutual.app/suggestions/170983...,today,37,Feedback Needed,685
2,1,161962,Last onlinerecently active,Moses,"22 Feb, 2021",https://feedback.mutual.app/suggestions/161962...,today,54,Feedback Needed,618
3,1,177816,Skip a profile save for later,Brian,"11 Apr, 2021",https://feedback.mutual.app/suggestions/177816...,today,42,Feedback Needed,560
4,1,164928,Filter church activity,Moses,"01 Mar, 2021",https://feedback.mutual.app/suggestions/164928...,22 Nov,48,Released,487
...,...,...,...,...,...,...,...,...,...,...
483,10,200115,Set an age dealbreaker,Becca,"26 Jul, 2021",https://feedback.mutual.app/suggestions/200115...,"26 Jul, '21",0,Closed,1
484,10,196186,Hacer un encuentro por ciudades,Belkis Mercedes,"06 Jul, 2021",https://feedback.mutual.app/suggestions/196186...,"06 Jul, '21",0,Closed,1
485,10,194556,Enviar mensagem direta antes de macth mutuo ve...,Saulo Costa de Oliveira,"29 Jun, 2021",https://feedback.mutual.app/suggestions/194556...,"29 Jun, '21",0,Closed,1
486,10,190199,Messages,G,"09 Jun, 2021",https://feedback.mutual.app/suggestions/190199...,"09 Jun, '21",0,Closed,1


In [None]:
data[data.votes.str.contains('\.')]

In [None]:
data.votes.iloc[3]

In [None]:
# change dtype
data = data.astype({'votes': 'float','comments': 'float'}, copy=True)

# add column with months
data['date_created_month'] = data['date_created'].apply(
    lambda x:
    x.split(' ')[1].replace(',','')
    if 'days' not in x
    else datetime.now().strftime('%h'))

In [None]:
pd.options.display.max_rows = None

In [None]:
data['date_created_month'].value_counts()

# Explanatory Data Analysis

In [None]:
import plotly.express as px

In [None]:
data_month = data['date_created_month'].value_counts().reset_index()
order = {'index':['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']}

fig = px.bar(
    data_month,
    x='index',
    y='date_created_month',
    color='date_created_month',
    # color_continuous_scale=px.colors.sequential.Viridis,
    color_continuous_scale=str(px.colors.named_colorscales()[15])+'_r',
    category_orders=order,
    width=1200,
    height=400,
    text_auto=True,
    labels={"index":"Month","date_created_month":"Amount of Feedback"},
    title="Feedback per Month")

fig.update_traces(textfont_size=15)

fig.show()

In [None]:

data_status = data['status'].value_counts().reset_index()
x_axis = data_status['index']

fig = px.bar(
    data_status,
    x='index',
    y='status',
    color='status',
    # color_continuous_scale=px.colors.sequential.Viridis,
    color_continuous_scale=str(px.colors.named_colorscales()[27])+'_r',
    width=1200,
    height=400,
    text_auto=True,
    title="Status Amount",
    labels={"index":"Category","status":"Status"},
)

fig.update_layout(xaxis={'categoryorder':'total descending'})

fig.update_traces(textfont_size=16)

fig.show('png')

In [None]:
# Top 10 (by vote count) features already implemented:
data[data.status=='Done'][:10].sort_values('votes', ascending=False)

In [None]:
# Top 10 votes features not done:
data[data.status!='Done'].sort_values('votes', ascending=False)[:10].iloc[:,1:]

In [None]:
data.head()

In [None]:
import altair as alt
from vega_datasets import data

source = data.wheat()

base = alt.Chart(source).encode(x='year:O')

bar = base.mark_bar().encode(y='wheat:Q')

line =  base.mark_line(color='red').encode(
    y='wages:Q'
)

(bar + line).properties(width=600)

In [None]:
data.wheat()