In [None]:
from bs4 import BeautifulSoup
import urllib.request as req
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from time import sleep
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

## 01. Data Crawling

In [None]:
# Chrome setup: start a browser session that requests English-language pages
options = webdriver.ChromeOptions()
# The preference key is "intl.accept_languages". With the key misspelled the
# Accept-Language preference is never set and the browser keeps its default
# locale, which changes the text the scraper later parses.
# NOTE(review): the value uses "en_US"; language tags are normally written
# with a hyphen ("en-US") — confirm the intended value.
options.add_experimental_option("prefs", {'intl.accept_languages': 'en,en_US'})
driver = webdriver.Chrome(options = options)

In [None]:
# Scrape the basic app information
def crawl_basicInfo():
    """Parse the page currently loaded in ``driver`` and record one app's overview.

    Appends one value to each of the module-level lists ``app_titles``,
    ``app_categories``, ``app_developers``, ``app_ratings``,
    ``app_reviewCnts``, ``app_lastUpdates`` and ``app_downloads``.

    All fields are extracted before any list is modified, so a page with a
    missing element raises without leaving the lists at different lengths
    (they are later combined column-wise into one DataFrame).

    Raises:
        AttributeError: a single-element lookup (title, rating, review count)
            matched nothing.
        IndexError: a multi-element lookup returned fewer elements than indexed.
        ValueError: the rating or review-count text is not numeric.
    """
    doc = BeautifulSoup(driver.page_source, "html.parser")

    # Each class is looked up once and indexed afterwards.
    meta_links = doc.find_all(class_ = "hrTbp R8zArc")   # [0] -> category, [1] -> developer
    info_fields = doc.find_all(class_ = "htlgb")         # [0] -> last update, [4] -> downloads

    title = doc.find(class_ = "AHFaub").text
    category = meta_links[0].text
    developer = meta_links[1].text
    rating = float(doc.find(class_ = "BHMmbe").text)
    # The review count is rendered with thousands separators, e.g. "1,234,567".
    review_cnt = int(doc.find(class_ = "AYi5wd TBRnV").text.replace(",", ""))
    last_update = info_fields[0].text
    downloads = info_fields[4].text

    # Nothing above raised, so every list grows by exactly one entry.
    app_titles.append(title)
    app_categories.append(category)
    app_developers.append(developer)
    app_ratings.append(rating)
    app_reviewCnts.append(review_cnt)
    app_lastUpdates.append(last_update)
    app_downloads.append(downloads)

In [None]:
# Read more reviews
def click_readMore():
    """Click the "read more reviews" element of the app page loaded in ``driver``.

    The element is located by an absolute XPath through
    ``driver.find_element(By.XPATH, ...)``. The ``find_element_by_*`` helper
    methods were removed in Selenium 4.3, while the ``By``-based form is
    available in both older and current releases.

    Raises:
        NoSuchElementException: raised by Selenium when the XPath matches nothing.
    """
    driver.find_element(By.XPATH, '//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/div/main/div/div[1]/div[6]/div').click()

In [None]:
# Load target amount of reviews
def load_reviews(max_stalled_rounds=15):
    """Scroll the review page until enough review cards are present in the DOM.

    Each round scrolls to the bottom of the page, sends PAGE_UP to the body,
    waits two seconds, counts the elements with class ``X43Kjb`` and clicks
    the extra "load more" element if it is present.

    Args:
        max_stalled_rounds: number of consecutive rounds in which the card
            count may fail to grow before the function gives up. Without this
            limit the loop never ends for an app that has fewer reviews than
            requested.

    The function reads the module-level ``driver`` and ``targetCnt``. A
    ``RuntimeWarning`` is issued when it gives up before reaching the target.
    """
    import warnings

    # crawl_reviews() slices every per-review list as [4:targetCnt + 4], so
    # targetCnt + 4 cards have to be loaded before scraping can return
    # targetCnt reviews.
    skipped_cards = 4
    required = targetCnt + skipped_cards

    loaded = 0
    stalled = 0
    while loaded < required and stalled < max_stalled_rounds:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_UP)
        sleep(2)

        previous = loaded
        loaded = len(driver.find_elements(By.CLASS_NAME, "X43Kjb"))
        if loaded > previous:
            stalled = 0
        else:
            stalled += 1

        # The extra "load more" element is not always on the page; its
        # absence is expected and deliberately ignored.
        try:
            driver.find_element(By.XPATH, "//*[@id='fcxH9b']/div[4]/c-wiz[2]/div/div[2]/div/div/main/div/div[1]/div[2]/div[2]/div").click()
        except NoSuchElementException:
            pass

    if loaded < required:
        warnings.warn(
            "load_reviews stopped at %d review cards (needed %d): the count did not grow for %d rounds"
            % (loaded, required, max_stalled_rounds),
            RuntimeWarning,
        )

In [None]:
# Click all 'Full Review' buttons
def click_fullReviews():
    """Click every button whose ``jsaction`` attribute is ``click:TiglPc``.

    The buttons are collected with ``driver.find_elements(By.XPATH, ...)``
    (the ``find_elements_by_*`` helpers were removed in Selenium 4.3) and
    clicked through JavaScript via ``driver.execute_script``. When no button
    matches, nothing is clicked.
    """
    for button in driver.find_elements(By.XPATH, "//button[@jsaction='click:TiglPc']"):
        driver.execute_script("arguments[0].click();", button)

In [None]:
# Crawl review data
def crawl_reviews():
    """Scrape up to ``targetCnt`` loaded reviews into the module-level review lists.

    Reviewer names, dates and up-vote counts are read from a BeautifulSoup
    snapshot of ``driver.page_source``; star ratings and comment texts are
    read from live Selenium elements. The results are appended to ``names``,
    ``dates``, ``ratings``, ``upVotes`` and ``comments``.

    All five columns are built before any list is extended, so an exception
    while parsing does not leave the lists at different lengths.

    Raises:
        IndexError: a rating's ``aria-label`` contains no digit, or there are
            fewer full-comment spans than short-comment spans.
        ValueError: an up-vote text is not an integer.
    """
    # The first four matches of every per-review selector are skipped.
    # NOTE(review): presumably these belong to the preview reviews of the app
    # page that remain in the DOM — confirm.
    skip = 4
    window = slice(skip, targetCnt + skip)

    doc = BeautifulSoup(driver.page_source, "html.parser")

    new_names = [tag.text for tag in doc.find_all(class_ = "X43Kjb")[window]]
    new_dates = [tag.text for tag in doc.find_all(class_ = "p2TkOb")[window]]

    # The rating is taken as the first digit found in the element's aria-label.
    stars = driver.find_elements(By.XPATH, "//span[@class='nt2C1d']/div[@class='pf5lIe']/div[@role='img']")
    new_ratings = [int(re.findall(r'\d', star.get_attribute('aria-label'))[0]) for star in stars[window]]

    # Thousands separators are stripped, matching how crawl_basicInfo parses the review count.
    new_upVotes = [int(tag.text.replace(",", "")) for tag in doc.find_all(class_ = "jUL89d y92BAb")[window]]

    full_spans = driver.find_elements(By.XPATH, "//span[contains(@jsname, 'fbQN7e')]")[skip:]
    short_spans = driver.find_elements(By.XPATH, "//span[contains(@jsname, 'bN97Pc')]")[skip:]

    # Use the 'bN97Pc' span's text; when it is empty, fall back to the
    # 'fbQN7e' span at the same position. Each element's text is read once.
    new_comments = []
    for i, short in enumerate(short_spans[:targetCnt]):
        short_text = short.text
        new_comments.append(short_text if short_text != '' else full_spans[i].text)

    names.extend(new_names)
    dates.extend(new_dates)
    ratings.extend(new_ratings)
    upVotes.extend(new_upVotes)
    comments.extend(new_comments)

In [None]:
# Run the crawling pipeline on the first two entries of `urls`, in order.
# NOTE(review): `urls`, `targetCnt` and the result lists the helpers fill are
# not defined in the cells shown here — confirm a configuration cell defines
# them before this cell runs.
for i in range(2):
    driver.get(urls[i])      # open the app page
    crawl_basicInfo()        # overview fields of the app
    click_readMore()         # switch to the review list
    load_reviews()           # scroll until enough reviews are loaded
    click_fullReviews()      # expand the 'Full Review' buttons
    crawl_reviews()          # collect the per-review fields

In [None]:
# Overview table for ZOOM and Teams: one row per crawled app,
# one column per list filled by crawl_basicInfo()
df_overview = pd.DataFrame(
    {
        "Title": app_titles,
        "Category": app_categories,
        "Developer": app_developers,
        "Rating": app_ratings,
        "# of Reviews": app_reviewCnts,
        "# of Downloads": app_downloads,
        "Last Update": app_lastUpdates,
    }
)

In [None]:
# Separate DataFrames for the user reviews of ZOOM and Teams.
# doubleCnt is the end index of the second app's slice in the shared review lists.
doubleCnt = 2 * targetCnt

In [None]:
# Reviews of the first crawled app: entries [0, targetCnt) of each review list.
# NOTE(review): assumes urls[0] is the ZOOM page — confirm.
df_zoom = pd.DataFrame(
    {
        "Reviewer": names[:targetCnt],
        "Date": dates[:targetCnt],
        "Rating": ratings[:targetCnt],
        "UpVote": upVotes[:targetCnt],
        "Comment": comments[:targetCnt],
    }
)

In [None]:
# Reviews of the second crawled app: entries [targetCnt, doubleCnt) of each review list.
# NOTE(review): assumes urls[1] is the Microsoft Teams page — confirm.
df_teams = pd.DataFrame(
    {
        "Reviewer": names[targetCnt:doubleCnt],
        "Date": dates[targetCnt:doubleCnt],
        "Rating": ratings[targetCnt:doubleCnt],
        "UpVote": upVotes[targetCnt:doubleCnt],
        "Comment": comments[targetCnt:doubleCnt],
    }
)

In [None]:
# Bar chart comparing the number of collected reviews per application
plt.bar(
    x=['ZOOM', 'Microsoft Teams'],
    height=[len(df_zoom), len(df_teams)],   # row count of each review frame
    color='Red',
    alpha=0.4,
)
plt.title("Number of Reviews by Application", fontsize=15)
plt.ylabel('Review Counts', fontsize=13)
plt.show()