# SPIDER
## SPam IDentifier for E-commerce Reviews

In [63]:
import re
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta as time_ago
from dateutil.parser import parser
import pandas as pd
import numpy as np
import requests
import string
import random
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import time

# Scraper Functions

In [64]:
def make_folder(fn):
    """
    Creates a folder fn (and any missing parent folders) if it doesn't exist.

    fn (str): path of the folder to create.
    Nothing happens when fn already exists, whether as a folder or a file.
    """
    if not os.path.exists(fn):
        # exist_ok covers the window between the check above and the call
        # below, in case something else creates the folder in the meantime.
        os.makedirs(fn, exist_ok=True)

In [65]:
def make_nested_folders(folder_structure=None):
    """
    Builds a one-level-deep folder tree relative to the current directory.

    folder_structure (dict): maps each outer folder to the names of the
        folders to create directly inside it, e.g.
        {'outer_folder1': ['subfolders', 'of', 'outer_folder1']} creates
            outer_folder1/
                      subfolders/
                      of/
                      outer_folder1/
    Passing None (the default) does nothing.
    """
    if folder_structure is None:
        return
    for outer, inner_names in folder_structure.items():
        make_folder(outer)
        for inner in inner_names:
            make_folder('/'.join((outer, inner)))

In [66]:
def download_html(browser, asin, page_number, reason=''):
    """
    Saves the browser's current page source to
    <asin>/html/<page_number>_<reason>.html (utf-8).

    browser: object exposing a page_source attribute.
    asin (str): folder prefix; its html/ sub-folder must already exist.
    page_number: converted with str() for the file name.
    reason (str): optional tag appended after the underscore.
    """
    page_markup = browser.page_source
    target_path = ''.join([asin, '/html/', str(page_number), '_', reason, '.html'])
    with open(target_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(page_markup)

In [67]:
def nap(lb=0, ub=1):
    """
    Pauses for a random number of seconds drawn uniformly between a
    lower bound (lb) and an upper bound (ub).
    Meant to make the scrape look less robotic and avoid robot checks.
    """
    pause_seconds = random.uniform(lb, ub)
    time.sleep(pause_seconds)

In [128]:
def pause_up_up(browser, current_height):
    """
    Another robot-check countermeasure: starting from current_height,
    scrolls the page up 3-4 times by a random 80-140 pixels each,
    with a short nap after every scroll.
    """
    scroll_y = current_height
    wiggle_count = random.randint(3, 4)
    for _ in range(wiggle_count):
        scroll_y -= random.randint(80, 140)
        browser.execute_script("window.scrollTo(0, %s)" % scroll_y)
        nap(.3, 1.15)

In [152]:
def control_scroll(browser):
    """
    Scrolls down the page in three jittered steps with random pauses,
    printing the nominal height of each step.

    Each nominal height (151, 340, 555) is offset by a random -150..100
    pixels. After every scroll there is a short nap, followed at random by
    either a longer or a very short pause. (The scroll-back-up wiggle via
    pause_up_up is not called here.)
    """
    nap(.75, 2.2)
    for base_y in (151, 340, 555):
        print(base_y)
        target_y = base_y + random.randint(-150, 100)
        browser.execute_script("window.scrollTo(0, %s)" % target_y)
        nap(.3, .8)
        take_long_pause = random.randint(1, 2) > 1
        if take_long_pause:
            nap(1.4, 2.6)
        else:
            nap(.2, .6)

In [153]:
def get_already_scraped(prods_or_profs='products'):
    """
    Lists the entries of ./products (when the argument starts with 'prod')
    or of ./profiles (anything else).

    NOTE: an identical function is defined again later in this file under
    "I/O functions"; the later definition is the one in effect after both
    cells have run.
    """
    folder = "products" if prods_or_profs[:4] == 'prod' else "profiles"
    return os.listdir(folder)

In [154]:
def finish_incomplete_scrapes():
    """
    Finds products whose scrape aborted before completion.

    A product counts as incomplete when its products/<name>/raw/ folder
    holds exactly one file and that file's name contains 'ERROR' (the
    partial dump start_scrape writes when a page fails).

    Returns a dict mapping each incomplete product name to the DataFrame
    read from its ERROR csv. Each incomplete name is also printed.
    """
    incomplete_asins = {}
    for name in get_already_scraped('products'):
        data_path = "products/" + name + "/raw/"
        # Entries of products/ without a raw/ folder (stray or hidden files)
        # are skipped instead of letting os.listdir raise.
        if not os.path.isdir(data_path):
            continue
        # List the folder once; the result does not change per file.
        raw_files = os.listdir(data_path)
        if len(raw_files) == 1 and 'ERROR' in raw_files[0]:
            incomplete_asins[name] = pd.read_csv(data_path + raw_files[0])
            print(name)
    return incomplete_asins

In [155]:
def aggregate_data(prods_or_profs='products'):
    """
    Loads every scraped csv of one kind and stacks them into one DataFrame.

    prods_or_profs (str): anything starting with 'prod' reads
        products/<name>/raw/complete.csv, anything else reads
        profiles/<name>/profile.csv.

    Names without that csv are reported as incomplete and skipped; files
    that cannot be read are reported and skipped. Frames that have a 'body'
    column but no 'asin' column get 'asin' filled with the folder name.
    Exact duplicate rows are dropped.

    'profile_url' is then normalised: for products everything from '/ref'
    onwards is cut; for profiles the last character is dropped
    (presumably a trailing '/' - verify against the scraped urls).

    NOTE: this function is defined twice in this file; the later definition
    is the one in effect once both cells have run.
    """
    already_scraped = get_already_scraped(prods_or_profs)
    target_is_product = prods_or_profs[:4] == 'prod'
    agg_data = []

    for name in already_scraped:
        if target_is_product:
            data_path = "products/" + name + "/raw/complete.csv"
        else:
            data_path = "profiles/" + name + "/profile.csv"
        if not os.path.exists(data_path):
            print(name, 'has incomplete data')
            continue
        try:
            df = pd.read_csv(data_path)
        except Exception as err:
            # Best effort: report the unreadable file and move on, while
            # still letting KeyboardInterrupt / SystemExit through.
            print(name, err)
            continue
        if ('body' in df.columns) and ('asin' not in df.columns):
            df['asin'] = name
        agg_data.append(df)

    agg_df = pd.concat(agg_data)
    # drop_duplicates returns a new frame, so the column assignment below
    # does not write into a slice of the concatenated frame.
    agg_df = agg_df.drop_duplicates()
    if target_is_product:
        agg_df['profile_url'] = agg_df['profile_url'].apply(lambda x: x.split('/ref')[0])
    else:
        agg_df['profile_url'] = agg_df['profile_url'].apply(lambda x: x[:-1])
    return agg_df

In [156]:
def scrape_sus_profiles(sus_urls):
    """
    Scrapes each profile url in turn with scrape_sus_profile and returns
    all review rows as one flat list. The browser opened for a profile is
    closed once its rows have been collected.
    """
    collected = []
    for url in sus_urls:
        driver, reviews = scrape_sus_profile(url)
        collected.extend(reviews)
        driver.close()
    return collected

In [157]:
def scrape_asins(asins):
    """
    Runs start_scrape for every ASIN, closing each browser afterwards and
    napping 1-2 seconds between products.

    Returns a list with one entry per ASIN, in order: whatever start_scrape
    returned as its second value (a DataFrame, or None when that scrape
    hit an error).
    """
    results = []
    for asin in asins:
        driver, review_frame = start_scrape(asin)
        driver.close()
        print('done with ', asin)
        nap(1, 2)
        results.append(review_frame)
    return results

In [158]:
# def get_sus_asins():
#     """do after scraping profile pages (profile_urls)"""
#     pdf = aggregate_data('prof')
#     adf = aggregate_data()
    
#     pdf_new_asins = pdf[~pdf.asin.isin(adf.asin.unique())]
#     return pdf_new_asins

In [159]:
# def get_sus_profiles():
#     """do after scraping review pages (asins)"""
#     pdf = aggregate_data('prof')
#     scraped_profs = set(pdf['profile_url'].unique())

#     adf = aggregate_data()
#     adf = adf[adf['stars'] == 5]
#     agb_asin = adf.groupby(['asin'])
#     sus_prof_list = []
#     for g, adf_purl in agb_asin['profile_url']:
#         review_profs = set(adf_purl.values)
#         overlap = scraped_profs & review_profs
#         if (len(overlap) > 1) and (len(overlap) < len(scraped_profs)):
#             sus_prof_list.extend(
#                 list(review_profs - scraped_profs)
#             )
#     sus_profs = adf[adf['profile_url'].isin(sus_prof_list)]
#     #     adf = adf[~adf['asin'].isin(pdf['asin'].unique())]
#     return sus_profs

# Review Scraper Functions

In [160]:
def conjure_url(asin, page_number):
    """
    Builds the Amazon product-reviews url for an ASIN and page number.

    NOTE(review): the parameters are joined to the path with '/&' rather
    than '?'; confirm that the site honours them in this form.
    """
    base = 'https://www.amazon.com/product-reviews/'
    params = '/&reviewerType=all_reviews&pageNumber='
    return base + asin + params + str(page_number)

In [161]:
def parse_helpfuls(review_soup):
    """
    Reads the number of "helpful" votes from one review.

    The first space-separated token of the 'helpful-vote-statement' span
    is used:
      - a number (thousands separators allowed) is returned as an int;
      - a non-numeric token longer than two characters (a spelled-out
        count such as "One") counts as 1;
      - a missing span or anything else yields 0.
    """
    try:
        leading_token = review_soup.find(
            'span', {'data-hook': 'helpful-vote-statement'}
        ).text.strip().split(' ')[0]
    except AttributeError:
        # No helpful-vote statement on this review.
        return 0
    try:
        # Parse digits first so counts of three or more characters
        # ("100", "1,024") are not mistaken for a spelled-out "One".
        return int(leading_token.replace(',', ''))
    except ValueError:
        return 1 if len(leading_token) > 2 else 0

In [162]:
def get_profile_url(review_soup):
    """
    Returns the href of the reviewer link inside one review, or False
    (after printing a message) when it cannot be found.

    The link is taken to be the last child element of the
    'review-author' span.
    """
    try:
        author_block = review_soup.find('span', {'data-hook': 'review-author', 'class': 'a-size-base'})
        author_name = author_block.findChildren()[-1]
        profile_url = author_name['href']
    except Exception:
        # Best effort on malformed markup; Exception (not a bare except)
        # so Ctrl-C / SystemExit still stop the scrape.
        print('profile url: something went wrong!')
        return False
    return profile_url

In [163]:
def parse_review_page(soup):
    """
    Extracts one dict per review from a parsed review page.

    soup: parsed page; every element with class 'a-section review' is
        treated as one review.

    Each dict has the keys profile_url, stars, date_posted, body,
    num_helpfuls and is_verified. A field that cannot be parsed is reported
    on stdout and set to None (is_verified falls back to False). When the
    date text is found but cannot be interpreted, the raw text is kept.
    """
    review_data_list = []
    for review_soup in soup.find_all(class_='a-section review'):
        profile_url = get_profile_url(review_soup)
        num_helpfuls = parse_helpfuls(review_soup)

        try:
            # Only the first four characters of the alt text are converted.
            stars = float(review_soup.find(class_='a-icon-alt').text[:4])
        except Exception:
            print('parsing stars: something went wrong!')
            stars = None

        try:
            date_posted = review_soup.find(
                class_='a-size-base a-color-secondary review-date').text.strip()
        except Exception:
            print('parsing date: something went wrong!')
            date_posted = None
        else:
            # Drop a leading "on " as a prefix; str.strip('on') would strip
            # any leading/trailing 'o' or 'n' characters instead.
            if date_posted.startswith('on '):
                date_posted = date_posted[3:].strip()
            try:
                date_posted = parser().parse(date_posted)
            except Exception:
                print('date parsesr fucked up')

        try:
            body = review_soup.find(class_='a-size-base review-text').text
        except Exception:
            print('parsing body: something went wrong!')
            body = None

        try:
            is_verified = review_soup.find(
                class_='a-size-mini a-color-state a-text-bold').text == 'Verified Purchase'
        except Exception:
            is_verified = False

        review_data_list.append({
            'profile_url': profile_url,
            'stars': stars,
            'date_posted': date_posted,
            'body': body,
            'num_helpfuls': num_helpfuls,
            'is_verified': is_verified,
        })
    return review_data_list

In [164]:
def get_page_count(browser, soup):
    """
    Returns the number of review pages, read from the text of the last
    'page-button' element.

    browser: live selenium browser, used only for the retry.
    soup: parsed source of the page currently shown in browser.

    If the buttons cannot be read, the page is refreshed once and its
    source is parsed again; if that fails too, 1 is returned.
    """
    def last_page_button(page_soup):
        return int(page_soup.find_all(class_='page-button')[-1].text)

    try:
        number_of_pages = last_page_button(soup)
    except Exception:
        browser.refresh()
        nap(1.25, 1.5)
        try:
            # Parse the refreshed page: the soup passed in predates the
            # refresh, so re-reading it could never change the outcome.
            number_of_pages = last_page_button(bs(browser.page_source, 'lxml'))
        except Exception:
            number_of_pages = 1
    return number_of_pages

In [167]:
def start_scrape(asin, path_to_chromedriver=None):
    """
    Scrapes all review pages of one product by clicking through the pager.

    asin (str): product id; output goes to products/<asin>/.
    path_to_chromedriver (str): chromedriver location, defaults to
        ./chromedriver.

    The html of every page is saved under products/<asin>/html/. On success
    the reviews are written to products/<asin>/raw/complete.csv (with an
    'asin' column) and (browser, DataFrame) is returned. If a page fails,
    the reviews gathered so far go to raw/ERROR-<page>.csv and
    (browser, None) is returned. The browser is left open in every case.

    Paging stops early, and the result is still saved as complete, when
    clicking "next" does not change the url.
    """
    product_dir = 'products/' + asin

    def to_frame(rows):
        # Tag every row with its ASIN so the single-page and multi-page
        # outputs share the same columns.
        frame = pd.DataFrame(rows)
        frame['asin'] = asin
        return frame

    if not path_to_chromedriver:
        path_to_chromedriver = os.path.abspath('chromedriver')
    browser = webdriver.Chrome(path_to_chromedriver)
    make_folder('products')
    make_nested_folders({product_dir: ['html', 'raw', 'processed']})
    review_data = []
    browser.get(conjure_url(asin, 1))
    nap(.85, 1.25)
    soup = bs(browser.page_source, 'lxml')
    num_pages = get_page_count(browser, soup)
    review_data.extend(parse_review_page(soup))
    download_html(browser, product_dir, 1)
    print('Page %s of %s scraped' % (1, num_pages))
    if num_pages == 1:
        rdf = to_frame(review_data)
        rdf.to_csv(product_dir + '/raw/complete.csv', index=False)
        return browser, rdf
    prev_page = browser.current_url
    for page_num in range(2, num_pages + 1):
        try:
            next_button = browser.find_elements_by_class_name('a-last')[0]
            next_button.click()
            print(1)
            nap(.35, .7)
            if prev_page == browser.current_url:
                # The click did not navigate anywhere: stop paging.
                break
            prev_page = browser.current_url
            print(2)
            nap()
            soup = bs(browser.page_source, 'lxml')
            review_data.extend(parse_review_page(soup))
            print(3)
            nap()
            download_html(browser, product_dir, page_num)
            print('Page %s of %s scraped' % (page_num, num_pages))
        except Exception:
            # Keep what was gathered so the scrape can be inspected later;
            # Exception (not a bare except) so Ctrl-C still aborts.
            print('SOMETHING WENT WRONG WITH page %s of %s' % (page_num, num_pages))
            download_html(browser, product_dir, page_num, 'ERROR')
            rdf = pd.DataFrame(review_data)
            rdf.to_csv(product_dir + '/raw/ERROR-' + str(page_num) + '.csv', index=False)
            return browser, None
    rdf = to_frame(review_data)
    rdf.to_csv(product_dir + '/raw/complete.csv', index=False)
    return browser, rdf

# Profile Scraper Functions

In [144]:
def parse_profile_name(soup):
    """
    Returns the text of the 'name-container' element of a parsed profile
    page, or None when it cannot be read.
    """
    try:
        profile_name = soup.find(class_='name-container').text
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
        profile_name = None
    return profile_name

In [77]:
def parse_num_reviews(browser):
    """
    Reads the review count from the profile dashboard and clicks it.

    The second 'dashboard-desktop-stat-value' element is taken to be the
    review count; a warning is printed when the page does not show exactly
    four stats.

    Returns (num_reviews, browser). num_reviews is None when the count
    cannot be read or the element cannot be clicked.
    """
    prof_stats = browser.find_elements_by_class_name('dashboard-desktop-stat-value')
    if len(prof_stats) != 4:
        print('PROFILE DOES NOT HAVE EXACTLY 4 PROF STATS')
    try:
        review_count = prof_stats[1]
        # Accept counts shown with thousands separators, e.g. "1,234".
        num_reviews = int(review_count.text.replace(',', ''))
        review_count.click()
    except Exception:
        num_reviews = None
        print('>>>>>>>    COULD not parse prof stats')
    return num_reviews, browser

In [78]:
def parse_prod_name(container):
    """
    Returns container.text (the product name of a review card), or None
    after printing a message when it cannot be read.
    """
    try:
        prod_name = container.text
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
        print('couldnt parse product name')
        prod_name = None
    return prod_name

In [79]:
def parse_rating(head_element):
    """
    Reads the star rating from the css classes of the first <i> element
    inside head_element.

    Classes are scanned last-to-first for one containing 'a-star'. If the
    part after its last '-' is all digits, that number is returned as an
    int. Otherwise a message is printed and the class string itself is
    remembered as the result. Returns None when no class contains
    'a-star' or when the element cannot be read.
    """
    # Initialised up front so the final return never hits an unbound name
    # when no class matches.
    stars = None
    try:
        star_element = head_element.find('i')
        for css_class in star_element['class'][::-1]:
            if 'a-star' not in css_class:
                continue
            rating = css_class.split('-')[-1]
            if rating.isdigit():
                return int(rating)
            stars = css_class
            print('@#$@#$@# error parsing rating')
    except Exception:
        print('parsing rating ERROR #!@$#@$')
        stars = None
    return stars

In [80]:
def parse_review_title(head_element):
    """
    Returns the text of the 'glimpse-review-title' element inside
    head_element, or None after printing a message when it cannot be read.
    """
    try:
        review_title = head_element.find(class_='glimpse-review-title').text
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
        review_title = None
        print(' encountered      review     title    e rr ro ro rrrr oror ')
    return review_title

In [81]:
def parse_body_snippet(container):
    """
    Returns container.text (the review snippet of a review card), or None
    after printing a message when it cannot be read.
    """
    try:
        body_snippet = container.text
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
        print('no review snippet to be found ' * 8)
        body_snippet = None
    return body_snippet

In [82]:
def scrape_sus_profile(profile_url, path_to_chromedriver=None):
    """
    Scrapes the public review cards shown on one Amazon profile page.

    profile_url (str): site-relative profile path (it is appended to
        'https://www.amazon.com'); the text between 'account.' and the next
        '/' is used as the profile id and folder name.
    path_to_chromedriver (str): chromedriver location, defaults to
        ./chromedriver.

    When at least one review card is found, the page html is saved under
    profiles/<profile_id>/html/ and one row per review is written to
    profiles/<profile_id>/profile.csv.

    Returns (browser, profile_reviews): the still-open browser and a list
    of row dicts. A profile without public reviews yields one placeholder
    row; in that case neither the html nor the csv is written.
    """
    def suffix_to_url(x): return 'https://www.amazon.com' + x

    def collect_cards():
        # Read the live review cards and parse the *current* page source,
        # so the containers reflect whatever has loaded since the last look.
        cards = browser.find_elements_by_id('glimpse-ephemeral-metadata')
        fresh_soup = bs(browser.page_source, 'lxml')
        return cards, fresh_soup.findAll(class_='glimpse-flex-container')

    def build_record(prod_asin=None, review_date=None, prod_name=None,
                     stars=None, review_title=None, body_snippet=None):
        # Key order is kept stable: it determines the csv column order.
        return {
            'profile_id': profile_id,
            'profile_name': profile_name,
            'num_reviews': num_reviews,
            'num_public_reviews': num_public_reviews,
            'asin': prod_asin,
            'review_date': review_date,
            'prod_name': prod_name,
            'stars': stars,
            'review_title': review_title,
            'body_snippet': body_snippet,
            'profile_url': profile_url
        }

    if not path_to_chromedriver:
        path_to_chromedriver = os.path.abspath('chromedriver')
    browser = webdriver.Chrome(path_to_chromedriver)
    profile_id = profile_url.split('account.')[-1].split('/')[0]
    make_folder('profiles')
    make_nested_folders({'profiles/' + profile_id: ['html']})
    browser.get(suffix_to_url(profile_url))
    nap(1.6, 3.7)
    browser.execute_script("window.scrollTo(0, %s)" % 550)

    num_reviews, browser = parse_num_reviews(browser)
    nap(1.9, 2.5)
    browser.execute_script("window.scrollTo(0, %s)" % 650)

    soup = bs(browser.page_source, 'lxml')
    profile_name = parse_profile_name(soup)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    nap(2, 2.5)
    review_cards = browser.find_elements_by_id('glimpse-ephemeral-metadata')
    if not review_cards:
        # Give the page a little longer before concluding there are none.
        nap(1.4, 1.7)
        review_cards = browser.find_elements_by_id('glimpse-ephemeral-metadata')
    num_public_reviews = len(review_cards)
    profile_reviews = []
    if num_public_reviews == 0:
        print('no public reviews')
        profile_reviews.append(build_record())
        return browser, profile_reviews

    browser.execute_script("window.scrollTo(0, %s)" % 350)
    nap(1, 1.5)
    browser.execute_script("window.scrollTo(0, %s)" % 950)
    nap(.25, .6)
    review_cards, containers = collect_cards()
    num_public_reviews = len(review_cards)
    if num_reviews is None:
        # The dashboard count could not be read, so there is nothing to
        # subtract from.
        print('---  Could not read the review count, skipping missing-review check')
    elif num_public_reviews != num_reviews:
        print('---  There are %s reviews missing from profile' %
              (num_reviews - num_public_reviews))

    # Each review card is expected to come with three containers
    # (product name, rating/title, snippet). Comparing by multiplication
    # avoids dividing by zero when no cards are found.
    if len(containers) != 3 * num_public_reviews:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        nap(.4, .7)
        review_cards, containers = collect_cards()
        num_public_reviews = len(review_cards)
        print(len(containers), num_public_reviews)
        print('containers != 3 * num public reviews')
    download_html(browser, 'profiles/' + profile_id, 1)

    for i, product_review in enumerate(review_cards):
        if 3*i + 2 >= len(containers):
            # Not enough containers left for a full review.
            break
        try:
            prod_asin = product_review.get_attribute('data-product-id')
        except Exception:
            print('ERROR PARSING ASIN')
            prod_asin = None

        # Bound before the try so the except branch can always print it.
        review_date = None
        try:
            review_date = product_review.get_attribute(
                'data-activity-timestamp')
            if len(review_date) < 10:
                print('CANT PARSE REVIEW DATE')
            else:
                # The first ten characters are read as a timestamp in seconds.
                review_date = dt.fromtimestamp(int(review_date[:10]))
        except Exception:
            print(review_date)
            print('ERROR PARSING review_date')
            review_date = None

        prod_name = parse_prod_name(containers[3*i])
        try:
            head_element = containers[3*i + 1]
            stars = parse_rating(head_element)
            review_title = parse_review_title(head_element)
        except Exception:
            print('couldnt find head element    -    no review title OR product name')
            review_title = None
            stars = None
        body_snippet = parse_body_snippet(containers[3*i + 2])
        profile_reviews.append(build_record(
            prod_asin=prod_asin,
            review_date=review_date,
            prod_name=prod_name,
            stars=stars,
            review_title=review_title,
            body_snippet=body_snippet,
        ))
    pr_df = pd.DataFrame(profile_reviews)
    pr_df.to_csv('profiles/' + profile_id + '/profile.csv', index=False)
    return browser, profile_reviews

# I/O functions

In [83]:
def get_already_scraped(prods_or_profs='products'):
    """
    Returns the directory listing of ./products when the argument starts
    with 'prod', otherwise of ./profiles.

    NOTE: this repeats an identical definition made earlier in this file
    and replaces it once this cell has run.
    """
    wants_products = prods_or_profs[:4] == 'prod'
    if wants_products:
        return os.listdir("products")
    return os.listdir("profiles")

In [84]:
def aggregate_data(prods_or_profs='products'):
    """
    Loads every scraped csv of one kind and stacks them into one DataFrame.

    prods_or_profs (str): anything starting with 'prod' reads
        products/<name>/raw/complete.csv, anything else reads
        profiles/<name>/profile.csv.

    Names without that csv are reported as incomplete and skipped; files
    that cannot be read are reported and skipped. Frames that have a 'body'
    column but no 'asin' column get 'asin' filled with the folder name.
    Exact duplicate rows are dropped.

    'profile_url' is then normalised: for products everything from '/ref'
    onwards is cut; for profiles the last character is dropped
    (presumably a trailing '/' - verify against the scraped urls).

    NOTE: this repeats a definition made earlier in this file and replaces
    it once this cell has run.
    """
    already_scraped = get_already_scraped(prods_or_profs)
    target_is_product = prods_or_profs[:4] == 'prod'
    agg_data = []

    for name in already_scraped:
        if target_is_product:
            data_path = "products/" + name + "/raw/complete.csv"
        else:
            data_path = "profiles/" + name + "/profile.csv"
        if not os.path.exists(data_path):
            print(name, 'has incomplete data')
            continue
        try:
            df = pd.read_csv(data_path)
        except Exception as err:
            # Best effort: report the unreadable file and move on, while
            # still letting KeyboardInterrupt / SystemExit through.
            print(name, err)
            continue
        if ('body' in df.columns) and ('asin' not in df.columns):
            df['asin'] = name
        agg_data.append(df)

    agg_df = pd.concat(agg_data)
    # drop_duplicates returns a new frame, so the column assignment below
    # does not write into a slice of the concatenated frame.
    agg_df = agg_df.drop_duplicates()
    if target_is_product:
        agg_df['profile_url'] = agg_df['profile_url'].apply(lambda x: x.split('/ref')[0])
    else:
        agg_df['profile_url'] = agg_df['profile_url'].apply(lambda x: x[:-1])
    return agg_df

In [85]:
def load_all_data():
    """
    Loads the aggregated review data and profile data and converts their
    date columns ('date_posted' and 'review_date' respectively) to
    datetimes.

    Returns (rev_df, prof_df).
    """
    reviews = aggregate_data()
    profiles = aggregate_data('profiles')
    for frame, date_column in ((reviews, 'date_posted'), (profiles, 'review_date')):
        frame[date_column] = pd.DatetimeIndex(frame[date_column])
    return reviews, profiles

# Feature Extraction Utilities

In [86]:
def count_dates_overlap(date_array1, date_array2):
    """
    Scores how much two sets of review dates overlap.

    Counts the dates of each array that fall inside the [min, max] range of
    the other array (bounds included) and divides the two counts combined
    by the total number of dates. Both arguments must offer .min(), .max()
    and .shape, e.g. numpy arrays.
    """
    first_lo, first_hi = date_array1.min(), date_array1.max()
    second_lo, second_hi = date_array2.min(), date_array2.max()

    first_inside_second = sum(
        1 for day in date_array1 if second_lo <= day <= second_hi)
    second_inside_first = sum(
        1 for day in date_array2 if first_lo <= day <= first_hi)

    total_dates = date_array1.shape[0] + date_array2.shape[0]
    return (first_inside_second + second_inside_first) / total_dates

In [87]:
def filter_by_req_words(df, required_words=None):
    """
    Keeps the rows of df whose 'body' contains at least one of the given
    phrases, ignoring case.

    required_words (list of str): phrases matched literally as substrings.
        With no phrases, every row with a non-missing body is kept.
    Rows with a missing body are always dropped.
    """
    words = [] if required_words is None else required_words
    # Escape each phrase so characters such as '$', '.' or '(' are matched
    # literally instead of being read as regex syntax.
    req_words_search = '|'.join(re.escape(s.lower()) for s in words)
    body_lower = df['body'].str.lower()
    return df[body_lower.str.contains(req_words_search, na=False)]

# Profile-local feature extractors

In [88]:
def get_burstiness(prof_df, profile_id, tau=28):
    """
    Burstiness score of one profile's review activity.

    With span = days between the profile's earliest and latest review_date,
    returns round(1 - span / tau) when 0 < span < tau, and 0 otherwise
    (including profiles with at most one review).

    NOTE(review): because of the rounding the non-zero result is always
    0.0 or 1.0, and a span of 0 days scores 0 - confirm both are intended.
    """
    user_rows = prof_df[prof_df['profile_id'] == profile_id]
    if user_rows.shape[0] <= 1:
        return 0
    dates = user_rows.review_date
    activity_span = (dates.max() - dates.min()).days
    if not (0 < activity_span < tau):
        return 0
    return np.round(1 - activity_span / tau)

In [89]:
def get_star_ratio(prof_df, profile_id, neg_below=5):
    """
    Ratio of positive to negative ratings given by one profile.

    A rating counts as negative when it is below neg_below; every other
    row counts as positive. Without any negative rating the result is
    num_pos ** 1.6 instead of a division.
    """
    ratings = prof_df[prof_df['profile_id'] == profile_id].stars
    negatives = (ratings < neg_below).sum()
    positives = ratings.shape[0] - negatives
    if negatives == 0:
        return positives ** 1.6
    return positives / negatives

In [90]:
def process_profile_features(prof_df, profile_id):
    """
    Computes the profile-local features (burstiness, star_ratio) for one
    profile and stores them on all of that profile's rows.

    prof_df: profile review frame with a 'profile_id' column. Missing
        feature columns are added to it (filled with 0.0) as a side effect.
    profile_id: the profile whose rows are updated.

    Returns a new frame (index reset, 'profile_id' as the first column)
    carrying the updated feature values; prof_df itself does not receive
    them.
    """
    for feature in ('burstiness', 'star_ratio'):
        if feature not in prof_df.columns:
            prof_df[feature] = 0.0

    # Both features are written to the same indexed copy, so setting the
    # second one does not throw away the first.
    indexed = prof_df.set_index('profile_id')
    indexed.loc[profile_id, 'burstiness'] = get_burstiness(prof_df, profile_id)
    indexed.loc[profile_id, 'star_ratio'] = get_star_ratio(prof_df, profile_id)
    return indexed.reset_index()

In [91]:
def batch_profile_features(prof_df, profile_ids=None):
    """
    Runs process_profile_features for several profiles.

    profile_ids: iterable of profile ids (list, array, ...); when omitted
        or empty, every unique profile_id in prof_df is processed.

    Returns the frame produced by the last process_profile_features call.
    """
    # Test emptiness with len(): comparing an array or Series to [] is
    # element-wise and does not yield a single bool.
    if profile_ids is None or len(profile_ids) == 0:
        profile_ids = prof_df.profile_id.unique()
    for profile_id in profile_ids:
        prof_df = process_profile_features(prof_df, profile_id)
    return prof_df

# Profile-Profile feature extractors

In [92]:
def get_rel_burstiness(prof_df, profile_id1, profile_id2):
    """
    Date-overlap score (see count_dates_overlap) between the review dates
    of two profiles, compared at calendar-day resolution.

    NOTE(review): the original note here suggested requiring shared
    reviewed products, or using only the reviews of shared products; no
    such restriction is applied - all reviews of both profiles are used.
    """
    def review_days(pid):
        stamps = prof_df[prof_df['profile_id'] == pid]['review_date'].values
        return pd.DatetimeIndex(stamps).date

    return count_dates_overlap(review_days(profile_id1), review_days(profile_id2))

In [93]:
def get_time_stats(prof_df, profile_id1, profile_id2):
    """
    Mean and standard deviation of nearest-neighbour gaps, in hours,
    between the review timestamps of two profiles.

    For every review of either profile, the gap to the closest review of
    the other profile is taken; gaps from profile 1 come first, then those
    from profile 2.

    Returns (mean_hours, std_hours).
    """
    def stamps_for(pid):
        return pd.DatetimeIndex(prof_df[prof_df['profile_id'] == pid].review_date.values)

    def nearest_gaps_hours(sources, targets):
        return [np.abs((targets - stamp).total_seconds()).min() / 60 / 60
                for stamp in sources]

    first = stamps_for(profile_id1)
    second = stamps_for(profile_id2)
    gaps = nearest_gaps_hours(first, second) + nearest_gaps_hours(second, first)
    return np.mean(gaps), np.std(gaps)

# ASIN-local feature extractors

In [94]:
def get_verified_ratio(rev_df, asin, five_star_only=True):
    """
    Ratio of unverified to verified reviews for one product.

    five_star_only: when True only reviews with stars == 5 are counted.
    Without any verified review the result is num_unverified ** 1.6
    instead of a division.
    """
    product_rows = rev_df[rev_df['asin'] == asin]
    if five_star_only:
        product_rows = product_rows[product_rows['stars'] == 5]
    flags = product_rows['is_verified']
    verified = flags.sum()
    unverified = flags.shape[0] - verified
    return unverified ** 1.6 if verified == 0 else unverified / verified

In [95]:
def get_helpful_ratio(rev_df, asin):
    """
    Ratio of the mean helpful votes on negative reviews (stars <= 2) to
    the mean helpful votes on the remaining reviews (stars > 2) of one
    product.

    A group without reviews counts as a mean of 0. When the positive mean
    is 0 the result is negative_mean ** 1.6 instead of a division.
    """
    product_rows = rev_df[rev_df['asin'] == asin]

    def mean_helpfuls(mask):
        average = product_rows[mask]['num_helpfuls'].mean()
        return 0 if np.isnan(average) else average

    negative_mean = mean_helpfuls(product_rows['stars'] <= 2)
    positive_mean = mean_helpfuls(product_rows['stars'] > 2)

    if positive_mean == 0:
        return negative_mean ** 1.6
    return negative_mean / positive_mean

In [96]:
def get_num_callouts(rev_df, asin):
    """
    Counts the low-rated reviews (stars <= 2) of one product whose body
    contains at least one "red flag" phrase hinting at fake reviews.
    """
    red_flags = [
        'fake review', 'scam', 'fake profile', 'beware', 'bogus', 't real',
        'dont believe', "report", "don't believe", "don't buy", "dont buy",
        'reviews aren',
    ]
    low_rated = rev_df[(rev_df['asin'] == asin) & (rev_df['stars'] <= 2)]
    flagged = filter_by_req_words(low_rated, required_words=red_flags)
    return flagged.shape[0]

In [97]:
def process_asin_features(rev_df, asin):
    """
    Computes the product-local features (helpful_ratio, num_callouts,
    verified_ratio) for one ASIN and stores them on all of its rows.

    rev_df: review frame with an 'asin' column. A missing 'helpful_ratio'
        column is added to it (filled with 0.0) as a side effect.
    asin: the product whose rows are updated.

    Returns a new frame (index reset, 'asin' as the first column) carrying
    the updated feature values; rev_df itself does not receive them.
    """
    if 'helpful_ratio' not in rev_df.columns:
        rev_df['helpful_ratio'] = 0.0
    indexed = rev_df.set_index('asin')
    if 'num_callouts' not in indexed.columns:
        indexed['num_callouts'] = 0
    if 'verified_ratio' not in indexed.columns:
        # Float from the start: the ratio written below is fractional.
        indexed['verified_ratio'] = 0.0

    # The features only read the review columns, so they are computed from
    # rev_df directly rather than from a fresh reset_index() copy each time.
    indexed.loc[asin, 'helpful_ratio'] = get_helpful_ratio(rev_df, asin)
    indexed.loc[asin, 'num_callouts'] = get_num_callouts(rev_df, asin)
    indexed.loc[asin, 'verified_ratio'] = get_verified_ratio(rev_df, asin)
    return indexed.reset_index()

In [98]:
def batch_asin_features(rev_df, asins=None):
    """
    Runs process_asin_features for several products.

    asins: iterable of ASINs (list, array, ...); when omitted or empty,
        every unique asin in rev_df is processed.

    Returns the frame produced by the last process_asin_features call.
    """
    # Test emptiness with len(): comparing an array or Series to [] is
    # element-wise and does not yield a single bool.
    if asins is None or len(asins) == 0:
        asins = rev_df.asin.unique()
    for asin in asins:
        rev_df = process_asin_features(rev_df, asin)
    return rev_df

# SPIDER operational functions:

In [99]:
def get_scraped_profiles(prof_df):
    """Placeholder: not implemented yet, always returns None."""
    return None

## What if profiles/ASINs that *do not* exist are interpreted as fake?
## Limit to first-order connections

In [168]:
# Scrape all review pages of one product. start_scrape returns a
# (browser, reviews) pair, so `b` holds that tuple and the browser it opened
# is still open afterwards.
b = start_scrape('B07BLWSBWK')

Page 1 of 9 scraped
1
2
3
Page 2 of 9 scraped
1
2
3
Page 3 of 9 scraped
1
2
3
Page 4 of 9 scraped
1
2
3
Page 5 of 9 scraped
1
2
3
Page 6 of 9 scraped
1
2
3
Page 7 of 9 scraped
1
2
3
Page 8 of 9 scraped
1
2
3
Page 9 of 9 scraped


In [169]:
rev_df = aggregate_data()

In [170]:
rev_df

Unnamed: 0,body,date_posted,is_verified,num_helpfuls,profile_url,stars,asin
0,This item was a replacement for a Flex (1). I ...,2018-04-27,True,0,/gp/profile/amzn1.account.AFBZUFC5S75JKPU56KQL...,5.0,B07BLWSBWK
1,Love the minimalist band that is able to commu...,2018-04-27,True,0,/gp/profile/amzn1.account.AGBDG4FYBNKK3CPLD7TJ...,5.0,B07BLWSBWK
2,Came in no time.. really amazing.. wasn't expe...,2018-04-27,True,0,/gp/profile/amzn1.account.AGKCIQO4GUOM4F2LC3IW...,5.0,B07BLWSBWK
3,I just love the fitbit 2 flex. It helps keep m...,2018-04-27,True,0,/gp/profile/amzn1.account.AEMMCTOCBNI37V7NJPIT...,5.0,B07BLWSBWK
4,Love it. I wear it all the time to track my st...,2018-04-27,True,0,/gp/profile/amzn1.account.AHARNFAN7RV3Y6MXRRT3...,5.0,B07BLWSBWK
5,So much more accurate than counting steps on m...,2018-04-27,True,0,/gp/profile/amzn1.account.AF4DIVRDXN2UVRGAAW4J...,5.0,B07BLWSBWK
6,I loved it b/c I swim for exercise. Unfortunat...,2018-04-27,True,0,/gp/profile/amzn1.account.AF24536FRX6PKK7AS3W7...,5.0,B07BLWSBWK
7,"So far, this is exactly what I need. I've had ...",2018-04-27,True,0,/gp/profile/amzn1.account.AGBDG4FYBNKK3CPLD7TJ...,5.0,B07BLWSBWK
8,Love this Fitbit! Arrived next day with my Ama...,2018-04-27,True,0,/gp/profile/amzn1.account.AE7Y7PYI2YQ76IXOKOFM...,5.0,B07BLWSBWK
9,Amazing product and the fact I can track every...,2018-04-27,True,0,/gp/profile/amzn1.account.AF24536FRX6PKK7AS3W7...,5.0,B07BLWSBWK


In [None]:
# Load everything and flag one product's reviews as "infected".
rev_df, prof_df = load_all_data()
rev_df['infected'] = 0
prof_df['infected'] = 0
rev_df.set_index('asin', inplace=True)
# One .loc call with (row label, column): the chained form
# rev_df.loc[...]['infected'] = 1 assigns into an intermediate object and is
# not guaranteed to write back into rev_df.
rev_df.loc['B076Z8GLWY', 'infected'] = 1
rev_df_0 = rev_df.loc['B076Z8GLWY']
# NOTE(review): prof_df_0 is not assigned anywhere in the visible notebook;
# unless another cell defines it, this line raises NameError.
prof_df_0 

In [None]:
prof_df[prof_df.asin == 'B076Z8GLWY']

In [None]:
prof_df[prof_df['profile_name'] == 'Carol Perez']

In [None]:
prof_df.profile_id.value_counts()

In [None]:
batch_profile_features(prof_df)

In [None]:
prof_df.profile_id.value_counts()

In [None]:
rev_df.helpful_ratio

In [None]:
process_asin_features(rev_df, 'B076Z8GLWY').loc['B076Z8GLWY']

In [None]:
# Flag one product as "infected" in both the review and the profile frame.
rev_df['infected'] = 0
prof_df['infected'] = 0
rev_df.set_index('asin', inplace=True)
# One .loc call with (row label, column): the chained form
# frame.loc[...]['infected'] = 1 assigns into an intermediate object and is
# not guaranteed to write back into the frame.
rev_df.loc['B076Z8GLWY', 'infected'] = 1
prof_df.set_index('asin', inplace=True)
prof_df.loc['B076Z8GLWY', 'infected'] = 1

In [None]:
rev_df.profile_url.value_counts().index[:20]

In [None]:
rev_df.shape

In [None]:
rev_df.loc[prof_df.index].shape

In [None]:
rev_df.loc['B076Z8GLWY']

In [None]:
prof_df.profile_id.unique().shape

In [None]:
prof_df.shape

In [None]:
rev_df.asin.value_counts()

In [None]:
# Three products used for spot checks in the cells below.
asin1 = 'B0756YXM7H'
asin2 = 'B079DQJCWQ'
asin3 = 'B075V14MGM'
# Helpful-vote ratio (low-star vs. other reviews) for the first product.
# NOTE(review): get_helpful_ratio reads rev_df['asin'], so rev_df must still
# have 'asin' as a column here, not as its index.
get_helpful_ratio(rev_df, asin1)

In [None]:
get_num_callouts(rev_df, asin1)

In [None]:
get_helpful_ratio(rev_df, asin2)

In [None]:
# Print the body of every review of asin2 with at most two stars,
# one per line.
rdf = rev_df[rev_df.asin == asin2]
rdf = rdf[rdf.stars <= 2]
# NOTE(review): str.join needs every body to be a string; a missing body
# would raise TypeError here.
print('\n'.join(rdf.body.values))

In [None]:
get_num_callouts(rev_df, asin2)

In [None]:
# Two profiles used for spot checks in the cells below.
profile_id1 = 'AE4CTXTBIWWV2W22O7MBFDTFTQEQ'
profile_id2 = 'AFVSSQNCP4TJME43JJ5W2IDHP57A'
# Positive-to-negative rating ratio of the first profile.
get_star_ratio(prof_df, profile_id1)

In [None]:
get_star_ratio(prof_df, profile_id2)

In [None]:
get_burstiness(prof_df, profile_id=profile_id1)

In [None]:
get_burstiness(prof_df, profile_id=profile_id2)

In [None]:
get_rel_burstiness(prof_df, profile_id1=profile_id1, profile_id2=profile_id2)

In [None]:
get_time_stats(prof_df, profile_id1, profile_id2)