 See results_notebook.py for a more complicated analysis example (used to replicate some analyses from a previous SERP-related paper).

 This file is meant to provide a very quick starting point for writing up other analyses.

 # Current data format
 Currently, the node.js scraping code (see collect.js)
 saves 3 result files per SERP scraped:
 * a .json file containing the
 device object used by puppeteer ("device"), date collection started ("dateStr"),
 date of collection ("dateAtSave"), user-specified query category ("queryCat"),
 file queries came from ("queryFile"), device name ("deviceName"),
 url accessed ("link"), the search engine or website ("platform"),
 the query made ("target"), an error message field ("errMsg", empty in the example below),
 and finally, a huge array of link elements ("linkElements")
 * a .png file that is a full screenshot of the SERP
 * a .mhtml snapshot of the website that can be opened in a web browser (this is experimental, apparently)

 Files are named by datetime of script start to avoid accidental overwrite.

 This script (analysis.py) includes code which stitches together a visual representation of
 links and their coordinates (obtained using getBoundingClientRect) alongside screenshots
 so researchers can perform visual validation -- compare the link representation (which makes quantitative analyses easy)
 with the png representation and make sure they match up!

 We'll use pandas and friends for this quick analysis.

In [1]:
# standard library
import json
import glob
from pprint import pprint
from collections import defaultdict
from urllib.parse import unquote
import os

# scientific Python stack (pandas / numpy)
import pandas as pd
import numpy as np

# plotting / images
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from PIL import Image

# helpers for this project
from helpers import (
    infinite_defaultdict, recurse_print_infinitedict, extract,
    is_mobile,
)
from my_constants import CONSTANTS

# Notebook-wide switches; neither flag is read in the visible portion of this file.
# NOTE(review): presumably DO_COORDS enables the link-coordinate vs. screenshot
# comparison and SAVE_PLOTS writes figures to disk -- confirm against the cells that use them.
DO_COORDS = False
SAVE_PLOTS = False


In [2]:
# Freshly scraped data is written under results/*; this cell reads the bundled example instead.
filename = 'nov5_example/Chrome on Windows/https---www.google.com-search?q=protests/Thu Nov 05 2020 14-12-08 GMT-0600 (Central Standard Time).json'
with open(filename, encoding='utf8') as f:
    # Parse the whole file as one JSON document.
    d = json.loads(f.read())
# Display the top-level fields stored for this SERP.
d.keys()


dict_keys(['device', 'dateStr', 'linkElements', 'queryCat', 'queryFile', 'errMsg', 'deviceName', 'link', 'platform', 'target', 'dateAtSave'])

In [3]:
# Display every field of the record except 'linkElements', which is too large to read inline.
{key: d[key] for key in d if key != 'linkElements'}


{'dateAtSave': 'Thu Nov 05 2020 14:12:25 GMT-0600 (Central Standard Time)',
 'dateStr': 'Thu Nov 05 2020 14:12:08 GMT-0600 (Central Standard Time)',
 'device': {'name': 'Chrome on Windows',
  'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
  'viewport': {'height': 768, 'width': 1024}},
 'deviceName': 'Chrome on Windows',
 'errMsg': '',
 'link': 'https://www.google.com/search?q=protests',
 'platform': 'google',
 'queryCat': 'test',
 'queryFile': 0,
 'target': 'protests'}

In [4]:
# Turn the scraped link elements into a table (one row per element) and preview it.
link_elements = d['linkElements']
df = pd.DataFrame(link_elements)
df.head()


Unnamed: 0,top,left,bottom,right,href,parentText,parentClasses,classes,text
0,64.0,11.0,108.0,121.0,,Skip to main contentAccessibility helpa.duf3{c...,[S6VXfe],[gyPpGe],Skip to main content
1,64.0,11.0,108.0,121.0,https://support.google.com/websearch/answer/18...,Skip to main contentAccessibility helpa.duf3{c...,[S6VXfe],[gyPpGe],Accessibility help
2,86.0,11.0,130.0,121.0,,a.duf3{color:#70757a;float:right;font-style:it...,[],[gyPpGe],Accessibility feedback
3,24.0,1085.0,64.0,1125.0,https://www.google.com/intl/en/about/products?...,,[gb_Vf],[gb_D],
4,26.0,1137.0,62.0,1233.0,https://accounts.google.com/ServiceLogin?hl=en...,Sign in,[gb_5e],"[gb_4, gb_5, gb_le, gb_8c]",Sign in


In [5]:
# analyze_links_df is a project helper from the analyze_links module.
from analyze_links import analyze_links_df
# Judging by the output below, the returned frame keeps the original link columns
# and adds derived ones such as `width` and the `reddit_*` flags; later cells also
# read a `domain` column from it.
# NOTE(review): the exact set of added columns is defined in analyze_links -- confirm there.
analyzed = analyze_links_df(df)
analyzed.head()


Unnamed: 0,top,left,bottom,right,href,parentText,parentClasses,classes,text,width,...,reddit_in,reddit_appears,reddit_appears_rh,reddit_appears_lh,reddit_appears_noscroll_lb,reddit_appears_lh_noscroll_lb,reddit_appears_noscroll_mg,reddit_appears_lh_noscroll_mg,reddit_appears_noscroll_ub,reddit_appears_lh_noscroll_ub
0,64.0,11.0,108.0,121.0,,Skip to main contentAccessibility helpa.duf3{c...,[S6VXfe],[gyPpGe],Skip to main content,110.0,...,False,False,False,False,False,False,False,False,False,False
1,64.0,11.0,108.0,121.0,https://support.google.com/websearch/answer/18...,Skip to main contentAccessibility helpa.duf3{c...,[S6VXfe],[gyPpGe],Accessibility help,110.0,...,False,False,False,False,False,False,False,False,False,False
2,86.0,11.0,130.0,121.0,,a.duf3{color:#70757a;float:right;font-style:it...,[],[gyPpGe],Accessibility feedback,110.0,...,False,False,False,False,False,False,False,False,False,False
3,24.0,1085.0,64.0,1125.0,https://www.google.com/intl/en/about/products?...,,[gb_Vf],[gb_D],,40.0,...,False,False,False,False,False,False,False,False,False,False
4,26.0,1137.0,62.0,1233.0,https://accounts.google.com/ServiceLogin?hl=en...,Sign in,[gb_5e],"[gb_4, gb_5, gb_le, gb_8c]",Sign in,96.0,...,False,False,False,False,False,False,False,False,False,False


In [6]:
# Count how many links point at each domain, most frequent first.
analyzed['domain'].value_counts()

www.google.com                    74
en.wikipedia.org                   5
support.google.com                 4
                                   4
webcache.googleusercontent.com     3
www.foxnews.com                    3
policies.google.com                2
apnews.com                         2
www.nytimes.com                    2
www.cnn.com                        2
www.youtube.com                    2
accounts.google.com                1
nypost.com                         1
www.azfamily.com                   1
maps.google.com                    1
www.aljazeera.com                  1
www.washingtonpost.com             1
www.buzzfeednews.com               1
economictimes.indiatimes.com       1
www.cbsnews.com                    1
www.usatoday.com                   1
www.aeinstein.org                  1
denver.cbslocal.com                1
Name: domain, dtype: int64

In [7]:
# Order the links by their 'top' coordinate and keep only the position / destination columns.
shown_columns = ['top', 'left', 'domain', 'href']
tmp = analyzed.sort_values(by='top')[shown_columns]
# Hide every row whose domain contains the substring 'google'
# (this also matches e.g. googleusercontent.com).
is_google = tmp['domain'].str.contains('google')
tmp[~is_google]

Unnamed: 0,top,left,domain,href
108,0.0,0.0,,
0,64.0,11.0,,
11,75.0,561.3125,,
2,86.0,11.0,,
39,212.0,2161.0,nypost.com,https://nypost.com/2020/11/05/woman-charged-wi...
38,212.0,1941.0,www.cbsnews.com,https://www.cbsnews.com/video/pro-president-tr...
37,212.0,1721.0,www.foxnews.com,https://www.foxnews.com/us/nypd-charge-protest...
35,212.0,1281.0,www.nytimes.com,https://www.nytimes.com/2020/11/05/us/election...
34,212.0,1061.0,denver.cbslocal.com,https://denver.cbslocal.com/2020/11/05/riots-d...
33,212.0,841.0,www.usatoday.com,https://www.usatoday.com/story/news/investigat...
