This script reads the CenturyLink data collected earlier in this project and creates a new file (adrs_for_quantum.csv) containing the addresses that we expect to redirect to a Quantum Fiber page.

In [1]:
import time
# import requests
import pandas as pd
from ast import literal_eval
import numpy as np
import csv

In [2]:
# Record when loading began and announce it with a wall-clock timestamp.
start = time.perf_counter()
print(f'{time.strftime("%H:%M:%S", time.localtime())}: Loading Data...')

# The file is read without a header row, so column names are supplied here.
column_names = ['gid_code', 'address', 'coverage_response', 'offers']
cl_data = pd.read_csv('../data_out/test_2_data.csv', header=None, names=column_names)
# Missing cells become the string 'None' so that literal_eval below yields
# a real Python None instead of failing on NaN.
cl_data = cl_data.replace({np.nan: 'None'})

# The two response columns hold Python-literal text; parse each into objects.
for parsed_column in ("coverage_response", "offers"):
    cl_data[parsed_column] = cl_data[parsed_column].apply(literal_eval)

print(cl_data['coverage_response'])
cl_data

13:16:14: Loading Data...
0        {'status': 0, 'message': 'SUCCESS', 'addrValIn...
1        {'status': 0, 'message': 'SUCCESS', 'addrValIn...
2        {'status': 0, 'message': 'SUCCESS', 'addrValIn...
3        {'status': 0, 'message': 'SUCCESS', 'addrValIn...
4        {'status': 1, 'message': 'Address_Validation_S...
                               ...                        
22196    {'status': 0, 'message': 'GREEN - exact match'...
22197    {'status': 0, 'message': 'GREEN - exact match'...
22198    {'status': 0, 'message': 'GREEN - exact match'...
22199    {'status': 0, 'message': 'GREEN - exact match'...
22200    {'status': 0, 'message': 'GREEN - exact match'...
Name: coverage_response, Length: 22201, dtype: object


Unnamed: 0,gid_code,address,coverage_response,offers
0,530330112001,552 S CLOVERDALE ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
1,530330112001,545 S SULLIVAN ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
2,530330112001,552 S CLOVERDALE ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
3,530330112001,552 S CLOVERDALE ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
4,530330112001,714 S DONOVAN ST SEATTLE WA 98108 USA,"{'status': 1, 'message': 'Address_Validation_S...",
...,...,...,...,...
22196,530330071022,315 1ST AVE W SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
22197,530330071022,315 1ST AVE W SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
22198,530330071022,520 3RD AVE W SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
22199,530330071022,117 W MERCER ST SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."


In [3]:
# Addresses of every row whose coverage_response is missing (isna).
missing_mask = cl_data['coverage_response'].isna()
adrs_missing = cl_data.loc[missing_mask, 'address'].values
# print(adrs_missing)
print(cl_data['address'].values)

['552 S  CLOVERDALE ST SEATTLE WA 98108 USA'
 '545 S  SULLIVAN ST SEATTLE WA 98108 USA'
 '552 S  CLOVERDALE ST SEATTLE WA 98108 USA' ...
 '520 3RD AVE W SEATTLE WA 98119 USA'
 '117 W  MERCER ST SEATTLE WA 98119 USA'
 '509 1ST AVE W SEATTLE WA 98119 USA']


In [4]:
def _duplicate_priority(offers, coverage_response):
    """Rank one row among duplicates of the same address; lower is better.

    The ranks mirror the preference order used when choosing which
    duplicate to keep:
        0: the row has a non-empty list of offers
        1: a success/green coverage response
        2: a yellow coverage response
        3: any status 0 coverage response
        4: any status 1 coverage response
        5: any coverage response
        6: any row

    Missing keys and non-string messages are tolerated instead of raising.
    """
    if isinstance(offers, dict) and offers.get('offersList'):
        return 0
    if not coverage_response:
        return 6
    if not isinstance(coverage_response, dict):
        return 5
    # `or ''` covers a message that is present but None.
    message = str(coverage_response.get('message') or '').lower()
    if 'green' in message or 'success' in message:
        return 1
    if 'yellow' in message:
        return 2
    status = coverage_response.get('status')
    if status == 0:
        return 3
    if status == 1:
        return 4
    return 5


# Every row whose address occurs more than once. A stable sort keeps rows that
# share an address in their original order, so ties are broken deterministically.
duplicates = cl_data[cl_data.duplicated(subset='address', keep=False)].sort_values(
    by='address', kind='mergesort'
)

# Single pass: remember, per address, the best-ranked row seen so far.
# The strict `<` means the earliest row wins among equally ranked rows.
best_by_address = {}
for row_index, adr, offers, coverage_response in zip(
    duplicates.index,
    duplicates['address'],
    duplicates['offers'],
    duplicates['coverage_response'],
):
    priority = _duplicate_priority(offers, coverage_response)
    current_best = best_by_address.get(adr)
    if current_best is None or priority < current_best[0]:
        best_by_address[adr] = (priority, row_index)

indexes_to_save = [row_index for _, row_index in best_by_address.values()]

indexes_to_drop = duplicates.index.difference(indexes_to_save)  # select indexes that are unwanted duplicates
cl_data = cl_data.drop(index=indexes_to_drop)  # rebind rather than mutate in place
print(sum(cl_data.duplicated(subset='address', keep=False)))  # confirm there are no more duplicates

0


In [5]:
# with open('../res/adrs_for_quantum.csv', 'w', newline='') as outfile:
#     write_out = csv.writer(outfile)
#     for adr in adrs_missing:
#         write_out.writerow([adr])
# cl_data.address.values

In [6]:
import sys

# Put the project's ../src directory first on the import path so that the
# quantum_scraper module can be found.
sys.path.insert(0, '../src')
from quantum_scraper import QuantumScraper

# Starting point for this scraping session.
# NOTE(review): the run log reports 10 addresses per run, so offset 420
# presumably lines up with run number 42 - confirm against QuantumScraper.
RESUME_RUN_NUM = 42
RESUME_OFFSET = 420

# Addresses to look up, taken from the cl_data frame.
addresses = cl_data['address'].values

# Create the scraper and process every address from the resume offset onward.
g = QuantumScraper(outfolder='../data_out/new_quantum_run', run_num=RESUME_RUN_NUM)
g.scrape(addresses[RESUME_OFFSET:])

13:16:45: initializing scraper.
13:16:45: Scraper started.
13:16:45: Run 42 with 10 addresses START.
13:16:45:     requesting dwsids...
13:17:51:     requesting csrfs...
13:18:01:     joining session...
13:18:27:     saving pages...
13:18:34: Run 42 with 10 addresses COMPLETE.
13:18:34: Run 43 with 10 addresses START.
13:18:34:     requesting dwsids...
13:18:42:     requesting csrfs...
13:18:53:     joining session...
13:19:14:     saving pages...
13:19:21: Run 43 with 10 addresses COMPLETE.
13:19:21: Run 44 with 10 addresses START.
13:19:21:     requesting dwsids...
13:19:28:     requesting csrfs...
13:19:38:     joining session...
13:19:54:     saving pages...
13:20:02: Run 44 with 10 addresses COMPLETE.
13:20:02: Run 45 with 10 addresses START.
13:20:02:     requesting dwsids...
13:20:16:     requesting csrfs...
13:20:29:     joining session...
13:20:46:     saving pages...
13:21:21: Run 45 with 10 addresses COMPLETE.
13:21:21: Run 46 with 10 addresses START.
13:21:21:     requestin

ValueError: can only parse strings

  with loop.timer(seconds, ref=ref) as t:
