This notebook processes the test_2_data.csv file. It first removes rows with duplicate addresses, then expands the useful data into a new table.

In [1]:
import time
# import requests
import pandas as pd
from ast import literal_eval
import numpy as np
import csv

In [2]:
# load data
start = time.perf_counter()
print(f'{time.strftime("%H:%M:%S", time.localtime())}: Loading Data...')

# The file is read without a header row, so the column names are supplied here.
# Missing cells become the text 'None' so that literal_eval can parse every cell.
column_names = ['gid_code', 'address', 'coverage_response', 'offers']
cl_data = pd.read_csv('../data_out/test_2_data.csv', header=None, names=column_names)
cl_data = cl_data.replace({np.nan: 'None'})

# parse the stringified literals in both response columns into Python objects
for response_column in ("coverage_response", "offers"):
    cl_data[response_column] = cl_data[response_column].apply(literal_eval)

# elapsed wall time, rendered as HH:MM:SS
elapsed_seconds = time.perf_counter() - start
print(f'{time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))}: Data loaded.')
cl_data

13:29:44: Loading Data...
00:01:09: Data loaded.


Unnamed: 0,gid_code,address,coverage_response,offers
0,530330112001,552 S CLOVERDALE ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
1,530330112001,545 S SULLIVAN ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
2,530330112001,552 S CLOVERDALE ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
3,530330112001,552 S CLOVERDALE ST SEATTLE WA 98108 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
4,530330112001,714 S DONOVAN ST SEATTLE WA 98108 USA,"{'status': 1, 'message': 'Address_Validation_S...",
...,...,...,...,...
22196,530330071022,315 1ST AVE W SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
22197,530330071022,315 1ST AVE W SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
22198,530330071022,520 3RD AVE W SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
22199,530330071022,117 W MERCER ST SEATTLE WA 98119 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."


In [3]:
# collect every row whose address occurs more than once, ordered by address
has_repeated_address = cl_data.duplicated(subset='address', keep=False)
duplicates = cl_data.loc[has_repeated_address].sort_values(by='address')
duplicates

Unnamed: 0,gid_code,address,coverage_response,offers
2280,530330061003,10 E ROANOKE ST SEATTLE WA 98102 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
2287,530330061003,10 E ROANOKE ST SEATTLE WA 98102 USA,"{'status': 0, 'message': 'GREEN - exact match'...","{'fixedWirelessQualified': False, 'groupId': N..."
3970,530330014003,10021 7TH AVE NW SEATTLE WA 98177 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
3973,530330014003,10021 7TH AVE NW SEATTLE WA 98177 USA,"{'status': 0, 'message': 'SUCCESS', 'addrValIn...","{'fixedWirelessQualified': False, 'offersList'..."
19900,530330120003,10032 42ND AVE SW SEATTLE WA 98146 USA,,
...,...,...,...,...
4916,530330266002,9841 26TH AVE SW SEATTLE WA 98106 USA,,
16748,530330265002,9865 11TH AVE SW SEATTLE WA 98106 USA,"{'status': 1, 'message': 'Address_Validation_S...",
16739,530330265002,9865 11TH AVE SW SEATTLE WA 98106 USA,,
16677,530330265001,9954 3RD LN SW SEATTLE WA 98106 USA,,


In [4]:
# For each group of rows sharing an address, keep exactly one row: the row with the
# best (lowest) rank on the scale below. Ties go to the first row of the group.
# 0: a non-empty list of offers,
# 1: a success/green coverage response
# 2: a yellow coverage response
# 3: any status 0 coverage response
# 4: any status 1 coverage response
# 5: any coverage response
# 6: any row

def _duplicate_rank(offers, coverage_response):
    """Return the keep-priority of one duplicate row (0 is best, 6 is worst).

    Parameters
    ----------
    offers : dict or None
        Parsed value of the row's 'offers' column.
    coverage_response : dict or None
        Parsed value of the row's 'coverage_response' column.

    Returns
    -------
    int
        The rank on the 0-6 scale listed above this function.

    Keys are read with ``.get`` so a payload that lacks 'offersList', 'message'
    or 'status' falls through to a lower priority instead of raising KeyError.
    """
    if offers and offers.get('offersList'):
        return 0
    if not coverage_response:
        return 6
    message = str(coverage_response.get('message') or '').lower()
    if 'green' in message or 'success' in message:
        return 1
    if 'yellow' in message:
        return 2
    status = coverage_response.get('status')
    if status == 0:
        return 3
    if status == 1:
        return 4
    return 5


indexes_to_save = []
# groupby visits each address once, instead of re-filtering the whole frame per address;
# sort=False keeps the groups in the order the addresses appear in `duplicates`
for _, rows in duplicates.groupby('address', sort=False):
    ranks = [
        _duplicate_rank(row_offers, row_coverage)
        for row_offers, row_coverage in zip(rows['offers'], rows['coverage_response'])
    ]
    # list.index returns the first position of the minimum, i.e. the first best row
    indexes_to_save.append(rows.index[ranks.index(min(ranks))])

In [5]:
# select indexes that are unwanted duplicates
indexes_to_drop = duplicates.index.difference(indexes_to_save)
# Only drop while duplicate addresses are still present. The labels above refer to the
# index as it was before the reset below, so re-running this cell on an already
# de-duplicated frame would otherwise delete unrelated rows.
if cl_data.duplicated(subset='address').any():
    cl_data = cl_data.drop(index=indexes_to_drop)
cl_data = cl_data.reset_index(drop=True) # reset the index
print(cl_data.duplicated(subset='address', keep=False).sum()) # confirm there are no more duplicates

0


In [6]:
# expand useful data
print(cl_data.columns) # show current keys


def _offers_list(offer_payload):
    """Return the payload's 'offersList', or None when the payload is empty or missing."""
    return offer_payload['offersList'] if offer_payload else None


# flatten each coverage response into its own columns and attach them to the addresses
coverage_flat = pd.json_normalize(cl_data['coverage_response'])
offers = cl_data[['address']].join(coverage_flat)

# pull the offer lists, then give every individual offer its own row
offers['offers_list'] = cl_data['offers'].apply(_offers_list)
offers = offers.explode('offers_list').reset_index(drop=True)

# flatten each offer into columns and swap them in for the raw offer column
offers_flat = pd.json_normalize(offers['offers_list'])
offers = offers.drop(columns='offers_list').join(offers_flat)

print(offers.columns)
offers.to_csv('../data_out/cl_expanded.csv', index=False)

Index(['gid_code', 'address', 'coverage_response', 'offers'], dtype='object')
Index(['address', 'status', 'message', 'leadIndicator', 'leadIndicatorStatus',
       'addressId', 'unitNumber', 'geoSecUnitId', 'googleInfo', 'below940',
       'existingService', 'expectedCompDate', 'lnppiMainDecision',
       'addrValInfo.result', 'addrValInfo.billingSource',
       'addrValInfo.fullAddress', 'addrValInfo.addressId',
       'addrValInfo.mduInfo.mduList', 'addrValInfo.wireCenter',
       'addrValInfo.nearMatchAddress', 'addrValInfo.nearMatchList',
       'addrValInfo.exactMatchAddress', 'addrValInfo.companyOwnerId',
       'loopQualInfo.message', 'loopQualInfo.messageDetail',
       'biwfInfo.fiberQualified', 'biwfInfo.redirectUrl', 'loopQualInfo',
       'biwfInfo', 'addrValInfo.mduInfo', 'downloadSpeed', 'uploadSpeed',
       'downloadSpeedMbps', 'uploadSpeedMbps', 'downloadDisplaySpeed',
       'uploadDisplaySpeed', 'internetTypeSortOrder', 'internetType',
       'productType', 'priceTyp

In [14]:
# narrow the expanded table to the address, coverage result, speed and price columns
view_columns = ['address', 'message', 'status', 'downloadSpeedMbps', 'uploadSpeedMbps', 'price']
offers_view_1 = offers[view_columns]
# tally how often each coverage message occurs
message_counts = offers_view_1['message'].value_counts()
print(message_counts)
offers_view_1
# offers_view_1[offers_view_1.duplicated(subset='address', keep=False)]
# len(offers_view_1[offers_view_1['message'].isna()]['address'].unique())
# str(offers_view_1.iloc[23758:23760]['message'].values[0])

# 11111+6705+795+290+197+20+11+9774
# offers
# test_db = cl_data.loc[22199:22200].copy().reset_index(drop=True)
# exp_adr = pd.json_normalize(test_db['coverage_response'])
# test_db.join(exp_adr)

message
SUCCESS                                                  10275
GREEN - exact match                                       8933
Out Of Region                                              759
YELLOW - near matches                                      270
Address_Validation_Service_Error. Service Unavailable      192
No Match                                                    20
Error validating address                                    11
Name: count, dtype: int64


Unnamed: 0,address,message,status,downloadSpeedMbps,uploadSpeedMbps,price
0,552 S CLOVERDALE ST SEATTLE WA 98108 USA,SUCCESS,0.0,,,
1,714 S DONOVAN ST SEATTLE WA 98108 USA,Address_Validation_Service_Error. Service Unav...,1.0,,,
2,700 S CLOVERDALE ST SEATTLE WA 98108 USA,Out Of Region,0.0,,,
3,9141 8TH AVE S SEATTLE WA 98108 USA,Out Of Region,0.0,,,
4,551 S SULLIVAN ST SEATTLE WA 98108 USA,SUCCESS,0.0,,,
...,...,...,...,...,...,...
23755,315 1ST AVE W SEATTLE WA 98119 USA,GREEN - exact match,0.0,10,0.75,50.0
23756,520 3RD AVE W SEATTLE WA 98119 USA,GREEN - exact match,0.0,10,0.75,50.0
23757,117 W MERCER ST SEATTLE WA 98119 USA,GREEN - exact match,0.0,10,0.75,50.0
23758,509 1ST AVE W SEATTLE WA 98119 USA,GREEN - exact match,0.0,940,940,75.0


In [49]:
q = pd.read_csv('../data_out/quantum_pages.csv')
# rows of the view whose address also appears in the quantum pages file
quantum_addresses = q['address'].unique()
overlap = offers_view_1[offers_view_1['address'].isin(quantum_addresses)]
# of those, take the rows that have no coverage message, keep only
# address/message/status, and label them as quantum redirects
overlap_missing = (
    overlap[overlap['message'].isna()]
    .drop(columns=['downloadSpeedMbps', 'uploadSpeedMbps', 'price'])
    .assign(message="Quantum Redirect", status=-1)
)
overlap_missing
# overlap[overlap.duplicated(subset='address', keep=False).values]
# offers_view_1[offers_view_1[offers_view_1['address'].isin(q['address'].unique())].duplicated(subset='address', keep=False).values]

Unnamed: 0,address,message,status
55,845 S SULLIVAN ST SEATTLE WA 98108 USA,Quantum Redirect,-1
56,7803 8TH AVE S SEATTLE WA 98108 USA,Quantum Redirect,-1
57,819 S THISTLE ST SEATTLE WA 98108 USA,Quantum Redirect,-1
58,749 S ROSE ST SEATTLE WA 98108 USA,Quantum Redirect,-1
59,834 S CLOVERDALE ST SEATTLE WA 98108 USA,Quantum Redirect,-1
...,...,...,...
23744,533 3RD AVE W SEATTLE WA 98119 USA,Quantum Redirect,-1
23745,524 6TH AVE W SEATTLE WA 98119 USA,Quantum Redirect,-1
23746,530 4TH AVE W SEATTLE WA 98119 USA,Quantum Redirect,-1
23748,400 W REPUBLICAN ST SEATTLE WA 98119 USA,Quantum Redirect,-1
