# Task 2 - Sample of Owners

### GBQ connection, credentials

In [15]:
# Import the Google BigQuery client library and the service-account credentials helper
from google.cloud import bigquery
from google.oauth2 import service_account

import os

# Connection settings for Google BigQuery (GBQ).
# Each value can be overridden with an environment variable so the notebook can
# run on another machine without editing this cell; when the variable is unset
# the original hard-coded value is used.
# Folder holding the service-account key file. It is concatenated directly with
# `service_file` further down, so keep the trailing slash.
service_path = os.environ.get("WEDGE_SERVICE_PATH", "/Users/natebender/Desktop/Wedgekey/")
# File name of the service-account key (your authentication information).
service_file = os.environ.get("WEDGE_SERVICE_FILE", "Bender-Wedge-79e456939915.json")
# GBQ project the client is created for.
gbq_proj_id = os.environ.get("WEDGE_GBQ_PROJECT", "bender-wedge")
# GBQ dataset ID.
gbq_dataset_id = os.environ.get("WEDGE_GBQ_DATASET", "wedgeclean")

In [16]:
# Full location of the service-account key file (folder + file name)
private_key = f"{service_path}{service_file}"

# Load the service-account credentials from that key file
credentials = service_account.Credentials.from_service_account_file(private_key)

# Create the client used to talk to GBQ, bound to the configured project
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

### Find unique owners, exclude non-owners

In [17]:
# Build a GBQ query for the unique owner card numbers.
# card_no 3 is the code for non-owners, so it is excluded; in SQL the `!=`
# comparison also drops rows whose card_no is NULL.
# The dataset comes from `gbq_dataset_id` instead of being hard-coded, so
# changing that setting actually changes which dataset is queried.
query = (
    "SELECT DISTINCT card_no "
    f"FROM `{gbq_dataset_id}.wedge-data-all` "
    "WHERE card_no != 3 "
)

# execute the query in GBQ with `client.query`
results = client.query(
    query,
    location="US",
)

In [18]:
# count the rows returned by the query by tallying them one at a time
sum(1 for row in results)

27207

In [19]:
# preview: collect the rows of the query and keep only the leading ten
[row for row in results][:10]

[Row((52722.0,), {'card_no': 0}),
 Row((12184.0,), {'card_no': 0}),
 Row((11310.0,), {'card_no': 0}),
 Row((53287.0,), {'card_no': 0}),
 Row((23304.0,), {'card_no': 0}),
 Row((10442.0,), {'card_no': 0}),
 Row((18487.0,), {'card_no': 0}),
 Row((57275.0,), {'card_no': 0}),
 Row((52914.0,), {'card_no': 0}),
 Row((10363.0,), {'card_no': 0})]

### Take random sample of owners

In [20]:
# import the python random package to randomly sample the results of the owner query
import random

# collect the card number (first and only column) of every row returned by the owner query
owners = [owner[0] for owner in results]

# Draw up to 450 owners WITHOUT replacement. `random.sample` never returns the
# same list element twice, whereas `random.choices` draws with replacement and
# could put the same owner in the sample more than once. The size is clamped to
# the population so a small result set does not raise ValueError.
sample = random.sample(owners, k=min(450, len(owners)))
new_sample = list(sample)

# show the sampled owners
print(sample)

[20441.0, 21446.0, 50586.0, 48839.0, 64832.0, 50426.0, 18611.0, 48648.0, 49711.0, 44262.0, 50570.0, 40960.0, 54226.0, 18820.0, 26173.0, 25429.0, 21192.0, 30494.0, 11258.0, 65400.0, 20886.0, 21922.0, 64785.0, 22445.0, 48713.0, 19705.0, 50847.0, 50286.0, 18816.0, 19417.0, 21349.0, 30417.0, 21887.0, 48016.0, 51206.0, 17850.0, 21362.0, 34654.0, 44265.0, 64348.0, 34790.0, 25905.0, 14815.0, 38138.0, 50454.0, 36133.0, 36985.0, 48595.0, 23062.0, 15632.0, 16175.0, 40897.0, 13011.0, 16286.0, 11951.0, 41259.0, 49027.0, 40400.0, 45144.0, 18939.0, 51055.0, 21727.0, 50755.0, 49446.0, 48388.0, 18593.0, 17972.0, 15424.0, 52179.0, 23654.0, 21471.0, 44335.0, 45160.0, 49690.0, 51045.0, 11627.0, 12919.0, 19990.0, 15878.0, 55898.0, 35264.0, 23663.0, 44828.0, 13310.0, 23665.0, 20108.0, 50347.0, 38439.0, 27201.0, 17358.0, 41167.0, 52104.0, 17223.0, 11825.0, 15856.0, 19315.0, 17228.0, 23257.0, 14960.0, 64256.0, 23595.0, 14160.0, 10576.0, 19077.0, 52669.0, 56636.0, 52514.0, 25068.0, 13814.0, 52140.0, 23379.0, 

### Select the sample of owners from GBQ

In [21]:
# create a new query for GBQ using the random sample of owners

# De-duplicate the sampled card numbers (first-seen order kept): repeating a
# value inside an IN (...) list does not change which rows match.
sampled_cards = list(dict.fromkeys(sample))
if not sampled_cards:
    # An empty IN () list is not valid SQL, so fail here with a clear message
    # instead of sending a malformed query to GBQ.
    raise ValueError("sample is empty; there are no owners to select")

card_list = ", ".join(str(card) for card in sampled_cards)

# The dataset comes from `gbq_dataset_id` instead of being hard-coded, so
# changing that setting actually changes which dataset is queried.
query = (
    "SELECT * "
    f"FROM `{gbq_dataset_id}.wedge-data-all` "
    f"WHERE card_no in ({card_list})"
)

# execute the query in GBQ with `client.query`
results = client.query(
    query,
    location="US",
)

### Format the sample owners list with headers, save into text file

In [22]:
# csv handles quoting of fields that contain commas, quotes or line breaks
import csv

# set up headers for the text file with the data returned from the sampled owners query
# NOTE(review): assumes `SELECT *` returns the columns in exactly this order - confirm against the table schema
headers = ["datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"]

# create and save the data returned from the sampled owners into a text file with the headers
# newline='' is what the csv module expects for files it writes to; the writer
# itself ends every record with "\n", matching the original file layout.
with open('sample_owners.txt', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile, lineterminator="\n")
    writer.writerow(headers)
    for line in results:
        # Values are still converted with str() so each field's text is unchanged;
        # the writer only adds quoting when a field contains a comma, a quote
        # character or a line break, which a plain ",".join would write unescaped
        # and thereby shift the columns of that row.
        writer.writerow([str(item) for item in line])