In [14]:
import pandas as pd
from collections import Counter
import numpy as np
import glob
import re
import json

In [15]:
# Weedmaps columns that are tried, in this order, as join keys against the
# "License Number" column of the licenses table.
check_cols = [
    "adult_use_cultivation",
    "adult_use_nonstorefront",
    "adult_use_retail",
    "distributor",
    "medical_cultivation",
    "medical_nonstorefront",
    "medical_retail",
    "microbusiness",
]

# License search results (has the "License Type" and "License Number" columns).
licenses = pd.read_csv("..//data//searchResultsClean.csv")

# Weedmaps store table; nothing has been joined to it yet, hence the name.
weedmaps_not_joined = pd.read_csv("..//data//store.csv")

In [16]:
# Tally how many rows of the licenses table fall under each "License Type".
Counter(licenses["License Type"])

Counter({'Cannabis - Distributor Temporary License': 1432,
         'Cannabis - Retailer Temporary License': 777,
         'Cannabis - Retailer Nonstorefront Temporary License': 391,
         'Cannabis - Microbusiness Temporary License': 348,
         'Cannabis - Distributor-Transport Only Temporary License': 234,
         'Cannabis - Event Organizer Temporary License': 149,
         'Cannabis - Testing Laboratory Temporary License': 66,
         'Cannabis - Distributor License': 1,
         'Cannabis - Retailer License': 8,
         'Cannabis - Event Organizer License': 6})

In [17]:
# (rows, columns) of the Weedmaps store table before any joining.
weedmaps_not_joined.shape

(1485, 24)

## Normalize license numbers: strip hyphens (and upper-case the Weedmaps columns)

In [18]:
# Bring both sides of the join into the same format: license numbers without
# hyphens. regex=False makes the literal replacement explicit instead of
# relying on the pandas-version-dependent default of str.replace.
licenses["License Number"] = licenses["License Number"].str.replace("-", "", regex=False)

for col in check_cols:
    # Numeric columns (e.g. a column that is entirely NaN is read as float)
    # have no .str accessor, so only non-numeric columns are normalised.
    if pd.api.types.is_numeric_dtype(weedmaps_not_joined[col]):
        continue
    weedmaps_not_joined[col] = (
        weedmaps_not_joined[col]
        .str.upper()
        .str.replace("-", "", regex=False)
    )

## Explore join statistics


In [19]:
# For each candidate Weedmaps column, outer-join the stores to the licenses and
# print how many rows matched on both sides, followed by a per-"License Type"
# breakdown of those matches.
for i in check_cols:
    try:
        merge = pd.merge(
            weedmaps_not_joined,
            licenses,
            left_on=i,
            right_on="License Number",
            how="outer",
            indicator=True,  # adds "_merge": left_only / right_only / both
        )
    except ValueError:
        # Only the merge itself is allowed to fail here. pd.merge raises
        # ValueError for incompatible key dtypes -- presumably the case for
        # columns that hold no license numbers at all (TODO confirm). Any other
        # error (missing column, interrupt, ...) is a real problem and propagates.
        print("failed", i)
        continue

    matched = merge[merge["_merge"] == "both"]
    print(len(matched), i)

    # Counter keeps first-seen order, so the breakdown is listed in row order.
    stats = Counter(matched["License Type"])
    for x in stats:
        print(x, stats[x])
    print()

failed adult_use_cultivation
242 adult_use_nonstorefront
Cannabis - Retailer Nonstorefront Temporary License 195
Cannabis - Retailer Temporary License 47

185 adult_use_retail
Cannabis - Retailer Temporary License 166
Cannabis - Retailer License 6
Cannabis - Retailer Nonstorefront Temporary License 7
Cannabis - Microbusiness Temporary License 5
Cannabis - Distributor Temporary License 1

1 distributor
Cannabis - Distributor Temporary License 1

failed medical_cultivation
111 medical_nonstorefront
Cannabis - Retailer Nonstorefront Temporary License 106
Cannabis - Retailer Temporary License 5

88 medical_retail
Cannabis - Retailer Temporary License 76
Cannabis - Retailer License 6
Cannabis - Microbusiness Temporary License 1
Cannabis - Retailer Nonstorefront Temporary License 4
Cannabis - Distributor Temporary License 1

24 microbusiness
Cannabis - Microbusiness Temporary License 24




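| . | Adult use cultivation | Adult use nonstorefront | Adult use retail | Distributor | Medical cultivation | Medical nonstorefront | Medical retail | Microbusiness |
|---|---|---|---|---|---|---|---|---|
| retailer nonstorefront temp | 0 | 183 | 7 | 0 | 0 | 105 | 4 | 0 |
| retailer temporary | 0 | 40 | 166 | 0 | 0 | 5 | 71 | 0 |
| retailer | 0 | 0 | 6 | 0 | 0 | 0 | 6 | 0 |
| microbusiness temporary | 0 | 0 | 4 | 0 | 0 | 0 | 1 | 25 |
| distributor temporary | 0 | 40 | 1 | 1 | 0 | 5 | 1 | 0 |

**Note:** several figures in this table do not match the cell output above (for example, the output shows 195 / 47 matches for `adult_use_nonstorefront` and 24 for `microbusiness`); the table appears to come from an earlier run. TODO: refresh it from the current output.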


## Actual join

In [20]:
# Inner-join the Weedmaps stores to the licenses once per column in check_cols
# (in that order) and build:
#
#   licenses_joined      every (store, license) match, stacked across all
#                        columns. Rows are NOT de-duplicated, so a pair that
#                        matches on several columns appears once per column.
#   licenses_not_joined  the rows of `licenses` whose "License Number" matched
#                        no store on any column.
#   c                    running total of matched rows over all columns.
joined_frames = []
licenses_not_joined = licenses.copy()
c = 0
for i in check_cols:
    try:
        join_on_i = pd.merge(
            weedmaps_not_joined,
            licenses,
            left_on=i,
            right_on="License Number",
            how="inner",
        )
    except ValueError:
        # pd.merge raises ValueError for incompatible key dtypes -- presumably
        # the columns that contain no license numbers (TODO confirm). Other
        # errors are not swallowed so that real problems stay visible.
        print("none for", i)
        continue

    print("joined:", join_on_i.shape[0], ' on:', i)
    c += join_on_i.shape[0]

    # Drop every license that found at least one store on this column.
    licenses_not_joined = licenses_not_joined[
        ~licenses_not_joined["License Number"].isin(join_on_i["License Number"])
    ]
    joined_frames.append(join_on_i)

# Concatenate once at the end instead of growing a DataFrame inside the loop;
# fall back to an empty frame when no column could be joined.
licenses_joined = pd.concat(joined_frames) if joined_frames else pd.DataFrame()



none for adult_use_cultivation
joined: 242  on: adult_use_nonstorefront
joined: 185  on: adult_use_retail
joined: 1  on: distributor
none for medical_cultivation
joined: 111  on: medical_nonstorefront
joined: 88  on: medical_retail
joined: 24  on: microbusiness


In [21]:
# Map every license number to the list of store ids ("id" column) it was
# joined to; licenses without any match map to an empty list.
latent_data_structure = {}

# Single pass over the joined rows instead of re-filtering the whole frame for
# each license. Keys are inserted in first-appearance order and ids keep row
# order; .tolist() yields plain Python values so the result can be JSON-encoded.
joined_pairs = zip(
    licenses_joined["License Number"].tolist(),
    licenses_joined["id"].tolist(),
)
for license_number, store_id in joined_pairs:
    latent_data_structure.setdefault(license_number, []).append(store_id)

for license_number in licenses_not_joined["License Number"].unique():
    latent_data_structure[license_number] = []

In [22]:
# Persist the license-number -> store-id-list mapping as JSON.
with open("..//data//latent.json", "w") as latent_file:
    json.dump(latent_data_structure, latent_file)

In [23]:
# Number of distinct license numbers that were matched to at least one store.
licenses_joined["License Number"].unique().size

275

In [24]:
# Try to store the phone numbers of the unmatched licenses as 64-bit integers;
# errors="ignore" keeps the column unchanged when the cast is not possible.
licenses_not_joined["phone"] = licenses_not_joined["phone"].astype(np.int64, errors="ignore")
licenses_not_joined.to_csv("..//data//licenses_not_joined.csv", index=False)

# The matched rows are written out as they are (no phone conversion).
licenses_joined.to_csv('..//data//licenses_joined.csv', index=False)

In [25]:
# Cross-tabulate the "license_type" column (rows) against the
# "Adult-Use/Medicinal" column (columns) for the matched rows.
pd.crosstab(
    index=licenses_joined["license_type"],
    columns=licenses_joined["Adult-Use/Medicinal"],
)

Adult-Use/Medicinal,Adult-Use,BOTH,Medicinal
license_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hybrid,28,475,2
medical,5,56,28
recreational,15,33,0


In [26]:
# Total number of joined rows summed over all check_cols joins.
c

651