# Introduction
For the experiment, we need to reassign the mapping between the experimental unit and the treatment.  
Unfortunately, determining the new assignments means we need some information from other serivces.  Here they are:
1. First and foremost, we need to get the data that caused this mess in the first place - the seed materials that are (some incorrectly!) assigned to the entries in Sets.
2. With the material-to-entry IDs in hand, we need to go to Velocity Materials and get the catalog IDs that were assigned to the combination element in each experiment level. (Remember, Set material is assigned on the lot and/or inventory level!)

With this information in hand, we can identify which entry CAN go to which experimental unit and which are irreparably foobar...

In [1]:
import json
from getpass import getuser
from itertools import accumulate
import requests
import pandas as pd
import numpy as np

In [2]:
experimentsToken = "JbeP7pU6U8rnWvQbhTnO0qSX5o9K"
setsToken = "SF1b4unzrUvBfDFZjNHLDTW5xeuI"
velmatToken = "vvelLHalQU1I9DgjrZkagfkjiBjY"

In [3]:
def getHeaders(token):
  return {
    'oauth_resourceownerinfo': "user_id={0}".format(getuser().lower()), 
    'Authorization': "Bearer {0}".format(token),
    "Content-Type": "application/json",
  }

def getSuffix(env):
  suffix = "-np"
  if env == "prod":
    suffix = ""
  return suffix

def getSetURL(env):
  return "https://api01{suffix}.agro.services/sets-api/v2".format(suffix=getSuffix(env))

def getExperimentURL(env):
  return "https://api01{suffix}.agro.services/experiments-api/v3".format(suffix=getSuffix(env))

def getVelmatURL(env):
  return "https://velmat-search-api.velocity{suffix}.ag/search".format(suffix=getSuffix(env))

treatmentsEndpoint = "/experiments/{id}/treatments"
experimentalUnitsEndpoint = "/experiments/{id}/experimental-units"
setEntriesEndpoint = "/set-entries"

lotQuery = "/{index}?type=INTERNAL_SEED&q=materialId\:{materialId}"

## Get the materials assigned to each set entry

Let's query the Sets service and get the mapping of `setEntryId` -> `seedMaterial` (with `materialType` and `materialId`).

In [4]:
def getSetsByExperiment(id=None, env=None, token=setsToken, *args, **kwargs):
  params = {'sourceId': id, "entries": "true", "limit": 500}
  headers = getHeaders(token)
  response = requests.get(getSetURL(env) + "/sets", params=params, headers=headers)
  response.raise_for_status()
  retval = pd.io.json.json_normalize(response.json(), 
                                     ["entries", "materials"],
                                     [
                                       ["entries", "setId"],
                                       ["entries", "entryId"],
                                       "name",
                                     ],
                                     errors='ignore', max_level=10)
  retval = retval[['materialId', 'materialType', 'productType', 'materialName', 'entries.setId', 'entries.entryId', 'name']]
  retval = retval.rename(columns={"entries.setId": "setId", "entries.entryId": "entryId", "name": "setName"})
  for column in ['materialId', 'setId', 'entryId']:
    retval[column] = retval[column].astype('int64')
  return retval

def getSeedsOnly(df):
  return df[df.materialType == 'internal_seed']
  
def mapEntries(item):
  for entry in item["entries"]:
    newEntry = {}
    newEntry["sourceId"] = item["source"]["sourceId"]
    newEntry["setEntryId"] = entry["entryId"]
    seed = [m for m in entry["materials"] if m["materialType"] == "internal_seed"]
    assert len(seed) <= 1
    if len(seed) == 0:
      print(entry)
    #   raise Error()
    try:
      newEntry["materialId"] = seed[0]["materialId"]
      newEntry["materialType"] = seed[0]["productType"]
    except IndexError:
      pass
    # print(newEntry, entry["entryId"])
  return newEntry

def extractEntries(response):
  return [mapEntries(x) for x in response.json()]

def test_extractEntries(response):
    assert sum([len(i["entries"]) for i in response.json()]) == len(extractEntries(response))

In [5]:
setResponse = getSetsByExperiment(5419, '')
setResponse

Unnamed: 0,materialId,materialType,productType,materialName,setId,entryId,setName
0,50099,internal_seed,lot,50099,37856,3355256,TRT B1 L03
1,50017,internal_seed,lot,50017,37856,3355257,TRT B1 L03
2,49858,internal_seed,lot,49858,37856,3355258,TRT B1 L03
3,50151,internal_seed,lot,50151,37856,3355259,TRT B1 L03
4,74417,internal_seed,lot,74417,37856,3355260,TRT B1 L03
...,...,...,...,...,...,...,...
359,74437,internal_seed,lot,74437,37867,3355823,TRT B1 L01
360,74448,internal_seed,lot,74448,37867,3355824,TRT B1 L01
361,74459,internal_seed,lot,74459,37867,3355825,TRT B1 L01
362,74416,internal_seed,lot,74416,37867,3355826,TRT B1 L01


In [6]:
assert setResponse.materialId.count() == 364, setResponse.materialId.count()

setSeeds = getSeedsOnly(setResponse)

## Get the catalog information from Velmat

Now that we have the seed material information in hand, we need to get the relationship between how Experiments stores the material (the catalog ID) and how Sets stores the material (an inventory or lot ID).

In [7]:
def getMaterialsFromSet(df):
  materials = []
  for index, material in df.iterrows():
    materials.append((
      material.productType, 
      "INTERNAL_SEED", 
      int(material.materialId), 
      int(material.entryId), 
      int(material.setId)
    ))
  return materials

def generateListQuery(materials):
  retval = []
  for index, materialType, materialId, entryId, setId in materials:
    entry = {'_index':index, '_type':materialType, '_id': materialId}
    retval.append(entry)
  return json.dumps(retval, sort_keys=True, indent=2)

def getSetMaterialData(materials, env='', token=velmatToken):
  url = "https://velmat-search-api.velocity-np.ag/v2/load"
  headers = {'Authorization': "Bearer {0}".format(token),
            'Content-Type': 'application/json'}
  query = generateListQuery(materials)
  response = requests.post(url, data=query, headers=headers)
  response.raise_for_status()
  return response

def getMaterialIdMapping(materials):
  """ 
  Return a list of dictionaries with keys 'catalogId', 'lotId', and 'inventoryId'.
  The lot OR the inventory key can == None.
  """
  retval = []
  for catalog in materials.json():
    item = {}
    item["catalogId"] = int(catalog["_source"]["catalog"]["id"])
    item["lotId"] = catalog["_source"].get("lot", {}).get("id", None)
    if int(catalog["_id"]) != item["lotId"]: 
      item["materialId"] = int(catalog["_id"])
    else: 
      item["materialId"] = int(item["lotId"])
    retval.append(item)
  retval = pd.DataFrame(retval)
  for column in ['materialId']:
    retval[column] = retval[column].astype('int64')
  return retval

In [8]:
setMaterials = getMaterialsFromSet(setSeeds)
setCatalogs = getSetMaterialData(setMaterials, env='', token=velmatToken)
# setCatalogs
mappedMaterials = getMaterialIdMapping(setCatalogs)
mappedMaterials.head()

Unnamed: 0,catalogId,lotId,materialId
0,60711,74416,2323667
1,60708,74417,2323668
2,60722,74435,2323691
3,60743,74452,2323699
4,60740,74447,2323685


In [9]:
mappedMaterials.tail()

Unnamed: 0,catalogId,lotId,materialId
139,60760,74465,74465
140,60764,74475,74475
141,60781,74499,74499
142,60769,74493,74493
143,60756,74476,74476


## Identify the treatment that the set entry SHOULD be assigned to

So now we know what catalog item each material comes from.  We need to get the entry-to-catalog mapping so we can then map the entry to the treatment combination in Experiments. 

In [10]:
def mapEntryToCatalog(setMaterial, materials):
  if setMaterial[0] == 'inventory':
    return next(filter(lambda x: x['inventoryId'] == setMaterial[2], materials))
  elif setMaterial[0] == 'lot':
    return next(filter(lambda x: x['lotId'] == setMaterial[2] and x['inventoryId'] is None, materials))
  raise Exception('Nothing found!', setMaterial)
  
# i = mapEntryToCatalog(setMaterials[55], mappedMaterials)
# print(i)

In [11]:
assert type(mappedMaterials.materialId.dtype) == type(setSeeds.materialId.dtype)
entriesToCatalog = setSeeds.merge(mappedMaterials, on=["materialId"], how="inner", copy=True)

entriesToCatalog.head()
# print(entriesToCatalog.count())

Unnamed: 0,materialId,materialType,productType,materialName,setId,entryId,setName,catalogId,lotId
0,50099,internal_seed,lot,50099,37856,3355256,TRT B1 L03,33964,50099
1,50099,internal_seed,lot,50099,37858,3355308,TRT B2 L01,33964,50099
2,50099,internal_seed,lot,50099,37858,3355352,TRT B2 L01,33964,50099
3,50099,internal_seed,lot,50099,37858,3355353,TRT B2 L01,33964,50099
4,50099,internal_seed,lot,50099,37858,3355354,TRT B2 L01,33964,50099


In [12]:
setSeeds.head()

Unnamed: 0,materialId,materialType,productType,materialName,setId,entryId,setName
0,50099,internal_seed,lot,50099,37856,3355256,TRT B1 L03
1,50017,internal_seed,lot,50017,37856,3355257,TRT B1 L03
2,49858,internal_seed,lot,49858,37856,3355258,TRT B1 L03
3,50151,internal_seed,lot,50151,37856,3355259,TRT B1 L03
4,74417,internal_seed,lot,74417,37856,3355260,TRT B1 L03


In [13]:
assert entriesToCatalog[entriesToCatalog.materialId == 2323687].lotId.values == mappedMaterials[mappedMaterials.materialId== 2323687].lotId.values
assert entriesToCatalog[entriesToCatalog.materialId == 2323687].entryId.values == setSeeds[setSeeds.materialId== 2323687].entryId.values

In [14]:
import sgqlc as gql
from sgqlc.endpoint.http import HTTPEndpoint as gqlEndpoint


getTreatmentVariableDataQuery = """
query GetTreatmentVariableData($experimentId: Int!) {
  getTreatmentVariablesByExperimentId(experimentId:$experimentId){
    id,
    name,
    tier,
    treatmentVariableLevels {
      treatmentVariableId,
      valueJSON
    }
  }
}
"""
getUnitsByExperimentIdQuery = """
query GetUnitsByExperimentId($experimentId: Int!) {
  getUnitsByExperimentId(experimentId:$experimentId) {
    id,
    block,
    blockId,
    treatmentId,
    setEntryId
  }
}
"""

getTreatmentsByExperimentIdQuery = """
query GetTreatmentsByExperimentId($experimentId: Int!) {
  getTreatmentsByExperimentId(experimentId:$experimentId){
    id, 
    isControl,
    inAllBlocks,
    blockId,
    combinationElements{
      id,
      treatmentVariableLevelId,
      treatmentId,
      treatmentVariableLevel {
        id,
        valueJSON, 
        nestedLevels {valueJSON}
        associatedLevels {valueJSON}
      }
    },
    controlTypes
  }
}
"""

def getTreatmentVariableData(experimentId, env=''):
  return getFromGraphQL(getTreatmentVariableDataQuery, experimentId, env)['getTreatmentVariableData']

def getUnitsByExperimentId(experimentId, env=''):
  return getFromGraphQL(getUnitsByExperimentIdQuery, experimentId, env)['getUnitsByExperimentId']

def getTreatmentsByExperimentId(experimentId, env=''):
  return getFromGraphQL(getTreatmentsByExperimentIdQuery, experimentId, env)['getTreatmentsByExperimentId']

def getFromGraphQL(query, experimentId, env='', token=experimentsToken):
  url = "https://api01-np.agro.services/experiments-api-graphql/v1/graphql" 
  headers = getHeaders(experimentsToken)
  variables = {"experimentId": experimentId}

  endpoint = gqlEndpoint(url, headers)
  data = endpoint(query, variables)
  return data['data']



In [15]:
units = getUnitsByExperimentId(5419)

In [16]:
df2 = pd.DataFrame(units)
for column in ['blockId', 'treatmentId', 'setEntryId', "id"]:
  df2[column] = df2[column].astype('int64')
df2 = df2.rename(columns={"setEntryId": "entryId", "id": "experimentalUnitId"})
# df2.set_index(['treatmentId'], inplace=True)
df2.head()

Unnamed: 0,experimentalUnitId,block,blockId,treatmentId,entryId
0,17387955,TRT-2,5914,351574,3355359
1,17388059,TRT-2,5914,351574,3355463
2,17388163,TRT-2,5914,351574,3355411
3,17388267,TRT-2,5914,351574,3355567
4,17388371,TRT-2,5914,351574,3355775


In [17]:
assert df2.index.size == 1040 # df2[df2.entryId== 3355359]

##### Bad!  This will join to reflect the CURRENT state of the system, NOT what we want to achieve...

```
y = x.merge(df2, on=["entryId"], how="inner", copy=True)
```

In [18]:
treatments = getTreatmentsByExperimentId(5419)

In [19]:
treatments[4]

{'id': 351479,
 'isControl': 'false',
 'inAllBlocks': False,
 'blockId': 5913,
 'combinationElements': [{'id': 525379,
   'treatmentVariableLevelId': 317462,
   'treatmentId': 351479,
   'treatmentVariableLevel': {'id': 317462,
    'valueJSON': {'items': [{'label': 'Seed',
       'value': 60708,
       'objectType': 'Catalog',
       'catalogType': 'INTERNAL_SEED',
       'isPlaceholder': False}],
     'objectType': 'Cluster'},
    'nestedLevels': [],
    'associatedLevels': []}}],
 'controlTypes': []}

In [20]:
parsedTreatments = []
for treatment in treatments:
  for element in treatment["combinationElements"]:
    for item in element["treatmentVariableLevel"]['valueJSON']['items']:
      if item['catalogType'] == "INTERNAL_SEED":
        if 'value' not in item:
          catalogId = np.nan
        else:
          catalogId = item['value']
        parsedTreatments.append({
          "catalogId": catalogId,
          "variableLabel": item["label"],
          "treatmentId": treatment["id"],
          "combinationId": element["id"],
          "treatmentVariableLevelId": element["treatmentVariableLevelId"]
        })

df3 = pd.DataFrame(parsedTreatments)
# df3.set_index(['treatmentId'], inplace=True)
df3.head()

Unnamed: 0,catalogId,variableLabel,treatmentId,combinationId,treatmentVariableLevelId
0,33964.0,Seed,351475,525375,317458
1,33804.0,Seed,351476,525376,317459
2,33786.0,Seed,351477,525377,317460
3,34005.0,Seed,351478,525378,317461
4,60708.0,Seed,351479,525379,317462


In [21]:
assert df3.catalogId.shape[0] == 100

In [22]:
assert pd.unique(df3.catalogId).shape[0] == 93  # Combines the 'dummy' catalog values (e.g. "Seed93")

In [23]:
txToUnits = df3.merge(df2, on=["treatmentId"], how="inner", copy=True, validate="one_to_many")
# txToUnits.set_index(['treatmentId'], inplace=True)
txToUnits.count()

catalogId                    960
variableLabel               1040
treatmentId                 1040
combinationId               1040
treatmentVariableLevelId    1040
experimentalUnitId          1040
block                       1040
blockId                     1040
entryId                     1040
dtype: int64

In [24]:
txToUnits.head()

Unnamed: 0,catalogId,variableLabel,treatmentId,combinationId,treatmentVariableLevelId,experimentalUnitId,block,blockId,entryId
0,33964.0,Seed,351475,525375,317458,17387852,TRT-1,5913,3355776
1,33964.0,Seed,351475,525375,317458,17387956,TRT-1,5913,3355464
2,33964.0,Seed,351475,525375,317458,17388060,TRT-1,5913,3355256
3,33964.0,Seed,351475,525375,317458,17388164,TRT-1,5913,3355204
4,33964.0,Seed,351475,525375,317458,17388268,TRT-1,5913,3355672


In [25]:
txToUnits.tail()

Unnamed: 0,catalogId,variableLabel,treatmentId,combinationId,treatmentVariableLevelId,experimentalUnitId,block,blockId,entryId
1035,,Seed,351574,525474,317557,17388475,TRT-2,5914,3355619
1036,,Seed,351574,525474,317557,17388579,TRT-2,5914,3356243
1037,,Seed,351574,525474,317557,17388683,TRT-2,5914,3355931
1038,,Seed,351574,525474,317557,17388787,TRT-2,5914,3355983
1039,,Seed,351574,525474,317557,17388891,TRT-2,5914,3356191


In [26]:
assert txToUnits[txToUnits.treatmentId == 351475].shape[0] == 20  # experimental blocks (2) * locations (10) * reps/location (1)

In [27]:
txToUnits[txToUnits["entryId"]==3355359]

Unnamed: 0,catalogId,variableLabel,treatmentId,combinationId,treatmentVariableLevelId,experimentalUnitId,block,blockId,entryId
1030,,Seed,351574,525474,317557,17387955,TRT-2,5914,3355359


## The home stretch

Now we have two competing DataFrames: `entriesToCatalog` and `txToUnits` which represent the state of the Sets app and the Experiments app, respectively.  To identify the changes to be made we need to merge these together, but be careful!  We know that the mapping between `experimentalUnitId` and `entryId` is wrong in some places, so we must remember to find where the `catalogId` fields are mismatched.

In [28]:
# entriesToCatalog.set_index('catalogId', inplace=True)
entriesToCatalog.head()

Unnamed: 0,materialId,materialType,productType,materialName,setId,entryId,setName,catalogId,lotId
0,50099,internal_seed,lot,50099,37856,3355256,TRT B1 L03,33964,50099
1,50099,internal_seed,lot,50099,37858,3355308,TRT B2 L01,33964,50099
2,50099,internal_seed,lot,50099,37858,3355352,TRT B2 L01,33964,50099
3,50099,internal_seed,lot,50099,37858,3355353,TRT B2 L01,33964,50099
4,50099,internal_seed,lot,50099,37858,3355354,TRT B2 L01,33964,50099


In [29]:
# txToUnits.set_index(['catalogId', 'entryId'], inplace=True)
txToUnits.head()

Unnamed: 0,catalogId,variableLabel,treatmentId,combinationId,treatmentVariableLevelId,experimentalUnitId,block,blockId,entryId
0,33964.0,Seed,351475,525375,317458,17387852,TRT-1,5913,3355776
1,33964.0,Seed,351475,525375,317458,17387956,TRT-1,5913,3355464
2,33964.0,Seed,351475,525375,317458,17388060,TRT-1,5913,3355256
3,33964.0,Seed,351475,525375,317458,17388164,TRT-1,5913,3355204
4,33964.0,Seed,351475,525375,317458,17388268,TRT-1,5913,3355672


In [30]:
final = txToUnits.merge(entriesToCatalog, on='entryId', how="inner", suffixes=('_e', '_s'), copy=True)
# final.set_index('setName', inplace=True)
print(final.shape[0])
entryIdColumnIndex = final.columns.get_loc("entryId")
setColumns = final.columns[-9:]

364


In [31]:
mismatched = final[(final["catalogId_e"] != final["catalogId_s"])] # | (final["catalogId_e"] == np.nan)]

In [32]:
txToUnits.shape, entriesToCatalog.shape, mismatched.shape

((1040, 9), (364, 9), (234, 17))

In [33]:
mismatched[(mismatched.block == 'TRT-1') & (mismatched.catalogId_e == 60745)]

Unnamed: 0,catalogId_e,variableLabel,treatmentId,combinationId,treatmentVariableLevelId,experimentalUnitId,block,blockId,entryId,materialId,materialType,productType,materialName,setId,setName,catalogId_s,lotId
164,60745.0,Seed,351506,525406,317489,17387991,TRT-1,5913,3355495,74456,internal_seed,lot,74456,37861,TRT B1 L02,60784,74456
165,60745.0,Seed,351506,525406,317489,17388095,TRT-1,5913,3355287,74429,internal_seed,lot,74429,37856,TRT B1 L03,60713,74429
166,60745.0,Seed,351506,525406,317489,17388199,TRT-1,5913,3355235,2323690,internal_seed,inventory,2323690,37857,TRT B1 L04,60751,74460
167,60745.0,Seed,351506,525406,317489,17388303,TRT-1,5913,3355703,74429,internal_seed,lot,74429,37864,TRT B1 L05,60713,74429


In [34]:
txToUnits[(txToUnits.block == 'TRT-1') & (txToUnits.catalogId == 60745)]

Unnamed: 0,catalogId,variableLabel,treatmentId,combinationId,treatmentVariableLevelId,experimentalUnitId,block,blockId,entryId
350,60745.0,Seed,351506,525406,317489,17387887,TRT-1,5913,3355807
351,60745.0,Seed,351506,525406,317489,17387991,TRT-1,5913,3355495
352,60745.0,Seed,351506,525406,317489,17388095,TRT-1,5913,3355287
353,60745.0,Seed,351506,525406,317489,17388199,TRT-1,5913,3355235
354,60745.0,Seed,351506,525406,317489,17388303,TRT-1,5913,3355703
355,60745.0,Seed,351506,525406,317489,17388407,TRT-1,5913,3355859
356,60745.0,Seed,351506,525406,317489,17388511,TRT-1,5913,3355651
357,60745.0,Seed,351506,525406,317489,17388615,TRT-1,5913,3356067
358,60745.0,Seed,351506,525406,317489,17388719,TRT-1,5913,3356015
359,60745.0,Seed,351506,525406,317489,17388823,TRT-1,5913,3356119


In [35]:
final[(final.block == 'TRT-1') & (final.catalogId_e == 60745)]

Unnamed: 0,catalogId_e,variableLabel,treatmentId,combinationId,treatmentVariableLevelId,experimentalUnitId,block,blockId,entryId,materialId,materialType,productType,materialName,setId,setName,catalogId_s,lotId
163,60745.0,Seed,351506,525406,317489,17387887,TRT-1,5913,3355807,74455,internal_seed,lot,74455,37867,TRT B1 L01,60745,74455
164,60745.0,Seed,351506,525406,317489,17387991,TRT-1,5913,3355495,74456,internal_seed,lot,74456,37861,TRT B1 L02,60784,74456
165,60745.0,Seed,351506,525406,317489,17388095,TRT-1,5913,3355287,74429,internal_seed,lot,74429,37856,TRT B1 L03,60713,74429
166,60745.0,Seed,351506,525406,317489,17388199,TRT-1,5913,3355235,2323690,internal_seed,inventory,2323690,37857,TRT B1 L04,60751,74460
167,60745.0,Seed,351506,525406,317489,17388303,TRT-1,5913,3355703,74429,internal_seed,lot,74429,37864,TRT B1 L05,60713,74429


In [36]:
entriesToCatalog[entriesToCatalog.catalogId == 60745].sort_values(by=['entryId'])

Unnamed: 0,materialId,materialType,productType,materialName,setId,entryId,setName,catalogId,lotId
208,2323698,internal_seed,inventory,2323698,37857,3355212,TRT B1 L04,60745,74455
337,74455,internal_seed,lot,74455,37860,3355443,TRT B2 L02,60745,74455
338,74455,internal_seed,lot,74455,37861,3355491,TRT B1 L02,60745,74455
339,74455,internal_seed,lot,74455,37867,3355807,TRT B1 L01,60745,74455


In [37]:
entriesToCatalog

Unnamed: 0,materialId,materialType,productType,materialName,setId,entryId,setName,catalogId,lotId
0,50099,internal_seed,lot,50099,37856,3355256,TRT B1 L03,33964,50099
1,50099,internal_seed,lot,50099,37858,3355308,TRT B2 L01,33964,50099
2,50099,internal_seed,lot,50099,37858,3355352,TRT B2 L01,33964,50099
3,50099,internal_seed,lot,50099,37858,3355353,TRT B2 L01,33964,50099
4,50099,internal_seed,lot,50099,37858,3355354,TRT B2 L01,33964,50099
...,...,...,...,...,...,...,...,...,...
359,74448,internal_seed,lot,74448,37861,3355508,TRT B1 L02,60770,74448
360,74448,internal_seed,lot,74448,37867,3355824,TRT B1 L01,60770,74448
361,74459,internal_seed,lot,74459,37860,3355461,TRT B2 L02,60748,74459
362,74459,internal_seed,lot,74459,37861,3355509,TRT B1 L02,60748,74459


In [41]:
# df = final.loc[(final.setName == "TRT B1 L02")].iloc(:, [.copy()
duplicateCatalogIds, expIndices, setIndices = np.intersect1d(df.catalogId_e.values, df.catalogId_s.values, return_indices=True)
print(expIndices[:])
print()
print(setIndices[:])

[ 2  1  0  3 40  4  7 45 50 32  8 36  5 24  9 41 13 25 20 37 28 42 51 21
 33 29 16 47 46 26 12 22 43 17 18 27 31 49 10 19  6 34 38 48 30 11 23 15
 35 39 44 14]

[50 49 48 51 36  0  3 41 46 28  4 32  1 20  5 37  9 21 16 33 24 38 47 17
 29 25 12 43 42 22  8 18 39 13 14 23 27 45  6 15  2 30 34 44 26  7 19 11
 31 35 40 10]


In [38]:
def switchRows(df, i1, i2, verbose):
  r1 = df.loc[i1, setColumns]
  r2 = df.loc[i2, setColumns]
  df.loc[i1, setColumns] = r2
  df.loc[i2, setColumns] = r1
  if verbose: 
    print(i1, '<-->', i2, end=" ")
  return df

def correctUnitEntryAssociations(df, verbose):
  df = df.copy()
  duplicateCatalogIds, expIndices, setIndices = np.intersect1d(df.catalogId_e.values, df.catalogId_s.values, return_indices=True)
  if verbose:
    print(expIndices)
    print(setIndices)
  # Some materials are doubled up and some are missing...
  modified = []
  if ( len(pd.unique(df.catalogId_s)) <= len(pd.unique(df.catalogId_e)) ) and len(duplicateCatalogIds) > 0:
    for e in expIndices:
      # Two cases:
      #   1) Row has one material value that's needed
      #   2) Row has two material values that are needed
      #
      # Simple case:
      if verbose: 
        print(e)
      start = e.copy()
      if e not in setIndices:  # These are the 'chain ends' for case #2!
        s = setIndices[np.argwhere(expIndices == e)[0][0]]
        if s not in expIndices:
          if verbose: 
            print('Single swap: ', end="\t")
          df = switchRows(df, e, s, verbose)
        else:
          while s in expIndices:  # Found a chain end
            df = switchRows(df, e, s, verbose)  # index = 40, ni = 16
            modified.append(e)  # TODO
            e = s                      # i = 16
            s = setIndices[np.argwhere(expIndices == s)[0][0]]
          # Wrap up the chain
          df = switchRows(df, s, e, verbose)
          modified.append(s)  # TODO
          if verbose: 
            print('Wrapped up!')
      else:
        
    # Check if we have any merry-go-round situations...
    if (len(expIndices) != len(modified)):
      # TODO...
        
  return df

IndentationError: expected an indented block (<ipython-input-38-55ecbfe2dcad>, line 49)

### Scenario 1

In [None]:
scenario1 = final[final.setName == "TRT B1 L01"].copy()
assert scenario1.shape[0] == 52, scenario1.shape
scenario1.index = list(range(scenario1.shape[0]))
fixed1 = correctUnitEntryAssociations(scenario1, False)
mismatched = fixed1[(fixed1["catalogId_e"] != fixed1["catalogId_s"])]
assert mismatched[(mismatched.setName == 'TRT B1 L01')].size == 0  # Scenario 1: happy path, user did everything correctly

### Scenario 2

In [None]:
scenario2 = final[final.setName == "TRT B2 L01"].copy()
assert scenario2.shape[0] == 52, scenario2.shape
scenario2.index = list(range(scenario2.shape[0]))
fixed2 = correctUnitEntryAssociations(scenario2, False)
mismatched = fixed2[(fixed2["catalogId_e"] != fixed2["catalogId_s"])]
assert mismatched[(mismatched.setName == "TRT B2 L01") & (mismatched.catalogId_e == np.nan)].shape[0] == 0  # Scenario 2: happy path, user did everything correctly w/ exp placeholders

### Scenario 3

In [None]:
scenario3 = final[final.setName == "TRT B1 L02"].copy()
assert scenario3.shape[0] == 52, scenario3.shape
assert scenario3[(scenario3.productType != 'lot')].shape[0] == 0
scenario3.index = list(range(scenario3.shape[0]))
fixed3 = correctUnitEntryAssociations(scenario3, True)
mismatched = fixed3[(fixed3["catalogId_e"] != fixed3["catalogId_s"])]
# assert mismatched.shape[0] == 0  # Scenario 3: Set has correct materials but all assigned to wrong treatment.  Same as "TRT B1 L04" but with lots instead of inv


In [None]:
duplicateCatalogIds, expIndices, setIndices = np.intersect1d(fixed3.catalogId_e.values, fixed3.catalogId_s.values, return_indices=True)
print(expIndices)
print(setIndices)

### Scenario 4

In [None]:
scenario4 = final[final.setName == "TRT B1 L05"].copy()
assert scenario4.shape[0] == 52, scenario4.shape
scenario4.index = list(range(scenario4.shape[0]))
fixed4 = correctUnitEntryAssociations(scenario4)

In [None]:
correctlyAssigned = scenario4[(scenario4["catalogId_e"] == scenario4["catalogId_s"])]
assert correctlyAssigned.shape[0] == 2, correctlyAssigned.shape

In [None]:
# duplicateCatalogIds, expIndices, setIndices = np.intersect1d(fixed4.catalogId_e.values, fixed4.catalogId_s.values, return_indices=True)
# print(expIndices)
# print(setIndices)

In [None]:
# Get the test data for the complex case
# fixed4.iloc[[47, 21, 7, 2, 26], fixed4.columns.get_indexer(["catalogId_e", "catalogId_s", "experimentalUnitId", "entryId"])]

In [None]:
# scenario4.iloc[[21, 7, 2, 26, 47], scenario4.columns.get_indexer(["catalogId_e", "catalogId_s", "experimentalUnitId", "entryId"])]

#### Test case 1: Simple chain
```
47 wants to switch with 21,
21 wants to switch with  7,
 7 wants to switch with  2,
 2 wants to switch with 26,
26 must switch with 47 to complete the chain.
```

#### Test case 2: Simple chain (longer)
```
40 wants to switch with 16,
16 wants to switch with  5, 
 5 wants to switch with  1, 
 1 wants to switch with 25, 
25 wants to switch with  9, 
 9 wants to switch with  4, 
 4 wants to switch with  0, 
 0 wants to switch with 24, 
24 wants to switch with  8, 
 8 wants to switch with  3, 
 3 wants to switch with 27, 
27 must switch with 40 to complete the chain.
```

#### Test case 3: Do-see-do
```
45 and 19 want to switch with each other.
```

#### Test case 4: Merry-go-round
```
x wants to switch with y,
y wants to switch with z,
z wants to switch with x.
```


In [None]:
cases = [  # format: ( expected order, original order)
  ([21, 7, 2, 26, 47], [47, 21, 7, 2, 26]),
  ([16, 5, 1, 25, 9, 4, 0, 24, 8, 3, 27, 40], [40, 16, 5, 1, 25, 9, 4, 0, 24, 8, 3, 27]),
  ([19, 45], [45, 19]),
]
for e, a in cases:
  c =  scenario4.columns.get_indexer(setColumns)
  expected = scenario4.iloc[e, c].values
  actual = fixed4.iloc[a, c].values
  assert np.array_equal(expected, actual), "Expected != actual {0}\n{1}".format((e, a), np.where(expected != actual, expected, np.empty_like(expected)))

# Check that the experiments side of the table is unaltered
expectedExp = scenario4.iloc[:, :7].values
actualExp = fixed4.iloc[:, :7].values
# print(expectedExp, actualExp)
assert np.all(expectedExp == actualExp), "Violation: experiment values have been altered! {0}".format(np.where(expectedExp != actualExp, expectedExp, np.empty_like(expectedExp)))

# Check the count of the catalog matches has increased by X
previousMatches = scenario4[(scenario4["catalogId_s"] == scenario4["catalogId_e"])].shape[0]
finalMatches = fixed4[(fixed4["catalogId_s"] == fixed4["catalogId_e"])].shape[0]
assert finalMatches == 28, "Did not match the test criteria"
assert finalMatches - previousMatches == 26, "{0}, {1}".format(finalMatches, previousMatches)

### Scenario 5

In [None]:
scenario5 = final[(final.setName == "TRT B2 L02")].copy()  # Set with materials from the wrong block assigned to 48 of 52 entries
scenario5.index = list(range(scenario5.shape[0]))
scenario5.shape

In [None]:
fixed5 = correctUnitEntryAssociations(scenario5)
# fixed5.iloc[40:, :]

In [None]:
# scenario5.iloc[40:, :]

In [None]:
# The last seven records are for checks, so the catalogId_e is NaN (which DataFrame.eq can't handle), so we'll ignore them for the first assertion
assert np.all(fixed5.iloc[:40, :].eq(scenario5.iloc[:40, :])), "Violation: values have been altered!\n {0}".format(fixed5.eq(scenario5))
assert np.all(fixed5.iloc[40:, 1:].eq(scenario5.iloc[40:, 1:])), "Violation: values have been altered!\n {0}".format(fixed5.eq(scenario5))

### Scenario 6

In [None]:
scenario6 = final[(final.setName == "TRT B1 L04")].copy()
scenario6.index = list(range(scenario6.shape[0]))
assert scenario6.shape[0] == 52  # Scenario 5: Set with the right materials; all are assigned to the wrong treatment.  Same as "TRT B1 L02" but with inv instead of lots
assert scenario6[(scenario6.productType != 'inventory')].shape[0] == 0
