In [52]:
import requests
from IPython.display import Markdown

# Base endpoint of the challenge; later cells reuse `url` with query parameters.
url = 'https://kata.geosci.ai/challenge/sample-names'
# With no query parameters the response body is rendered below as Markdown.
r = requests.get(url)
print(r.status_code)
# Last expression of the cell: render the response body as Markdown.
Markdown(r.text)

200


# Sample names

You have a set of sample names. They look like this:

    001235_Ainsa_Sobrarbe_C_2016-04-20_PCx
    ^^^^^^ ^^^^^ ^^^^^^^^ ^ ^^^^^^^^^^ ^^^
      1      2      3     4      5      6

A **valid name** consists of 6 parts separated by underscores. The parts are underlined, above. Having 6 such parts is enough to be called 'valid' (though there may be other problems, for example with the spelling or formatting of individual parts).

The 6 parts are:

- **Unique identifier** consisting of 6 characters.
- **Basin name.** Note that spellings are not guaranteed to be correct.
- **Unit or Formation name.** Note that spellings are not guaranteed to be correct.
- **Specimen type**, either H or C (hand or core).
- **Date**, which must be in ISO 8601 YYYY-MM-DD format to be considered correct.
- **Preparation codes** of at least one character.

We need to extract some information from this dataset.
        
1. How many valid sample names are there?
2. How many valid samples were taken in the Ainsa basin? Include records with misspelt basin names.
3. What's the longest period of days with no valid samples taken in Ainsa?

If looking for misspellings, we'll assume that any word starting and ending in the same letters, but with the middle letters scrambled, is the same word. So 'Anisa' is a misspelling of 'Ainsa', but 'Aimsa' is not. We'll also assume that the spelling with the most occurrences is the correct spelling.


## Example

Here's some sample data:

    001235_Ainsa_Sobrarbe_C_2016-04-20_PCx
    001236_Ainsa_Sobrarbe_H_2016-04-21_P
    001237_Anisa_Sobrarbe_H_2016-04-29_TCx
    001238_Sorbas_Gochar_2017-06-03_PxM
    001238_Sorbas_Gochar_C_2017-06-03_PxM
    001240_SORBAS_Gochar_C_2017-06-03_PxM

Let's answer the 3 questions for this sample dataset:

- There are **5** valid names (and 1 invalid one, with no specimen type).
- The Ainsa Basin appears in **3** sample names (including 1 misspelling).
- There is a **7** day period with no samples taken, between 21 April and 29 April.


## Hints

It's likely that the `datetime` library will be useful in answering question 3. In particular, this code is useful:

    from datetime import datetime
    datetime.fromisoformat('2016-07-03')
    
If that command fails on a date, then you should consider the date format incorrect and ignore that record.


## A quick reminder how this works

You can retrieve your data by choosing any Python string as a **`<KEY>`** and substituting here:
    
    https://kata.geosci.ai/challenge/sample-names?key=<KEY>
                                                      ^^^^^
                                                      use your own string here

To answer question 1, make a request like:

    https://kata.geosci.ai/challenge/sample-names?key=<KEY>&question=1&answer=1234
                                                      ^^^^^          ^        ^^^^
                                                      your key       Q        your answer

[Complete instructions at kata.geosci.ai](https://kata.geosci.ai/challenge)

----

© 2020 Agile Scientific, licensed CC-BY

In [53]:
# The six example names from the challenge text (the fourth has only 5 parts).
# Continuation lines are indented inside the literal; samples_to_df strips
# each line, so the leading whitespace does not affect parsing.
example_input = """001235_Ainsa_Sobrarbe_C_2016-04-20_PCx
                    001236_Ainsa_Sobrarbe_H_2016-04-21_P
                    001237_Anisa_Sobrarbe_H_2016-04-29_TCx
                    001238_Sorbas_Gochar_2017-06-03_PxM
                    001238_Sorbas_Gochar_C_2017-06-03_PxM
                    001240_SORBAS_Gochar_C_2017-06-03_PxM"""

In [273]:
import pandas as pd
from collections import Counter
def samples_to_df(samples):
    """Parse newline-separated sample names into a DataFrame of valid names.

    A name is valid when splitting it on underscores yields exactly six
    parts. The parts themselves are not checked, so a record with a
    malformed date is still kept here.

    Parameters
    ----------
    samples : str
        Sample names, one per line. Whitespace around each line is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per valid name with columns ``uid``, ``basin_name``,
        ``formation_name``, ``specimen_type``, ``date`` and ``prep_code``,
        preceded by an ``index`` column holding the zero-based line number
        of that name within ``samples``.
    """
    columns = ['uid', 'basin_name', 'formation_name', 'specimen_type', 'date', 'prep_code']

    rows = []
    line_numbers = []
    for line_number, line in enumerate(samples.split('\n')):
        parts = line.strip().split('_')
        # Filter on the part count *before* building the frame: handing
        # pandas a row with more parts than there are column names raises
        # ValueError, and so does input in which no row reaches six parts.
        if len(parts) == len(columns):
            rows.append(parts)
            line_numbers.append(line_number)

    # Keep the original line numbers as the index so that reset_index()
    # exposes them in the 'index' column.
    df = pd.DataFrame(rows, index=pd.Index(line_numbers, dtype='int64'), columns=columns)
    return df.reset_index()


def is_misspelling(standard_string, check_string):
    """Return True if check_string matches standard_string under the challenge rules.

    Two words match when they start with the same letter, end with the same
    letter, and contain the same letters overall (the middle letters may be
    scrambled). The comparison ignores case and surrounding whitespace. An
    exact match also returns True.

    Parameters
    ----------
    standard_string : str
        The reference spelling.
    check_string : str
        The spelling to test.

    Returns
    -------
    bool
        False when either string is empty after stripping, since an empty
        string has no first or last letter to compare.
    """
    standard_string = standard_string.lower().strip()
    check_string = check_string.lower().strip()

    # Guard: indexing [0] / [-1] on an empty string would raise IndexError.
    if not standard_string or not check_string:
        return False

    # Equal letter multisets imply equal length, so together with the
    # first/last checks only the middle letters are free to be reordered.
    return (standard_string[0] == check_string[0]
            and standard_string[-1] == check_string[-1]
            and Counter(standard_string) == Counter(check_string))
    
# Spot-checks for is_misspelling under the challenge rules.
assert is_misspelling('Ainsa', 'ainsa')       # case is ignored
assert is_misspelling('Ainsa', 'Ansia')       # middle letters scrambled
assert not is_misspelling('Ainsa', 'naisa')   # first letter differs
assert not is_misspelling('Ainsa', 'Ainssa')  # extra letter


def find_max_date_gap(df):
    """Return the longest run of whole days with no sample between consecutive dates.

    The ``date`` column is parsed as ``YYYY-MM-DD``. Values that fail to
    parse are ignored. For two consecutive sample dates ``d1 < d2`` the
    sample-free period is ``(d2 - d1).days - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column. It is not modified.

    Returns
    -------
    float
        The largest sample-free period in days, or NaN when fewer than two
        dates parse successfully.
    """
    # Work on a derived Series instead of assigning columns back into `df`:
    # callers pass a boolean-mask slice of another frame, and writing into it
    # triggers SettingWithCopyWarning and changes the caller's data.
    dates = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
    dates = dates.dropna().sort_values()

    # diff() between neighbouring sorted dates; subtract 1 because the days
    # on which the two samples were taken are not part of the gap.
    gaps = dates.diff().dt.days - 1
    return gaps.max()


# Run the whole pipeline on the worked example from the challenge text.
df = samples_to_df(example_input)

# .copy() detaches the filtered rows from `df`, so code that modifies the
# selection does not trigger pandas' SettingWithCopyWarning.
is_ainsa = df['basin_name'].apply(lambda x: is_misspelling('Ainsa', x))
ainsa_samples = df[is_ainsa].copy()

assert df.shape[0] == 5                      # Q1: valid names
assert ainsa_samples.shape[0] == 3           # Q2: Ainsa samples incl. misspellings
assert find_max_date_gap(ainsa_samples) == 7  # Q3: longest sample-free period


## Get Data

In [275]:
# Request the dataset associated with the chosen key.
my_key = "MH - :)"
params = dict(key=my_key)
r = requests.get(url, params=params)
print(r)
# The response body is the raw text later handed to samples_to_df.
samples = r.text

<Response [200]>


In [276]:
# Parse the downloaded text; rows without all six parts are dropped.
df = samples_to_df(samples)
# Summarise row count, columns and dtypes of the parsed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9121 entries, 0 to 9120
Data columns (total 7 columns):
index             9121 non-null int64
uid               9121 non-null object
basin_name        9121 non-null object
formation_name    9121 non-null object
specimen_type     9121 non-null object
date              9121 non-null object
prep_code         9121 non-null object
dtypes: int64(1), object(6)
memory usage: 498.9+ KB


In [277]:
# Preview the first rows of the parsed frame.
df.head()

Unnamed: 0,index,uid,basin_name,formation_name,specimen_type,date,prep_code
0,0,67,Sorbas,Zorreras,C,2000-01-01,PTxM
1,1,68,Sorbas,Gochar,H,2000-01-01,P
2,2,69,Sorbas,Gochar,H,2000-01-02,PC
3,3,70,Sorbas,Gochar,H,2000-01-02,TC
4,4,71,SORBAS,Gochar,H,2000-01-03,PTC


In [127]:
# Count each lower-cased basin spelling to see which variants occur and how often.
df['basin_name'].str.lower().value_counts()

jaca      1822
sorbas    1766
tremp     1670
ainsa     1580
asana     1412
jcaa       162
anisa      102
termp       98
trmep       93
aisna       90
aasna       76
asnaa       75
sorabs      59
srobas      58
sobras      58
Name: basin_name, dtype: int64

### Time pd.to_datetime()

In [133]:
%%timeit
# Baseline: parse the date column without specifying a format.
pd.to_datetime(df['date'])

176 ms ± 6.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [141]:
%%timeit
# Variant: ask pandas to infer the format.
# NOTE(review): `infer_datetime_format` appears to be deprecated in pandas 2.x — confirm before re-running.
pd.to_datetime(df['date'], infer_datetime_format=True)

168 ms ± 3.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [134]:
%%timeit
# Variant: explicit format with errors='coerce', the same call find_max_date_gap uses.
pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')

30.1 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Determine Answers

In [278]:
# Q1: every row kept by samples_to_df is a valid name.
answer1 = df.shape[0]

# Q2: rows whose basin name matches 'Ainsa' under the misspelling rule.
# .copy() detaches the selection from `df`, so code that modifies it does
# not trigger pandas' SettingWithCopyWarning.
is_ainsa = df['basin_name'].apply(lambda x: is_misspelling('Ainsa', x))
ainsa_basin_samples = df[is_ainsa].copy()
answer2 = ainsa_basin_samples.shape[0]

# Q3: longest period of days without a valid Ainsa sample.
answer3 = find_max_date_gap(ainsa_basin_samples)

print(answer1, answer2, answer3)

9121 1772 234.0


In [77]:
# Submit the answer to question 1 and show the server's verdict.
params = dict(key=my_key, question=1, answer=answer1)

r = requests.get(url, params=params)
print(r)
print(r.text)

<Response [200]>
Correct


In [208]:
# Submit the answer to question 2 and show the server's verdict.
params = dict(key=my_key, question=2, answer=answer2)

r = requests.get(url, params=params)
print(r)
print(r.text)

<Response [200]>
Correct


In [257]:
# Submit the answer to question 3 and show the server's verdict.
params = dict(key=my_key, question=3, answer=answer3)

r = requests.get(url, params=params)
print(r)
print(r.text)

<Response [200]>
Correct! The next challenge is: https://kata.geosci.ai/challenge/prospecting - good luck!
