In [1]:
import requests

url = 'https://kata.geosci.ai/challenge/sequence'  # <--- In week 2, you'll change the name.
r = requests.get(url)
r.status_code

200

In [2]:
from IPython.display import Markdown
Markdown(r.text)

# Sequence

You have a string of lithology codes, reading from the **bottom up** of a geological section. There is a sample every metre. There are three lithologies:

- **M**udstone
- **F**ine sandstone or siltstone
- **S**andstone

The strings look like this:

      ...MFFSSFSSSS...

Your data, when you receive it, will be much longer than this.

We need to get some geological information from this string of codes. Specifically, you need to answer 3 questions:

1. What is the total thickess in metres of sandstone (`S`)? Each sample represents one metre.
2. How many sandstone beds are there? A bed is a contiguous group of one lithology, so `MMFFF` is 2 beds, one of `M` and one of `F`.
3. How many times does the most common *upwards* bed transition occur? Do not include transitions from a lithology to itself.

Remember that the sequence is given to you from the bottom up. So an upwards transition is equivalent to a transition to the right.


## Example

Here is some example input:

      SSMMFFFFFFFFSSMFFSSFSSSSFMFSSSSFFSSFFFMM
      ^^          ^^   ^^ ^^^^   ^^^^  ^^

And the answers to the 3 questions:

- In this example, the total thickess of sandstone is 16 m. So the required answer is: **16**
- There are 6 sandstone beds in the sequence (marked above). The answer is: **6**
- The most common bed transition is `F` to `S`, which occurs 5 times. So the answer is: **5**


## A quick reminder how this works

You can retrieve your data by choosing any Python string as a **`<KEY>`** and substituting here:
    
    https://kata.geosci.ai/challenge/sequence?key=<KEY>
                                                  ^^^^^
                                                  use your own string here

To answer question 1, make a request like:

    https://kata.geosci.ai/challenge/sequence?key=<KEY>&question=1&answer=1234
                                                  ^^^^^          ^        ^^^^
                                                  your key       Q        your answer

[Complete instructions at kata.geosci.ai](https://kata.geosci.ai/challenge)

----

© 2020 Agile Scientific, licensed CC-BY

In [3]:
from collections import Counter

def net_thickness(section, facies):
    """Given a string representing a 'measured section' of facies, return the net thickness of the given facies.
    
    Example:
    ---------
    >>> net_thickness(section='MMSFM', facies='M')
    
    3
    """
    c = Counter(section)
    
    return c[facies]

assert net_thickness('MMSFM', 'M') == 3
assert net_thickness('MMSFM', 'S') == 1
assert net_thickness('MMSFM', 'C') == 0
assert net_thickness('SSMMFFFFFFFFSSMFFSSFSSSSFMFSSSSFFSSFFFMM', 'S') == 16


def split_section_into_beds(section):
    """Split a section up into a list of strings of contiguous facies
    
    Example:
    ---------
    >>> split_section_into_beds('MMMSSSFFM')
    
    ['MMM', 'SSS', 'FF', 'M']
    """
    beds = list()
    
    i = 0
    while i < len(section):
        current_facies = section[i]
        current_bed = ''

        while i < len(section) and current_facies == section[i]:
            current_bed += section[i]
            i += 1            

        beds.append(current_bed)
    
    return beds
    
     
assert split_section_into_beds('MSF') == ['M', 'S', 'F']
assert split_section_into_beds('MMSSFF') == ['MM', 'SS', 'FF']
assert split_section_into_beds('MMSSFSF') == ['MM', 'SS', 'F', 'S', 'F']
        

def bed_count(section, facies):
    """Count the number of continuous beds of a given facies within a string representing a measured section"""
    
    # Split into beds
    beds = split_section_into_beds(section)
    # Filter by facies
    facies_beds = [bed for bed in beds if facies in bed]
    # Count beds
    return len(facies_beds)

assert bed_count('MMSSFSF', 'M') == 1
assert bed_count('MMSSFSF', 'S') == 2
assert bed_count('MMSSFSF', 'F') == 2
assert bed_count('MMSSFSF', 'C') == 0

def count_transitions(section):
    "Find all the transitions in a section"
    transitions = (pair for pair in zip(section[:-1], section[1:]) if pair[0] != pair[1])
    return Counter(transitions)


def most_common_transition_count(section):
    """Return the count of the most common transition in a section"""
    return(count_transitions(section).most_common(1)[0][1])

assert most_common_transition_count('SSMMFFFFFFFFSSMFFSSFSSSSFMFSSSSFFSSFFFMM') == 5

# My Submission

In [4]:
my_key = "MH - :)"
params = {'key': my_key}
r = requests.get(url, params)
section = r.text

In [5]:
answer1 = net_thickness(section, 'S')
answer2 = bed_count(section, 'S')
answer3 = most_common_transition_count(section)

In [6]:
print(answer1, answer2, answer3)

5741 2352 2263


## Answer 1

In [7]:
params = {'key': my_key,
          'question': 1,
          'answer': answer1
         }

r = requests.get(url, params)
r

<Response [200]>

In [8]:
r.text

'Correct'

## Answer 2

In [9]:
params['question'] = 2
params['answer'] = answer2

r = requests.get(url, params)
print(r)
r.text

<Response [200]>


'Correct'

## Answer 3

In [10]:
params['question'] = 3
params['answer'] = answer3

r = requests.get(url, params)
print(r)
r.text

<Response [200]>


'Correct! The next challenge is: https://kata.geosci.ai/challenge/boreholes - good luck!'

## Alternate solution using numpy

Using code from [this gist](https://gist.github.com/alimanfoo/c5977e87111abe8127453b21204c1065). I had to modify it slightly to use the `np.char` module for this problem.

In [11]:
import numpy as np


def find_runs(x):
    """Find runs of consecutive items in an array."""

    # ensure array
    x = np.asanyarray(x)
    if x.ndim != 1:
        raise ValueError('only 1D array supported')
    n = x.shape[0]

    # handle empty array
    if n == 0:
        return np.array([]), np.array([]), np.array([])

    else:
        # find run starts
        loc_run_start = np.empty(n, dtype=bool)
        loc_run_start[0] = True
        np.not_equal(x[:-1], x[1:], out=loc_run_start[1:])
        run_starts = np.nonzero(loc_run_start)[0]

        # find run values
        run_values = x[loc_run_start]

        # find run lengths
        run_lengths = np.diff(np.append(run_starts, n))

        return run_values, run_starts, run_lengths
    
def find_char_runs(x):
    """Find runs of consecutive items in an array."""

    # ensure array
    x = np.asanyarray(x)
    if x.ndim != 1:
        raise ValueError('only 1D array supported')
    n = x.shape[0]

    # handle empty array
    if n == 0:
        return np.array([]), np.array([]), np.array([])

    else:
        # find run starts
        loc_run_start = np.empty(n, dtype=bool)
        loc_run_start[0] = True
        loc_run_start[1:] = np.char.not_equal(x[:-1], x[1:]) # this line had to be modified to work with np.char
        run_starts = np.nonzero(loc_run_start)[0]

        # find run values
        run_values = x[loc_run_start]

        # find run lengths
        run_lengths = np.diff(np.append(run_starts, n))

        return run_values, run_starts, run_lengths
    
def bed_count_np(section, facies):
    values, _, _ = find_char_runs(np.array(list(section)))
    return np.sum(values == facies)

In [12]:
a = np.array(list(section))
values, starts, lengths = find_char_runs(a)

np.sum(values == 'S')

2352

In [13]:
%%timeit
assert bed_count('MMSSFSF', 'M') == 1
assert bed_count('MMSSFSF', 'S') == 2
assert bed_count('MMSSFSF', 'F') == 2
assert bed_count('MMSSFSF', 'C') == 0

51.6 µs ± 1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [14]:
%%timeit
assert bed_count_np('MMSSFSF', 'M') == 1
assert bed_count_np('MMSSFSF', 'S') == 2
assert bed_count_np('MMSSFSF', 'F') == 2
assert bed_count_np('MMSSFSF', 'C') == 0

730 µs ± 59.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [15]:
%%timeit
bed_count(section, 'S')

27.5 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit
bed_count_np(section, 'S')

7.05 ms ± 204 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
