# Problem

For a fixed positive integer $k$, order all possible $k$-mers taken from an underlying alphabet **lexicographically**.

Then the $k$-mer composition of a string $s$ can be represented by an array $A$ for which $A[m]$ denotes the number of times that the $m$th $k$-mer (with respect to the lexicographic order) appears in $s$.

**Given** : A DNA string $s$ in FASTA format (having length at most 100 kbp).

**Return** : The 4-mer composition of $s$.

In [10]:
import toolz as tz
from toolz.curried import *
from operator import *
from itertools import product

In [16]:
# Read the test dataset through a context manager so the file handle is
# closed as soon as its contents are in memory (a bare open().read() leaves
# the handle open until it happens to be garbage collected).
with open("data/rosalind_kmer.txt", "r") as test_file:
    test_fasta = test_file.read()

# FASTA inputs keyed by dataset name; each entry holds the raw FASTA text.
input_data = {
    "sample": {
        "fasta":""">Rosalind_6431
CTTCGAAAGTTTGGGCCGAGTCTTACAGTCGGTCTTGAAGCAAAGTAACGAACTCCACGG
CCCTGACTACCGAACCAGTTGTGAGTACTCAACTGGGTGAGAGTGCAGTCCCTATTGAGT
TTCCGAGACTCACCGGGATTTTCGATCCAGCCTCAGTCCAGTCTTGTGGCCAACTCACCA
AATGACGTTGGAATATCCCTGTCTAGCTCACGCAGTACTTAGTAAGAGGTCGCTGCAGCG
GGGCAAGGAGATCGGAAAATGTGCTCTATATGCGACTAAAGCTCCTAACTTACACGTAGA
CTTGCCCGTGTTAAAAACTCGGCTCACATGCTGTCTGCGGCTGGCTGTATACAGTATCTA
CCTAATACCCTTCAGTTCGCCGCACAAAAGCTGGGAGTTACCGCGGAAATCACAG"""
    },
    "test": {
        "fasta": test_fasta
    }
}
# Select which dataset the following cells operate on: "sample" or "test".
cur_state = "test"
cur_data = input_data[cur_state]

def input_processor(fasta_text):
    """Extract the sequence from FASTA text as one contiguous string.

    Header lines (those starting with ">") and blank lines are dropped and
    the remaining lines are concatenated in order. If the text holds several
    records, their sequences are joined together.

    Each line is stripped of surrounding whitespace before use, so files with
    Windows line endings ("\\r\\n") or trailing spaces do not leak stray
    characters into the returned sequence.

    Args:
        fasta_text: The full FASTA content as a single string.

    Returns:
        The concatenated sequence lines as a str ("" if there are none).
    """
    # splitlines() handles "\n", "\r\n" and "\r" alike; strip() removes any
    # remaining padding around each line.
    stripped_lines = map(str.strip, fasta_text.splitlines())
    return ''.join(
        line for line in stripped_lines
        if line and not line.startswith(">")
    )

@curry
def kmer_composition(k, dna_seq):
    """Count how often each DNA k-mer occurs in ``dna_seq``.

    Every length-``k`` window of ``dna_seq`` (consecutive windows overlap) is
    tallied against the table of all ``4**k`` k-mers over A, C, G, T.

    The function is curried: calling it with ``k`` alone returns a function
    that still expects ``dna_seq``.

    Args:
        k: Length of the k-mers to count.
        dna_seq: The sequence to scan, as a string (or any iterable of
            one-character strings).

    Returns:
        A lazy iterator over the counts, ordered by the lexicographic order
        of the k-mers (AAAA, AAAC, ..., TTTT for k = 4).
    """
    # Iterating over the alphabet in sorted order makes product() emit the
    # k-mers already in lexicographic order, so the counts need no sorting.
    ordered_kmers = [''.join(letters) for letters in product("ACGT", repeat=k)]
    counts = dict.fromkeys(ordered_kmers, 0)

    # Consume the windows one at a time instead of materialising them all.
    for window in tz.sliding_window(k, dna_seq):
        kmer = ''.join(window)
        # A window holding a symbol outside A/C/G/T (for example "N") is not
        # one of the tabulated k-mers; skip it instead of raising KeyError.
        if kmer in counts:
            counts[kmer] += 1

    return map(counts.__getitem__, ordered_kmers)

# Fix k = 4 on the curried counter; the result still expects the sequence.
four_mer_composition = kmer_composition(4)

# End-to-end pipeline: raw FASTA text -> bare sequence -> 4-mer counts.
run = compose(four_mer_composition, input_processor)

# Write the counts on one line, separated by single spaces.
print(' '.join(str(count) for count in run(cur_data["fasta"])))

354 336 385 355 346 344 360 349 337 348 350 369 317 367 363 361 347 327 333 366 318 321 365 321 330 349 354 356 363 344 338 335 325 343 342 323 338 348 344 324 331 343 358 350 366 335 356 375 345 311 321 343 362 324 353 338 391 329 365 349 347 320 334 333 368 340 334 342 314 300 347 323 333 321 340 364 334 335 361 309 358 299 315 339 302 319 342 305 345 347 349 357 322 311 347 339 339 337 324 338 316 337 365 348 373 305 365 319 352 358 351 372 343 353 341 317 338 334 320 344 337 325 362 347 377 337 365 312 344 370 334 365 367 349 320 330 338 352 357 345 339 331 355 353 341 322 344 296 365 298 356 339 327 329 340 373 340 319 329 371 371 355 356 364 329 345 330 352 362 382 358 343 361 341 330 346 385 385 334 322 357 342 321 362 364 318 356 353 374 355 329 399 364 353 351 346 345 332 362 378 325 333 335 354 330 344 355 311 338 336 366 338 326 330 335 354 336 341 319 347 329 363 357 346 378 331 370 353 320 328 330 335 380 326 364 366 347 348 354 364 341 368 351 358 321 345 349 351 340 341 

In [14]:
# Scratch check: an empty string is falsy, so blank lines can be filtered
# out of the FASTA text by truthiness alone.
bool('')

False