# Enumerating k-mers ~~Lexicographically~~ Alphabetically
**Given:** A collection of at most 10 symbols defining an ordered alphabet, and a positive integer *n* (*n*<=10).

**Return:** All strings of length *n* that can be formed from the alphabet, ordered ~~lexicographically~~ alphabetically.



# Sample Dataset

In [1]:
%%file Sample_Dataset.txt
T A G C
2



Overwriting Sample_Dataset.txt


# Sample Output

In [2]:
%%file Sample_Output.txt
TT
TA
TG
TC
AT
AA
AG
AC
GT
GA
GG
GC
CT
CA
CG
CC



Overwriting Sample_Output.txt


# Solution

In [3]:
import itertools
# Worked example on the sample dataset: alphabet and k-mer length are hard-coded.
symbols_string = "T A G C"
symbols = symbols_string.split()
n = 2

# Every length-n tuple over the alphabet, in the order itertools.product emits them.
kmers = list(itertools.product(symbols, repeat=n))

# Join each tuple into one string and sort the strings by plain string comparison.
kmers_as_strings = sorted("".join(str(symbol) for symbol in kmer) for kmer in kmers)

# Bare expression so the notebook displays the list as the cell's output.
kmers_as_strings

['AA',
 'AC',
 'AG',
 'AT',
 'CA',
 'CC',
 'CG',
 'CT',
 'GA',
 'GC',
 'GG',
 'GT',
 'TA',
 'TC',
 'TG',
 'TT']

In [4]:
import itertools
def enumerateKmers(symbols, n):
    """Return every string of length n that can be formed from symbols.

    Each symbol is converted with str() and the symbols of one k-mer are
    concatenated without a separator. The k-mers come back in the order
    itertools.product yields them: positions cycle like an odometer, with the
    rightmost position advancing fastest through symbols in the order given.
    No sorting is done here.

    The problem statement limits the input to at most 10 symbols and n <= 10;
    this function does not check either limit.

    Args:
        symbols: Iterable of alphabet symbols, in the desired order.
        n: Length of each k-mer.

    Returns:
        A list of strings, one per k-mer.
    """
    combos = itertools.product(symbols, repeat=n)
    return ["".join(str(symbol) for symbol in combo) for combo in combos]

def enumerateKmersFromFileToFile(input_file_path, output_file_path):
    """Read an alphabet and k-mer length from a file and write the sorted k-mers to a file.

    Wraps enumerateKmers. The first line of the input file holds the
    whitespace-separated symbols and the second line holds the integer n.
    The k-mers are sorted with list.sort() (plain string comparison, not the
    order in which the alphabet was given) and written one per line, ending
    with a newline.

    Args:
        input_file_path: Path of the dataset file to read.
        output_file_path: Path of the file to create or overwrite.

    Returns:
        None.

    Raises:
        OSError: If either file cannot be opened.
        IndexError: If the input file has fewer than two lines.
        ValueError: If the second line is not an integer.
    """
    # Read and parse the input before opening the output, so a missing or
    # malformed dataset does not truncate an existing result file. The with
    # blocks close both files even when an exception is raised.
    with open(input_file_path, 'r') as input_file:
        input_lines = input_file.readlines()
    symbols = input_lines[0].strip().split()
    n = int(input_lines[1].strip())

    kmers_as_strings = enumerateKmers(symbols, n)

    # Alphabetically, not lexicographically, despite what Rosalind says as of August 29, 2016
    kmers_as_strings.sort()
    kmers_as_string = "\n".join(kmers_as_strings)

    with open(output_file_path, 'w') as output_file:
        output_file.write("%s\n" % kmers_as_string)


# Test Solution

In [5]:
# Solve the sample dataset; the result is written to Test_Output.txt.
enumerateKmersFromFileToFile("Sample_Dataset.txt", "Test_Output.txt")

In [6]:
%%bash
# Print the file name, the MD5 of its sorted lines, then the sorted lines.
sample=Sample_Output.txt
echo "$sample"
md5sum <(sort "$sample")
sort "$sample"

Sample_Output.txt
2b060f4920e3ae20ff3c8c998ed48583  /dev/fd/63
AA
AC
AG
AT
CA
CC
CG
CT
GA
GC
GG
GT
TA
TC
TG
TT


In [7]:
%%bash
# Print the file name, the MD5 of its sorted lines, then the file as written.
result=Test_Output.txt
echo "$result"
md5sum <(sort "$result")
cat "$result"

Test_Output.txt
2b060f4920e3ae20ff3c8c998ed48583  /dev/fd/63
AA
AC
AG
AT
CA
CC
CG
CT
GA
GC
GG
GT
TA
TC
TG
TT


In [8]:
%%bash
# Compare the two files by the MD5 of their sorted lines. Sorting first makes
# the check order-insensitive: it confirms both files hold the same lines,
# not that the lines appear in the same order.
sample_md5=$(sort Sample_Output.txt | md5sum | cut -f1 -d' ')
test_md5=$(sort Test_Output.txt | md5sum | cut -f1 -d' ')

# Quote both hashes and require a non-empty one: unquoted, two empty hashes
# reduce the test to `[ == ]`, which is true and would report a false match.
if [ -n "$sample_md5" ] && [ "$sample_md5" = "$test_md5" ]
then
    echo Sample output matches test output.
else
    echo Sample output does not Match test output.
fi

Sample output matches test output.


# Downloaded Dataset

In [9]:
%%bash
# Copy the downloaded Rosalind dataset into the working directory and display it.
dataset=rosalind_lexf.txt
cp ~/Downloads/"$dataset" ./
cat "$dataset"

S D Z V
4


# Solution to Downloaded Dataset

In [10]:
# Solve the downloaded dataset; the result is written to Solution_Output.txt.
enumerateKmersFromFileToFile("rosalind_lexf.txt", "Solution_Output.txt")

In [11]:
%%bash
# Print the solution generated for the downloaded dataset.
cat Solution_Output.txt

DDDD
DDDS
DDDV
DDDZ
DDSD
DDSS
DDSV
DDSZ
DDVD
DDVS
DDVV
DDVZ
DDZD
DDZS
DDZV
DDZZ
DSDD
DSDS
DSDV
DSDZ
DSSD
DSSS
DSSV
DSSZ
DSVD
DSVS
DSVV
DSVZ
DSZD
DSZS
DSZV
DSZZ
DVDD
DVDS
DVDV
DVDZ
DVSD
DVSS
DVSV
DVSZ
DVVD
DVVS
DVVV
DVVZ
DVZD
DVZS
DVZV
DVZZ
DZDD
DZDS
DZDV
DZDZ
DZSD
DZSS
DZSV
DZSZ
DZVD
DZVS
DZVV
DZVZ
DZZD
DZZS
DZZV
DZZZ
SDDD
SDDS
SDDV
SDDZ
SDSD
SDSS
SDSV
SDSZ
SDVD
SDVS
SDVV
SDVZ
SDZD
SDZS
SDZV
SDZZ
SSDD
SSDS
SSDV
SSDZ
SSSD
SSSS
SSSV
SSSZ
SSVD
SSVS
SSVV
SSVZ
SSZD
SSZS
SSZV
SSZZ
SVDD
SVDS
SVDV
SVDZ
SVSD
SVSS
SVSV
SVSZ
SVVD
SVVS
SVVV
SVVZ
SVZD
SVZS
SVZV
SVZZ
SZDD
SZDS
SZDV
SZDZ
SZSD
SZSS
SZSV
SZSZ
SZVD
SZVS
SZVV
SZVZ
SZZD
SZZS
SZZV
SZZZ
VDDD
VDDS
VDDV
VDDZ
VDSD
VDSS
VDSV
VDSZ
VDVD
VDVS
VDVV
VDVZ
VDZD
VDZS
VDZV
VDZZ
VSDD
VSDS
VSDV
VSDZ
VSSD
VSSS
VSSV
VSSZ
VSVD
VSVS
VSVV
VSVZ
VSZD
VSZS
VSZV
VSZZ
VVDD
VVDS
VVDV
VVDZ
VVSD
VVSS
VVSV
VVSZ
VVVD
VVVS
VVVV
VVVZ
VVZD
VVZS
VVZV
VVZZ
VZDD
VZDS
VZDV
VZDZ
VZSD
VZSS
VZSV
VZSZ
VZVD
VZVS
VZVV
VZVZ
VZZD
VZZS
VZZV
VZZZ
ZDDD
ZDDS
ZDDV
ZDDZ
ZDSD
ZDSS
ZDSV
ZDSZ
