# **Part One of the Course Project**
In this project, you will develop and apply Jaccard similarity to find similar presidential speeches and apply Hamming distance to find similar viral DNA sequences.
<hr style="border-top: 2px solid #606366; background: transparent;">

# **Setup**
 
Reset the Python environment to clear it of any previously loaded variables, functions, or libraries. Then, import the libraries and corpora needed for this project.

In [1]:
%reset -f
from IPython.core.interactiveshell import InteractiveShell as IS; IS.ast_node_interactivity = 'all'
import nltk, string, pandas as pd, numpy as np, unittest, numpy.testing as npt
from colorunittest import run_unittest
_ = nltk.download(['inaugural', 'stopwords'], quiet=True) # silently load corpora from NLTK

# NumPy printing: very wide lines, 4 decimals, 20 edge items per axis, no scientific notation
np.set_printoptions(linewidth=10000, precision=4, edgeitems=20, suppress=True) 
# Pandas display: up to 200 characters per cell; frames longer than 6 rows are shown truncated
pd.set_option('max_colwidth', 200, 'display.max_rows', 6)

Next, load two presidential speeches and the list of common English stop words from the NLTK corpus. The stop words are stored in the set `SsStopwords`, and `SsPunct` is a set of English punctuation symbols.

In [2]:
LsObama9    = nltk.corpus.inaugural.words('2009-Obama.txt') # Word tokens of Pres Obama's inaugural speech of 2009
LsObama13   = nltk.corpus.inaugural.words('2013-Obama.txt') # Word tokens of Pres Obama's inaugural speech of 2013
SsStopwords = set(nltk.corpus.stopwords.words('english'))   # set (not list) of common English stop words
SsPunct     = set(string.punctuation)                       # set of single-character English punctuation symbols
print(LsObama9[:15])                                        # preview the first 15 tokens of the 2009 speech

['My', 'fellow', 'citizens', ':', 'I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task', 'before', 'us', ',']


## **Task 1**

Complete the user defined function (UDF) `JS()`, which takes two sets of string tokens, `A` and `B`, and computes Jaccard similarity between them. The other parameters determine the required preprocessing of these sets.

In [3]:
# COMPLETE THIS CELL
def JS(A=set('ABC'), B=set('ABCD'), Lower=False, Stop=SsStopwords, Punct=SsPunct):
    '''Jaccard similarity between sets of string tokens A & B.
    Preprocessing is applied in the order the arguments are listed: Lower, then Stop, then Punct.
    If the denominator is zero, return zero.
    Inputs:
        A, B: two containers (set, list, etc.) with string elements.
        Lower: indicates whether tokens are lower-cased before removal of stopwords and punctuation
        Stop: container of stopword strings
        Punct: container of punctuation strings
    Return: Jaccard similarity after preprocessing of A and B
    '''
    a, b = set(A), set(B)  # initialize a, b (cast to sets, just in case)
#     if Lower: a, b = 
#     if Stop:  a, b = 
#     if Punct: a, b = 
    js = None  # final Jaccard similarity score
    # YOUR CODE HERE
    raise NotImplementedError()  # template placeholder; the working version is defined in a later cell
    return js

In [4]:
# version to work on

def JS(A=set('ABC'), B=set('ABCD'), Lower=False, Stop=SsStopwords, Punct=SsPunct):
    '''Jaccard similarity |A & B| / |A | B| between two collections of string tokens.

    Preprocessing runs in the order the arguments are listed: optional lower-casing,
    then stopword removal, then punctuation removal.
    Inputs:
        A, B: two containers (set, list, str, etc.) with string elements.
        Lower: if truthy, every token is lower-cased before any filtering
        Stop: container of stopword strings; a falsy/empty container skips this step
        Punct: container of punctuation strings; a falsy/empty container skips this step
    Return: Jaccard similarity of the preprocessed sets as a float;
        0.0 when both sets end up empty (zero denominator).
    Note: filtering uses Python's `in`, so a plain string passed as Stop/Punct
        is matched by substring rather than by element.
    '''
    left, right = set(A), set(B)  # set copies, so the callers' containers are never modified

    if Lower:
        left, right = [{tok.lower() for tok in side} for side in (left, right)]

    # Stopwords are filtered first, punctuation second.
    for excluded in (Stop, Punct):
        if excluded:
            left, right = [{tok for tok in side if tok not in excluded} for side in (left, right)]

    n_union = len(left | right)
    if not n_union:
        return 0.0  # nothing left on either side: similarity is defined as zero
    return len(left & right) / n_union

In [5]:
# Ad-hoc check: plain strings are cast to sets of characters, then filtered with the default Stop/Punct
JS("cadcdythggbgbfb fb  bcdt", "cxcdcxcstt")

0.14285714285714285

The tests in the cell below can help you troubleshoot specific failing scenarios.

In [6]:
# RUN CELL TO TEST YOUR CODE
ae = npt.assert_equal  # shorthand for NumPy's assert_equal
# NOTE(review): run_unittest comes from the course's colorunittest module; it presumably runs the
# decorated TestCase immediately and prints a colored report - confirm against that module.
@run_unittest
class Test_JS(unittest.TestCase): # class with methods to test functionality of the JS() function
    # Empty inputs: similarity is 0, including the zero-denominator case
    def test00(self): ae(JS('', 'ABC'), 0)
    def test01(self): ae(JS('ABC', ''), 0)
    def test02(self): ae(JS('', ''), 0)
    # String arguments are treated as containers of single-character tokens; comparison is case-sensitive by default
    def test03(self): ae(JS('ABC', 'ABC'), 1)
    def test04(self): ae(JS('ABC', 'ABc'), 0.5)
    # Custom stopword containers
    def test05(self): ae(JS('ABC', 'ABC', Stop='A'), 1)
    def test06(self): ae(JS('ABC', 'ABc', Stop='c'), 0.6666666666666666)
    def test07(self): ae(JS('ABC', 'ABc', Stop='Cc'), 1)
    # Punctuation characters are removed by the default Punct container
    def test08(self): ae(JS('ABC!', 'ABC', Stop='A'), 1)
    def test09(self): ae(JS('ABC!', 'ABc,', Stop='c'), 0.6666666666666666)
    def test10(self): ae(JS('ABC!', 'ABc%', Stop='Cc'), 1)
    # Lower-casing happens before stopword removal
    def test11(self): ae(JS('ABC', 'ABC', Lower=True, Stop='A'), 1)
    # Word-level tokens
    def test12(self): ae(JS('I like NLP'.split(), 'I like nlp'.split()), 0.5)
    def test13(self): ae(JS('I like NLP'.split(), 'I like nlp'.split(), Lower=True, Stop={}), 1)
    def test14(self): ae(JS('I like NLP'.split(), 'I like nlp'.split(), Lower=True, Stop=['nlp']), 1)
    def test15(self): ae(JS('I like NLP'.split(), 'I like nlp'.split(), Lower=False, Stop=['NLP', 'nlp']), 1)
    # Obama 2009 vs 2013 speeches under every combination of Lower / Stop / Punct switches
    def test16(self): ae(JS(LsObama9, LsObama13, Lower=True, Stop={}, Punct={}),  0.25333333333333335)
    def test17(self): ae(JS(LsObama9, LsObama13, Lower=False, Stop={}, Punct={}), 0.24857954545454544)
    def test18(self): ae(JS(LsObama9, LsObama13, Lower=True, Punct={}),  0.21112006446414183)
    def test19(self): ae(JS(LsObama9, LsObama13, Lower=False, Punct={}), 0.21137586471944658)
    def test20(self): ae(JS(LsObama9, LsObama13, Lower=True, Stop={}), 0.2503725782414307)
    def test21(self): ae(JS(LsObama9, LsObama13, Lower=False, Stop={}), 0.24571428571428572)
    def test22(self): ae(JS(LsObama9, LsObama13, Lower=True), 0.20762368207623683)
    def test23(self): ae(JS(LsObama9, LsObama13, Lower=False), 0.20804331013147717)


Ran 24 tests in 0.086s

[1m[34mOK[0m
test00 (__main__.Test_JS) ... [1m[34mok[0m
test01 (__main__.Test_JS) ... [1m[34mok[0m
test02 (__main__.Test_JS) ... [1m[34mok[0m
test03 (__main__.Test_JS) ... [1m[34mok[0m
test04 (__main__.Test_JS) ... [1m[34mok[0m
test05 (__main__.Test_JS) ... [1m[34mok[0m
test06 (__main__.Test_JS) ... [1m[34mok[0m
test07 (__main__.Test_JS) ... [1m[34mok[0m
test08 (__main__.Test_JS) ... [1m[34mok[0m
test09 (__main__.Test_JS) ... [1m[34mok[0m
test10 (__main__.Test_JS) ... [1m[34mok[0m
test11 (__main__.Test_JS) ... [1m[34mok[0m
test12 (__main__.Test_JS) ... [1m[34mok[0m
test13 (__main__.Test_JS) ... [1m[34mok[0m
test14 (__main__.Test_JS) ... [1m[34mok[0m
test15 (__main__.Test_JS) ... [1m[34mok[0m
test16 (__main__.Test_JS) ... [1m[34mok[0m
test17 (__main__.Test_JS) ... [1m[34mok[0m
test18 (__main__.Test_JS) ... [1m[34mok[0m
test19 (__main__.Test_JS) ... [1m[34mok[0m
test20 (__main__.Test_JS) ... [1m[34mo

## **Task 2**

Complete the UDF `JS_Speech()` and apply the UDF `JS()` to compute Jaccard similarity scores between the `SsQry` set and each presidential speech in NLTK (i.e. `nltk.corpus.inaugural.fileids()`). The JS score and the `fid` of each speech are packaged as a row in a Pandas DataFrame (with a column `JS`). The `fid` value (without `.txt`) is used as the row index. Return the results ordered by decreasing `JS`, so that the top speeches are the ones most similar to `SsQry` with respect to Jaccard similarity.

In [7]:
def JS_Speech(SsQry=LsObama9, Lower=True, Stop=SsStopwords, Punct=SsPunct):
    '''Compute Jaccard similarity of each presidential inaugural speech with SsQry set of words.
    Inputs:
      SsQry: a vocabulary, i.e. set of words describing the query document
      Lower: indicates whether tokens are lower-cased before removal of stopwords and punctuation
      Stop: container of stopword strings
      Punct: container of punctuation strings
    Return: Dataframe with file id (fid) as row index (labeled as `fid`) and Jaccard score column (labeled `JS`),
      with rows ordered by decreasing `JS`.
      Omit `.txt` from fid index values; i.e. use fid index '1993-Clinton' instead of original fid '1993-Clinton.txt'
    '''
    # YOUR CODE HERE
    raise NotImplementedError()  # template placeholder; working versions are defined in later cells
    return df

In [8]:
FIDs = nltk.corpus.inaugural.fileids()[:59]  # keep the first 59 file IDs (incl. 2021-Biden); capped because this list grows over years
print(FIDs[-5:])  # show the five most recent file IDs that were kept

['2005-Bush.txt', '2009-Obama.txt', '2013-Obama.txt', '2017-Trump.txt', '2021-Biden.txt']
['2005-Bush.txt', '2009-Obama.txt', '2013-Obama.txt', '2017-Trump.txt', '2021-Biden.txt']


In [9]:
LsDocs = [nltk.corpus.inaugural.raw(fid) for fid in FIDs]  # raw (untokenized) text of every speech listed in FIDs
[s[:100]+'...' for s in LsDocs[-5:]]  # preview the first 100 characters of the last five speeches

['Vice President Cheney, Mr. Chief Justice, President Carter, President Bush, President Clinton, membe...',
 'My fellow citizens:\n\nI stand here today humbled by the task before us, grateful for the trust you ha...',
 'Thank you. Thank you so much.\n\nVice President Biden, Mr. Chief Justice, Members of the United States...',
 'Chief Justice Roberts, President Carter, President Clinton, President Bush, President Obama, fellow ...',
 'Chief Justice Roberts, Vice President Harris, Speaker Pelosi, Leader Schumer, Leader McConnell, Vice...']

In [10]:
#version to work on

def JS_Speech(SsQry=LsObama9, Lower=True, Stop=SsStopwords, Punct=SsPunct):
    '''Compute Jaccard similarity of each presidential inaugural speech with SsQry set of words.
    Inputs:
      SsQry: a vocabulary, i.e. set of words describing the query document
      Lower: indicates whether tokens are lower-cased before removal of stopwords and punctuation
      Stop: container of stopword strings
      Punct: container of punctuation strings
    Return: DataFrame indexed by `fid` (file id without `.txt`) with a single column `JS`,
      sorted by decreasing `JS`. Speeches are taken from the module-level `FIDs` list.
    '''
    results = []

    # FIDs is a plain list of file ids, so iterate over it directly (it is not callable and
    # holds no text). Tokenize with the NLTK corpus reader rather than str.split() so the
    # tokens are comparable with NLTK-tokenized queries such as the default LsObama9.
    for fid in FIDs:
        words = set(nltk.corpus.inaugural.words(fid))
        jaccard_score = JS(SsQry, words, Lower, Stop, Punct)
        results.append((fid.replace('.txt', ''), jaccard_score))

    # Each record has two fields, so two column labels are required; `fid` then becomes the index.
    df = pd.DataFrame(results, columns=['fid', 'JS']).set_index('fid')

    return df.sort_values(by='JS', ascending=False)

In [11]:
def JS_Speech(SsQry=LsObama9, Lower=True, Stop=SsStopwords, Punct=SsPunct):
    '''Compute Jaccard similarity of each presidential inaugural speech with SsQry set of words.
    Inputs:
      SsQry: a vocabulary, i.e. set of words describing the query document
      Lower: indicates whether tokens are lower-cased before removal of stopwords and punctuation
      Stop: container of stopword strings
      Punct: container of punctuation strings
    Return: DataFrame indexed by `fid` (file id without `.txt`) with a single column `JS`,
      sorted by decreasing `JS`.
    '''
    results = []

    for fid in nltk.corpus.inaugural.fileids():
        speech_words = nltk.corpus.inaugural.words(fid)
        # Forward the preprocessing switches; without them JS() silently falls back to its
        # own defaults (e.g. Lower=False) and the caller's arguments have no effect.
        jaccard_score = JS(SsQry, speech_words, Lower, Stop, Punct)
        results.append((fid.replace('.txt', ''), jaccard_score))

    df = pd.DataFrame(results, columns=['fid', 'JS']).set_index('fid')

    return df.sort_values(by='JS', ascending=False)

In [24]:

def JS_Speech(SsQry=LsObama9, Lower=True, Stop=SsStopwords, Punct=SsPunct):
    '''Rank every NLTK inaugural speech by its Jaccard similarity to the query vocabulary.
    Inputs:
      SsQry: a vocabulary, i.e. set of words describing the query document
      Lower: indicates whether tokens are lower-cased before removal of stopwords and punctuation
      Stop: container of stopword strings
      Punct: container of punctuation strings
    Return: DataFrame indexed by `fid` (file id without `.txt`) with a single column `JS`,
      sorted by decreasing `JS`.
    '''
    # NOTE(review): fileids() is not capped here, so the row count follows the installed
    # corpus; the accompanying test expects 59 rows - confirm against the NLTK data in use.
    corpus = nltk.corpus.inaugural
    scored = [
        (fid.replace('.txt', ''), JS(set(corpus.words(fid)), SsQry, Lower, Stop, Punct))
        for fid in corpus.fileids()
    ]

    return (
        pd.DataFrame(scored, columns=['fid', 'JS'])
        .set_index('fid')
        .sort_values(by='JS', ascending=False)
    )

In [25]:
# Rank all speeches against the default query (LsObama9) with the default preprocessing
JS_Speech()

Unnamed: 0_level_0,JS
fid,Unnamed: 1_level_1
2009-Obama,1.000000
2013-Obama,0.207624
1997-Clinton,0.191667
...,...
1865-Lincoln,0.092593
1829-Jackson,0.082674
1793-Washington,0.022700


In [14]:
# Debug check: third-ranked score next to the value the unit test expects.
# NOTE(review): the saved output of this cell shows a different first value, presumably because
# it was last run against an earlier JS_Speech draft - re-run after the final definition.
(JS_Speech().iloc[2,0], 0.19166666666666668)

(0.19637223974763407, 0.19166666666666668)

In [26]:
# RUN CELL TO TEST YOUR CODE
ae = npt.assert_equal  # shorthand for NumPy's assert_equal
@run_unittest
class Test_JS_Speech(unittest.TestCase): # class with methods to test functionality of the JS_Speech() function
    # Shape: one row per speech, single JS column.
    # NOTE(review): 59 presumably assumes the corpus ends at 2021-Biden; newer NLTK data may add speeches.
    def test00(self): ae(JS_Speech().shape, (59,1))
    # Ordering: the query speech itself ranks first with similarity 1
    def test01(self): ae(list(JS_Speech().T.columns[:2]), ['2009-Obama', '2013-Obama'])
    def test02(self): ae(JS_Speech().T['2009-Obama'].values, 1)
    # Third-ranked score under each combination of Lower / Stop / Punct switches
    def test03(self): ae(JS_Speech().iloc[2,0], 0.19166666666666668)
    def test04(self): ae(JS_Speech(Lower=False, Stop={}).iloc[2,0], 0.23795620437956205)
    def test05(self): ae(JS_Speech(Lower=False, Punct={}).iloc[2,0], 0.2004698512137823)
    def test06(self): ae(JS_Speech(Lower=False, Stop={}, Punct={}).iloc[2,0], 0.24147933284989123)
    def test07(self): ae(JS_Speech(Lower=True, Stop={}).iloc[2,0], 0.23637759017651575)
    def test08(self): ae(JS_Speech(Lower=True, Punct={}).iloc[2,0], 0.19602977667493796)
    def test09(self): ae(JS_Speech(Lower=True, Stop={}, Punct={}).iloc[2,0], 0.24009146341463414)


Ran 10 tests in 4.619s

[1m[34mOK[0m
test00 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test01 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test02 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test03 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test04 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test05 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test06 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test07 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test08 (__main__.Test_JS_Speech) ... [1m[34mok[0m
test09 (__main__.Test_JS_Speech) ... [1m[34mok[0m

----------------------------------------------------------------------



## Task 3

In this task, you are given a Hamming distance UDF `HD()` and a sequence generating UDF `GenSeq()` similar to those you saw in the video. You will apply these functions in Task 3 described below.

In [16]:
# Hamming distance UDF
def HD(s1='ab', s2='ad'):
    '''Hamming distance between two equal-length sequences.
    Inputs:
      s1, s2: sequences (e.g. strings) compared position by position
    Return: number of positions at which s1 and s2 differ,
      or np.inf when the two lengths are not equal.
    '''
    if len(s1) != len(s2):
        return np.inf  # distance is undefined for unequal lengths; signalled as infinity
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))

def GenSeq(nLen=10, seed=int(0), LsElements=list('ACGT')):
    '''Generate a string of nLen elements sampled with replacement from LsElements.
    Inputs:
      nLen: number of elements to sample
      seed: if an integer, NumPy's global RNG is re-seeded with abs(seed) before sampling;
        any other value (e.g. None) leaves the RNG state as it is
      LsElements: list of string elements to sample from
    Return: the sampled elements joined into a single string
    '''
    reseed = isinstance(seed, int)     # only integer seeds trigger re-seeding
    if reseed:
        np.random.seed(abs(seed))      # abs() because negative seeds are not accepted by the RNG
    drawn = np.random.choice(LsElements, nLen, replace=True)
    return ''.join(drawn)

# DNA-specific wrapper around GenSeq() that always samples from the alphabet A, C, G, T
GenDNA = lambda nLen=5, seed=0: GenSeq(nLen, seed, list('ACGT'))
sQry = GenDNA(100, seed=0)  # 100-character query sequence generated with seed 0
sTgt = GenDNA(200, seed=1)  # 200-character target sequence generated with seed 1

Complete the UDF `Marker()`, which takes two same-length strings, query `sQry` and target `sTgt`, and returns a "**marked**" copy of `sTgt` in which every character that matches the corresponding character in `sQry` is replaced with `_`. This function helps to visually highlight the unmatched elements of `sTgt`. If the string lengths differ, the UDF returns `''`.

In [17]:
def Marker(sQry='ACGG', sTgt='ACGT'):
    '''Compares two same-length strings and replaces a character in sTgt with '_' 
        if it matches the corresponding character in sQry. Else leaves the character as is.
    Inputs:
      sQry, sTgt: query and target strings
    Returns: a string with length of sTgt with some characters replaced with '_'.
      If sTgt and sQry have different lengths, UDF returns '' (empty string)
    '''
    # YOUR CODE HERE
    raise NotImplementedError()  # template placeholder; the working version is defined in a later cell
    return sOut  # return string

In [18]:


def Marker(sQry='ACGG', sTgt='ACGT'):
    '''Mask the characters of sTgt that agree with sQry at the same position.
    Inputs:
      sQry, sTgt: query and target strings
    Returns: a copy of sTgt in which every character equal to the corresponding
      character of sQry is replaced with '_'; '' if the two lengths differ.
    '''
    if len(sQry) == len(sTgt):
        # Walk both strings in lockstep; keep the target character only where it differs.
        return ''.join(tgt_ch if tgt_ch != qry_ch else '_' for qry_ch, tgt_ch in zip(sQry, sTgt))
    return ''


In [19]:
# RUN CELL TO TEST YOUR CODE
ae = npt.assert_equal  # shorthand for NumPy's assert_equal
@run_unittest
class Test_Marker(unittest.TestCase): # class with methods to test functionality of the Marker() function
    # Same-length inputs: matching positions become '_'
    def test00(self): ae(Marker(sQry='ACGG', sTgt='ACGT'), '___T')
    def test01(self): ae(Marker(sQry='ACGT', sTgt='ACGT'), '____')
    # Length mismatch in either direction yields the empty string
    def test02(self): ae(Marker(sQry='ACGGA', sTgt='ACGT'), '')
    def test03(self): ae(Marker(sQry='AGT', sTgt='ACGT'), '')
    # Seeded DNA sequences: identical seeds mask everything, different seeds leave mismatches visible
    def test04(self): ae(Marker(GenDNA(20, seed=0), GenDNA(20, seed=0)), '____________________')
    def test05(self): ae(Marker(GenDNA(20, seed=0), GenDNA(20, seed=1)), 'C_A__C_CTAAC__C_GC_A')


Ran 6 tests in 0.001s

[1m[34mOK[0m
test00 (__main__.Test_Marker) ... [1m[34mok[0m
test01 (__main__.Test_Marker) ... [1m[34mok[0m
test02 (__main__.Test_Marker) ... [1m[34mok[0m
test03 (__main__.Test_Marker) ... [1m[34mok[0m
test04 (__main__.Test_Marker) ... [1m[34mok[0m
test05 (__main__.Test_Marker) ... [1m[34mok[0m

----------------------------------------------------------------------



## Task 4

Now, apply `HD()` and `Marker()` to create UDF `RankHD()`, which takes two strings: query `sQry` and target `sTgt`, of varying lengths. Then, for each substring in `sTgt` with length equal to `len(sQry)`, compute the Hamming distance and the corresponding marked string. This can be done by looping over all substrings in `sTgt`, if such substrings exist. Package the results into two dataframe columns, as described in the function docstring below. 

This function can be used to find the closest match to `sQry` inside the `sTgt` and is often used by professionals to rank viral samples by their similarity or distance to the bank of known viral strings (which can have varying length samples).

In [20]:
def RankHD(sQry='ACGT', sTgt='ACTGTCT'):
    '''This UDF uses HD() to rank each substring in sTgt 
        of length N=len(sQry) by its Hamming distance to sQry.
        To do that you can iterate over each N-length substring (i.e. Sample).
        For each Sample compute HD(sQry, Sample) and Marker(Sample, sQry).
        For example, sTgt='ACTGTCT' has 4 Samples of length 4: ACTG,CTGT,TGTC,GTCT
        and Marker('ACTG', 'ACGT') returns '__GT'

    Inputs:
      sQry, sTgt: strings of not necessarily same length.
    Returns:
      dataframe of shape M x 2 with columns 'HD' (rank or Hamming distance)
      and 'Sample', which contains the marked string returned by Marker() for that Sample.
      M is the number of Samples (0 when sTgt is shorter than sQry);
      the accompanying tests expect rows ordered by increasing 'HD'.
    '''
    # YOUR CODE HERE
    raise NotImplementedError()  # template placeholder; the working version is defined in a later cell
    return df # return dataframe 

In [92]:
def RankHD(sQry='ACGT', sTgt='ACTGTCT'):
    '''Rank every len(sQry)-long substring (Sample) of sTgt by its Hamming distance to sQry.
    Inputs:
      sQry, sTgt: strings of not necessarily same length.
    Returns:
      dataframe with one row per Sample and two columns: 'HD', the Hamming distance
      HD(Sample, sQry), and 'Sample', the marked string Marker(Sample, sQry).
      Rows are sorted by increasing 'HD' and keep their original positional index.
      An empty dataframe with the same two columns is returned when sTgt is shorter than sQry.
    '''
    width = len(sQry)
    n_windows = len(sTgt) - width + 1  # number of start positions that fit a full-width window

    if n_windows < 1:  # target shorter than query: no Sample exists
        return pd.DataFrame(columns=['HD', 'Sample'])

    windows = (sTgt[start:start + width] for start in range(n_windows))
    rows = [(HD(window, sQry), Marker(window, sQry)) for window in windows]

    return pd.DataFrame(rows, columns=['HD', 'Sample']).sort_values(by='HD', ascending=True)


In [94]:
# Demo with the default arguments: query 'ACGT' ranked against every 4-character window of 'ACTGTCT'
RankHD()

Unnamed: 0,HD,Sample
0,2,__GT
1,2,AC__
3,3,ACG_
2,4,ACGT


In [93]:
# RUN CELL TO TEST YOUR CODE
ae = npt.assert_equal  # shorthand for NumPy's assert_equal
@run_unittest
class Test_RankHD(unittest.TestCase): # class with methods to test functionality of the RankHD() function
    # Default arguments: 4 windows, best match first, worst match last
    def test00(self): ae(RankHD().shape, (4, 2))
    def test01(self): ae(RankHD().iloc[0,:].values.tolist(), [2, '__GT'])
    def test02(self): ae(RankHD().iloc[-1,:].values.tolist(), [4, 'ACGT'])
    # Target shorter than query: empty frame that still has both columns
    def test03(self): ae(RankHD(sQry='ACTGTCT', sTgt='ACGT').shape, (0, 2))
    # Module-level seeded sequences: 100-character query against 200-character target
    def test04(self): ae(RankHD(sQry, sTgt).shape, (101, 2))
    def test05(self): ae(RankHD(sQry, sTgt).iloc[0,0], 60)
    def test06(self): ae(RankHD(sQry, sTgt).iloc[-1,0], 88)
    def test07(self): ae(RankHD(sQry, sTgt).iloc[0,1][:10], 'A_CATT__CT')
    def test08(self): ae(RankHD(sQry, sTgt).iloc[-1,1][:10], 'ATCATTTTCT')


Ran 9 tests in 0.031s

[1m[34mOK[0m
test00 (__main__.Test_RankHD) ... [1m[34mok[0m
test01 (__main__.Test_RankHD) ... [1m[34mok[0m
test02 (__main__.Test_RankHD) ... [1m[34mok[0m
test03 (__main__.Test_RankHD) ... [1m[34mok[0m
test04 (__main__.Test_RankHD) ... [1m[34mok[0m
test05 (__main__.Test_RankHD) ... [1m[34mok[0m
test06 (__main__.Test_RankHD) ... [1m[34mok[0m
test07 (__main__.Test_RankHD) ... [1m[34mok[0m
test08 (__main__.Test_RankHD) ... [1m[34mok[0m

----------------------------------------------------------------------

