In [1]:
# Sample sentence that the tokenisation and n-gram cells below operate on
text = "We can see the shining sun."

### Bi-grams using NLTK

In [3]:
import nltk
import string
from nltk import word_tokenize

# Tokenise the sentence, skipping any token that is exactly one punctuation character
punctuation_marks = list(string.punctuation)
token = []
for word in word_tokenize(text):
    if word in punctuation_marks:
        continue
    token.append(word)

# Pair each remaining token with its successor; list() is applied so the pairs get printed
bigrams = nltk.bigrams(token)
print(list(bigrams))

[('We', 'can'), ('can', 'see'), ('see', 'the'), ('the', 'shining'), ('shining', 'sun')]


### Ngrams using NLTK

In [6]:
from nltk.util import ngrams

n = 3  # size of each n-gram (3 -> trigrams)
token = word_tokenize(text)

# Store the result under its own name. Assigning it back to `ngrams` would
# overwrite the function imported from nltk.util, so any later call to
# ngrams(...) would fail until the import is executed again.
token_ngrams = ngrams(token, n)
print(list(token_ngrams))

[('We', 'can', 'see'), ('can', 'see', 'the'), ('see', 'the', 'shining'), ('the', 'shining', 'sun'), ('shining', 'sun', '.')]


In [7]:
print(text)
token = word_tokenize(text)
n = 3  # n for ngrams

# Slide a window of width n across the tokens: one tuple per start position.
# There are len(token) - n + 1 full windows; if n exceeds the number of tokens
# the range is empty and the result is an empty list.
# The result is deliberately NOT named `ngrams`, so the function imported from
# nltk.util is not replaced by a plain list.
manual_ngrams = [tuple(token[start:start + n]) for start in range(len(token) - n + 1)]
print(manual_ngrams)

We can see the shining sun.
[('We', 'can', 'see'), ('can', 'see', 'the'), ('see', 'the', 'shining'), ('the', 'shining', 'sun'), ('shining', 'sun', '.')]


### Calculating precision, recall and F1 score using the intersection/length method

In [8]:
def prec_rec_f1_method1(ref, res):
    """Compute precision, recall and F1 from the overlap of two collections.

    Args:
        ref: the reference (relevant) items.
        res: the result (retrieved) items.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Any score whose
        denominator would be zero (empty ``ref``, empty ``res``, or no items
        in common) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    # Number of distinct items that appear in both collections
    intersect = len(set(ref).intersection(res))

    recall = intersect / len(ref) if len(ref) else 0.0
    precision = intersect / len(res) if len(res) else 0.0

    # recall + precision is zero exactly when there is no overlap at all
    denominator = recall + precision
    f1 = 2 * recall * precision / denominator if denominator else 0.0

    return (precision, recall, f1)


# Run the same three reference/result pairs through method 1, one printout per pair.
# After the loop, `ref` and `res` hold the last pair, as they did with the
# original sequential assignments.
for ref, res in (
    ([1,2], [1,2,3]),
    ([1,2,4], [1,2]),
    ([1,3,5,6], [1,2,3,4,5]),
):
    print(prec_rec_f1_method1(ref, res))

(0.6666666666666666, 1.0, 0.8)
(1.0, 0.6666666666666666, 0.8)
(0.6, 0.75, 0.6666666666666665)


### Calculating precision, recall and F1 score using the true positives, false positives and false negatives method

In [9]:
def prec_rec_f1_method2(ref, res):
    """Compute precision, recall and F1 from TP / FP / FN counts.

    Prints the three counts before returning.

    Args:
        ref: the reference (relevant) items.
        res: the result (retrieved) items.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Any score whose
        denominator would be zero (nothing relevant, nothing retrieved, or no
        items in common) is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    # Build the sets once so each membership test below is a hash lookup
    # rather than a scan of the other list.
    ref_items = set(ref)
    res_items = set(res)

    # True positives: distinct items present in both collections
    tp = len(ref_items & res_items)
    # False negatives: reference entries that were not retrieved
    fn = sum(1 for i in ref if i not in res_items)
    # False positives: retrieved entries that are not in the reference
    fp = sum(1 for i in res if i not in ref_items)

    print('true positives = %d, false positives = %d, false negatives = %d'%(tp, fp, fn))

    recall = tp / (tp + fn) if (tp + fn) else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0

    # recall + precision is zero exactly when there are no true positives
    denominator = recall + precision
    f1 = 2 * recall * precision / denominator if denominator else 0.0

    return (precision, recall, f1)

# Run the same three reference/result pairs through method 2; each call prints
# its TP/FP/FN counts and then the resulting scores are printed.
# After the loop, `ref` and `res` hold the last pair, as they did with the
# original sequential assignments.
for ref, res in (
    ([1,2], [1,2,3]),
    ([1,2,4], [1,2]),
    ([1,3,5,6], [1,2,3,4,5]),
):
    print(prec_rec_f1_method2(ref, res))


true positives = 2, false positives = 1, false negatives = 0
(0.6666666666666666, 1.0, 0.8)
true positives = 2, false positives = 0, false negatives = 1
(1.0, 0.6666666666666666, 0.8)
true positives = 3, false positives = 2, false negatives = 1
(0.6, 0.75, 0.6666666666666665)


### Using NLTK

In [10]:
from nltk.metrics import precision, recall
ref = [1,3,5,6]
res = [1,2,3,4,5]

# The lists must be converted to sets before being passed to the NLTK metrics;
# do the conversion once and reuse it for both scores.
ref_set, res_set = set(ref), set(res)
print(precision(ref_set, res_set), recall(ref_set, res_set))

0.6 0.75


### Using sklearn

In [11]:
from sklearn.metrics import precision_score, recall_score

ref = [1,3,5,6,7,9]
res = [1,2,3,4,5,7,8]

# Build one boolean per document id, covering ids 0 .. max id in ref/res,
# so each list has length (max id + 1).
# Position i is True when document i is relevant (ref_binary) or was
# retrieved (res_binary).
# The length is derived from the data rather than hard-coded, so ids are not
# silently dropped when the example lists are edited.
n_docs = max(ref + res) + 1
ref_binary = [i in ref for i in range(n_docs)]
res_binary = [i in res for i in range(n_docs)]
print(ref_binary, res_binary, sep='\n')
# Argument order is (y_true, y_pred): the reference labels come first
print(precision_score(ref_binary, res_binary), recall_score(ref_binary, res_binary))

[False, True, False, True, False, True, True, True, False, True]
[False, True, True, True, True, True, False, True, True, False]
0.571428571429 0.666666666667


### Links
https://www.quora.com/What-is-the-difference-between-Precision-and-Recall