# Problem Statement
#We learnt to build our own HMM-based POS tagger and implement the Viterbi algorithm using the Penn Treebank training
#corpus. The vanilla Viterbi algorithm we had written had resulted in ~87% accuracy. The approx. 13% loss of accuracy 
#was majorly due to the fact that when the algorithm encountered an unknown word(i.e. not present in the training set, such as #'Twitter'), it assigned an incorrect tag arbitrarily.
#This is because, for unknown words, the emission probabilities for all candidate tags are 0, so the algorithm arbitrarily
#chooses (the first) tag.
# In this assignment, we need to modify the Viterbi algorithm to solve the problem of unknown words using at least two techniques. Though there could be multiple ways to solve this problem,following hints can be considered:
#Find tag class that most unknown words belong to.Then identify rules (e.g. based on morphological cues) that can be used to tag unknown words?
#Viterbi algorithm chooses a random tag on encountering an unknown word.Modify the Viterbi algorithm so that it considers only one of the transition or emission probabilities for unknown words

## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing required libraries
import nltk
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize


In [2]:
#reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# looking at the first 10 tagged sentences
print(nltk_data[:1])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]]


In [4]:
#Splitting the corpus into train and test,checking the lenght of each set and printing the first 10 sentences in train set
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))


3718
196


In [8]:
train_set

[[('Tokyo', 'NOUN'),
  ('stocks', 'NOUN'),
  ('edged', 'VERB'),
  ('up', 'ADP'),
  ('Wednesday', 'NOUN'),
  ('in', 'ADP'),
  ('relatively', 'ADV'),
  ('active', 'ADJ'),
  ('but', 'CONJ'),
  ('unfocused', 'ADJ'),
  ('trading', 'NOUN'),
  ('.', '.')],
 [('AN', 'DET'),
  ('EXCHANGE', 'NOUN'),
  ('of', 'ADP'),
  ('U.S.', 'NOUN'),
  ('and', 'CONJ'),
  ('Soviet', 'ADJ'),
  ('designers', 'NOUN'),
  ('promises', 'VERB'),
  ('change', 'NOUN'),
  ('on', 'ADP'),
  ('both', 'DET'),
  ('sides', 'NOUN'),
  ('.', '.')],
 [('As', 'ADP'),
  ('it', 'PRON'),
  ('stands', 'VERB'),
  ('now', 'ADV'),
  (',', '.'),
  ('Congress', 'NOUN'),
  ('presumes', 'VERB'),
  ('after', 'ADP'),
  ('the', 'DET'),
  ('Reagan', 'NOUN'),
  ('administration', 'NOUN'),
  ('that', 'ADP'),
  ('the', 'DET'),
  ('White', 'NOUN'),
  ('House', 'NOUN'),
  ('will', 'VERB'),
  ('take', 'VERB'),
  ('unconstitutional', 'ADJ'),
  ('provisions', 'NOUN'),
  ('in', 'ADP'),
  ('appropriations', 'NOUN'),
  ('bills', 'NOUN'),
  ('*-1', 'X'),
  

In [9]:
# Get the list of tagged words from train set
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95659

In [10]:
# Print the list of tagged words from train set.
train_tagged_words[:5]

[('Tokyo', 'NOUN'),
 ('stocks', 'NOUN'),
 ('edged', 'VERB'),
 ('up', 'ADP'),
 ('Wednesday', 'NOUN')]

In [11]:
# Getting tokens for tagged words from train set 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

['Tokyo',
 'stocks',
 'edged',
 'up',
 'Wednesday',
 'in',
 'relatively',
 'active',
 'but',
 'unfocused']

In [12]:
# printing the length ofvocabulary
V = set(tokens)
print(len(V))

12090


In [13]:
# Printing the count of number of tags
T = set([pair[1] for pair in train_tagged_words])
len(T)

12

In [14]:
# printing the 12 tags that are their in the train set
print(T)

{'ADP', '.', 'PRT', 'ADJ', 'CONJ', 'DET', 'VERB', 'X', 'ADV', 'NUM', 'PRON', 'NOUN'}


### Build the vanilla Viterbi based POS tagger

In [15]:
# computing P(w/t) and storing in T x V matrix
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

In [16]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [17]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [18]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [19]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))
tags_df

Unnamed: 0,ADP,.,PRT,ADJ,CONJ,DET,VERB,X,ADV,NUM,PRON,NOUN
ADP,0.017112,0.039572,0.00139,0.10492,0.000856,0.325775,0.008235,0.034011,0.013476,0.064278,0.069091,0.321283
.,0.090247,0.093213,0.002247,0.044584,0.058337,0.173753,0.089258,0.026787,0.053303,0.081258,0.066067,0.220854
PRT,0.019993,0.043592,0.001967,0.08489,0.001967,0.100623,0.402491,0.01311,0.009833,0.057686,0.018682,0.245166
ADJ,0.077545,0.065006,0.011054,0.065831,0.016994,0.00495,0.011879,0.021119,0.00495,0.020954,0.00066,0.69906
CONJ,0.052729,0.03469,0.005088,0.117484,0.000463,0.121184,0.158649,0.008326,0.054117,0.041628,0.059204,0.346438
DET,0.009265,0.018409,0.000241,0.204548,0.000481,0.005655,0.039947,0.045602,0.012634,0.022019,0.003369,0.637829
VERB,0.091206,0.03496,0.031386,0.064947,0.005516,0.134789,0.168505,0.217915,0.081805,0.022685,0.035193,0.111094
X,0.145018,0.162846,0.185291,0.017351,0.010506,0.055078,0.202802,0.075135,0.025629,0.002865,0.055078,0.062401
ADV,0.117355,0.133554,0.013554,0.130248,0.006942,0.069421,0.344793,0.023471,0.08,0.032397,0.015868,0.032397
NUM,0.034523,0.116849,0.027147,0.033048,0.013278,0.003246,0.017999,0.208911,0.002951,0.18324,0.00118,0.357628


In [20]:
#Viterbi Algorithm

In [21]:
len(train_tagged_words)

95659

In [22]:
#Evaluating on Test Set

# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,196) for x in range(3)]
##rndom = list(range(1,196))
# list of sents
test_run = [test_set[i] for i in rndom]
##test_run = [test_set]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
##test_tagged_words



In [23]:

# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            count_w_given_tag, count_tag = word_given_tag(words[key], tag)
            emission_p = count_w_given_tag/count_tag
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))
       
    #return list(state_max)

In [24]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [25]:
print("Time taken in seconds: ", difference)
print(tagged_seq)
#print(test_run_base)

Time taken in seconds:  7.288452625274658
[('The', 'DET'), ('government', 'NOUN'), ('reported', 'VERB'), ('that', 'ADP'), ('orders', 'NOUN'), ('for', 'ADP'), ('manufactured', 'VERB'), ('goods', 'NOUN'), ('were', 'VERB'), ('essentially', 'ADV'), ('unchanged', 'ADJ'), ('in', 'ADP'), ('September', 'NOUN'), ('while', 'ADP'), ('construction', 'NOUN'), ('spending', 'NOUN'), ('was', 'VERB'), ('slightly', 'ADV'), ('lower', 'ADJ'), ('.', '.'), ('The', 'DET'), ('Czech', 'ADP'), ('dam', 'NOUN'), ('ca', 'VERB'), ("n't", 'ADV'), ('be', 'VERB'), ('operated', 'VERB'), ('*-69', 'X'), ('solely', 'ADP'), ('at', 'ADP'), ('peak', 'NOUN'), ('periods', 'NOUN'), ('without', 'ADP'), ('the', 'DET'), ('Nagymaros', 'NOUN'), ('project', 'NOUN'), ('.', '.'), ('Factory', 'NOUN'), ('payrolls', 'NOUN'), ('fell', 'VERB'), ('in', 'ADP'), ('September', 'NOUN'), ('.', '.')]


In [26]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
print(len(check))
print(len(tagged_seq))
accuracy

41
43


0.9534883720930233

In [27]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('The', 'DET'), (('Czech', 'ADP'), ('Czech', 'ADJ'))],
 [('*-69', 'X'), (('solely', 'ADP'), ('solely', 'ADV'))]]

In [28]:
def unigram(test):
    unigram_tagger = nltk.UnigramTagger(train_set)
    accuracy = unigram_tagger.evaluate(test)
    return accuracy

In [29]:
def morph(test):
    #Rule-Based (Regular Expression) Tagger¶
#Rule-Based (Regular Expression) Tagger¶
# specify patterns for tagging
    patterns = [
        (r'.*ing$', 'VERB'),              # gerund
        (r'.*ed$', 'VERB'),               # past tense
        (r'.*es$', 'VERB'),               # 3rd singular present
        (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
        (r'^[A-Z]','NOUN'),
        (r'.*', 'NOUN')                    # nouns
        ]

    rule_based_tagger = nltk.RegexpTagger(patterns)

# lexicon backed up by the rule-based tagger
    lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

# Bigram backed up by the lexicon tagger
    bigram_tagger = nltk.BigramTagger(train_set, backoff=lexicon_tagger)

    accuracy = bigram_tagger.evaluate(test)

    return accuracy

In [49]:
# Viterbi Heuristic
def Viterbi_Mod2(words,sentence,train_bag = train_tagged_words):
    state = []
    tester = []
    unigram_tagger = nltk.UnigramTagger(train_set)
    T = list(set([pair[1] for pair in train_bag]))


    for key, word in enumerate(words):
        
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            count_w_given_tag, count_tag = word_given_tag(words[key], tag)
            emission_p = count_w_given_tag/count_tag
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
        tagged = list(zip(words, state))
        check = [i for i, j in zip(tagged, test_run_base) if i == j] 
        accuracy = len(check)/len(tagged)
    
   
    unigram_accuracy = unigram(sentence)
        
    morph_accuracy = morph(sentence)            
    return (accuracy,unigram_accuracy,morph_accuracy)

    #return list(state_max)

In [50]:
# tagging the test sentences
start = time.time()
tagged_seq2 = Viterbi_Mod2(test_tagged_words,test_set)
end = time.time()
difference = end-start

In [51]:
tagged_seq2

(0.9534883720930233, 0.9093083516045446, 0.9535579031293602)

# ATUL - I ahve called the three seperately below and it works.BUt how do I call then from inside the viterbi function

### Solve the problem of unknown words

In [33]:
#3.2 Lexicon (Unigram) Tagger
#Let's now try training a lexicon (or a unigram) tagger which assigns the most commonly assigned tag to a word.

#In NLTK, the UnigramTagger() can be used to train such a model.

In [34]:
# Lexicon (or bigram tagger)
unigram_tagger = nltk.UnigramTagger(train_set)
unigram_tagger.evaluate(test_set)

0.9093083516045446

In [35]:
# Lexicon (or bigram tagger)
bigram_tagger = nltk.BigramTagger(train_set)
bigram_tagger.evaluate(test_set)

0.2154674108032689

In [36]:
#Rule-Based (Regular Expression) Tagger¶
#Rule-Based (Regular Expression) Tagger¶
# specify patterns for tagging
patterns = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'.*es$', 'VERB'),               # 3rd singular present
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
    (r'^[A-Z]','NOUN'),
    (r'.*', 'NOUN')                    # nouns
    
]

In [37]:
regexp_tagger = nltk.RegexpTagger(patterns)
# help(regexp_tagger)

In [38]:
regexp_tagger.evaluate(test_set)

0.3322702810444489

In [39]:
#3.4 Combining Taggers
#Let's now try combining the taggers created above. We saw that the rule-based tagger by itself is quite ineffective since we've only written a handful of rules. However, if we could combine the lexicon and the rule-based tagger, we can potentially create a tagger much better than any of the individual ones.

#NLTK provides a convenient way to combine taggers using the 'backup' argument. In the following code, we create a regex tagger which is used as a backup tagger to the lexicon tagger, i.e. when the tagger is not able to tag using the lexicon (in case of a new word not in the vocabulary), it uses the rule-based tagger.

#Also, note that the rule-based tagger itself is backed up by the tag 'NN'.

In [40]:
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)

# lexicon backed up by the rule-based tagger
lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

# Bigram backed up by the lexicon tagger
bigram_tagger = nltk.BigramTagger(train_set, backoff=lexicon_tagger)

bigram_tagger.evaluate(test_set)

0.9535579031293602

#### Evaluating tagging accuracy

In [41]:
## Preparing data for the test file and its sentences
sentence_test ='Android is a mobile operating system developed by Google.Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.Google and Twitter made a deal in 2015 that gave Google access to Twitter firehose.Twitter is an online news and social networking service on which users post and interact with messages known as tweets.Before entering politics, Donald Trump was a domineering businessman and a television personality.The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.Show me the cheapest round trips from Dallas to Atlanta.I would like to see flights from Denver to Philadelphia.Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.NASA invited social media users to experience the launch of ICESAT-2 Satellite.'
sentence_test_split = sentence_test.split()
words = word_tokenize(sentence_test)

In [42]:
words

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google.Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013.Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 'Twitter',
 'firehose.Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets.Before',
 'entering',
 'politics',
 ',',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality.The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup',
 ',',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years.This',
 'is',
 'the',
 'first',
 'World',
 'Cup',
 'to',
 'be',

In [43]:
text= "Android is a mobile operating system developed by Google.Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.Twitter is an online news and social networking service on which users post and interact with messages known as tweets.Before entering politics, Donald Trump was a domineering businessman and a television personality.The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.Show me the cheapest round trips from Dallas to Atlanta I would like to see flights from Denver to Philadelphia.Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.NASA invited social media users to experience the launch of ICESAT-2 Satellite."

text      

"Android is a mobile operating system developed by Google.Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.Twitter is an online news and social networking service on which users post and interact with messages known as tweets.Before entering politics, Donald Trump was a domineering businessman and a television personality.The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.Show me the cheapest round trips from Dallas to Atlanta I would like to see flights from Denver to Philadelphia.Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.NASA invited social media users to experience the launch of ICESAT-2 Satellite."

In [44]:
text_test_split = text.split()
text_test_split

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google.Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013.Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 "Twitter's",
 'firehose.Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets.Before',
 'entering',
 'politics,',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality.The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup,',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years.This',
 'is',
 'the',
 'first',
 'World',
 'Cup',
 'to',
 'be',
 'held'

In [45]:
from nltk.tokenize import word_tokenize
text_words = word_tokenize(text)
text_words

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google.Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013.Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 'Twitter',
 "'s",
 'firehose.Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets.Before',
 'entering',
 'politics',
 ',',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality.The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup',
 ',',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years.This',
 'is',
 'the',
 'first',
 'World',
 'Cup',
 'to',

In [46]:
# tagging the test sentences
start = time.time()
tagged1_seq = Viterbi(text_words)
end = time.time()
difference = end-start

In [47]:
tagged1_seq

[('Android', 'ADP'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('Google.Android', 'ADP'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'ADP'),
 ('worldwide', 'ADP'),
 ('on', 'ADP'),
 ('smartphones', 'ADP'),
 ('since', 'ADP'),
 ('2011', 'ADP'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013.Google', 'ADP'),
 ('and', 'CONJ'),
 ('Twitter', 'ADP'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'ADP'),
 ('that', 'DET'),
 ('gave', 'VERB'),
 ('Google', 'ADP'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'ADP'),
 ("'s", 'PRT'),
 ('firehose.Twitter', 'ADP'),
 ('is', 'VERB'),
 ('an', 'DET'),
 ('online', 'ADP'),
 ('news', 'NOUN'),
 ('and', 'CONJ'),
 ('social', 'ADJ'),
 ('networking', 'NOUN'),
 ('service', 'NOUN'),
 ('on', 'ADP'),
 ('which', 'DET'),
 ('users', 'NOUN'),
 ('post', 'NOUN'),

In [48]:
start = time.time()
tagged1_seq2 = Viterbi_Mod2(text_test_split,text_test_split)
end = time.time()
difference = end-start

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
tagged1_seq2