# Part 5: Modified Perceptron

### Data Preprocessing 

In [1]:
import numpy as np
import pandas as pd
import time

# Function to create lowercase training set
def createCleanTrain(language, file):
    '''
    language = 'CN' , 'EN', 'FR', 'SG'
    '''
    
    original = open('./' + language + file, "r", encoding='utf8')
    output = open('./' + language + file + "_clean", "w+", encoding='utf8')

    
    # Import training data
    with original as f, output as g:
            
        for lf in f:
           # if line is not empty
            if lf != "\n":
                try:
                    o, s = lf.strip().split(" ")
                except:
                    o = lf.strip().split(" ")[0]
                    s = None
                    
                new_o = o.lower()
                if s is None:
                    g.write(new_o + "\n")
                else:
                    g.write(" ".join([new_o, s]) + "\n")
                    
            # else if line if empty
            else:
                g.write(lf)
                
        f.close()
        g.close()

In [2]:
# Function to create lowercase dev.in
def createCleanDevIn(language, file):
    '''
    language = 'CN' , 'EN', 'FR', 'SG'
    '''
    
    original = open('./' + language + file, "r", encoding='utf8')
    output = open('./' + language + file[0:4] + "_clean.in", "w", encoding='utf8')

    
    # Import training data
    with original as f, output as g:
            
        for lf in f:
           # if line is not empty
            if lf != "\n":
                try:
                    o, s = lf.strip().split(" ")
                except:
                    o = lf.strip().split(" ")[0]
                    s = None
                    
                new_o = o.lower()
                if s is None:
                    g.write(new_o + "\n")
                else:
                    g.write(" ".join([new_o, s]) + "\n")
                    
            # else if line if empty
            else:
                g.write(lf)
                
        f.close()
        g.close()

In [3]:
# Create dataframe
def createDf(language,file):
    '''
    language = 'CN' , 'EN', 'FR', 'SG'
    '''
    tweets = []          # A list of all the tweets (Each tweet is a list)
    word_count, tweet_count = 0, 0
    # Import training data
    with open('./' + language + file, encoding='utf8') as f:
        training_lines = f.readlines()

        # For each line in the file
        for line in training_lines: 

            # If line is empty (i.e. we enter a new tweet)
            if line in['\n', '\r\n']: # Initialize a new tweet, reset word count
                if word_count != 0: #If the previous tweet was not empty, increase tweet count
                    tweet_count += 1
                word_count = 0

            else:
                # Remove the spaces in each line
                stripped = line.strip().split(" ")
                if len(stripped) == 2:
                    if word_count == 0:
                        tweets.append([tweet_count, word_count,'None','Start'])
                        word_count += 1
                    tweets.append([tweet_count, word_count] + stripped)
                    word_count += 1
    df = pd.DataFrame(tweets,columns=['Tweet', 'Word', 'Observation', 'State'])
    df = df.set_index(['Tweet', 'Word'])
    print('Training dataframe created.')
    return df

In [4]:
# Create dev.in dataframe
def createDfDevin(language,file):
    '''
    language = 'CN' , 'EN', 'FR', 'SG'
    '''
    tweets = []          # A list of all the tweets (Each tweet is a list)
    word_count, tweet_count = 0, 0
    # Import training data
    with open('./' + language + file, encoding='utf8') as f:
        training_lines = f.readlines()

        # For each line in the file
        for line in training_lines: 

            # If line is empty (i.e. we enter a new tweet)
            if line in['\n', '\r\n']: # Initialize a new tweet, reset word count
                if word_count != 0: #If the previous tweet was not empty, increase tweet count
                    tweet_count += 1
                word_count = 0

            else:
                # Remove the spaces in each line
                stripped = line.strip().split(" ")
                if len(stripped) == 1:
                    if word_count == 0:
                        tweets.append([tweet_count, word_count,'None'])
                        word_count += 1
                    tweets.append([tweet_count, word_count] + stripped)
                    word_count += 1
    df = pd.DataFrame(tweets,columns=['Tweet', 'Word', 'Observation'])
    df = df.set_index(['Tweet', 'Word'])
    print('Testing dataframe created.')
    return df

#### Functions for data manipulation

In [5]:
def getTweet(df, tweetNumber):
    """
    Inputs:
    df: dataframe of all tweets
    tweetNumber: which tweet to access and extract

    Output:
    obs_list: list of observations for a specified tweet 
    """
    df_resetindex = df.reset_index()
    tweet_df = df_resetindex.loc[df_resetindex['Tweet'] == tweetNumber]
    
    # Convert tweet dataframe to a list
    tweet_list = tweet_df.values.T.tolist()
    
    # Append a None at the end of observation to account for 'Stop' state
    obs_list = tweet_list[2]
    obs_list.append('None')
    
    return obs_list

In [6]:
def getTweetLabel(df, tweetNumber):
    """
    Inputs:
    df: dataframe of all tweets
    tweetNumber: which tweet to access and extract

    Output:
    obs_list: list of observations for a specified tweet 
    """
    df_resetindex = df.reset_index()
    tweet_df = df_resetindex.loc[df_resetindex['Tweet'] == tweetNumber]
    
    # Convert tweet dataframe to a list
    tweet_list = tweet_df.values.T.tolist()
    
    # Append a None at the end of observation to account for 'Stop' state
    obs_list = tweet_list[3]
    
    return obs_list[1:]

In [7]:
def Count_State(df):
    '''
    Get Count(i) and Count(j)
    '''
    states_counter = df.groupby('State').count()
    return states_counter

### Obtain Emission Probabilities

In [8]:
def Count_Emission(df):
    df = df.copy()
    df["Count"] = 1
    
    count_emission = df.groupby(['State','Observation'],).count().reset_index(level = 'Observation')
    count_emission = count_emission.join(Count_State(df), rsuffix = '_State')
    count_emission = count_emission.drop('Observation_State',axis=1)    
    count_emission["emission"] = count_emission['Count'] / count_emission['Count_State']
    count_emission = count_emission.drop('Count_State',axis=1)
    print("Emission Dataframe created.")
    return count_emission

def Replace_With_Unk(df, k):
    emission_count = df.copy()
    drop_table = emission_count.groupby(['Observation'],).sum()
    drop_table = drop_table.loc[drop_table['Count'] < k].reset_index()
    emission_count['Observation'].loc[emission_count['Observation'].isin(drop_table['Observation'])] = '#UNK#'
    emission_count = emission_count.groupby(['State','Observation'],).sum()
    
    return emission_count

### (5 pts) Write a function that estimates the transition parameters from the training set using MLE (maximum likelihood estimation):

#### Please make sure the following special cases are also considered: q(STOP|yn) and q(y1|START).

<img src="images/mle2.jpg">

In [9]:
def Count_Transistion(df):
    transistion = df.copy()
    transistion['J'] = transistion['State']
    transistion['J'] = transistion['J'].shift(-1)
    transistion['J'].loc[transistion['J'] == 'Start'] = 'Stop'
    transistion['J'].loc[pd.isnull(transistion['J'])] = 'Stop'
    count_transistion = transistion.groupby(['State','J']).count()
    
    #Create Full table of transistion permutations
    states = Count_State(df).reset_index().as_matrix()[:-1,0]
    length = states.shape[0] + 1
    start = np.reshape(np.concatenate((['Start'],states)),(1,-1))
    Stop = np.reshape(np.concatenate((states,['Stop'])),(1,-1))
    states = np.vstack((np.repeat(start,length),np.ravel(np.repeat(Stop,length,axis=0)))).T
    states = pd.DataFrame(states, columns=['State','J'])
    states['Observation'] = 0
    states = states.set_index(['State','J'])
    count_transistion = states.join(count_transistion, how= 'left', lsuffix= '2').drop('Observation2', axis = 1).fillna(0)
    
    #Compute transistion probabilities
    count_transistion = count_transistion.join(Count_State(df), lsuffix='_trans', rsuffix='_state')
    count_transistion['aij'] = count_transistion['Observation_trans'] / count_transistion['Observation_state']
    print("Transistion Dataframe created.")
    
    return count_transistion

In [10]:
def Viterbi_W (global_trigram, global_weights, tweet, df, word_list, count_emission, count_transistion, order, prob_max=None, arg_max=None, level = 1, train = True):
    '''
    @@@This Viterbi is modified for weights@@@
    tweet    : Tweet formatted into list of words
    prob_max : Max probability of layer
    arg_max  : Max argument of layer
    level    : Current layer
    '''
    # Base case
    if level == 1:
        pi = np.ones(len(order))
        transmission = [count_transistion.get(key) for key in [('Start',next_state) for next_state in order]]
        emission = [0 if i is None else i for i in [count_emission.get(key)\
                                         for key in [(state,tweet[level]) for state in order]]]
        new_prob_max = pi + np.log(transmission) + np.log(emission) #elementwise log addition
        new_arg_max = np.full((len(order)),'Start')
        new_prob_max += getWordWeights(tweet[level], order, global_weights) #update with global weights

    else:
        new_prob_max = []
        new_arg_max = []
        pi = prob_max
        
        # Final case (len(new_argmax) = len(new_max) = 1)
        if level == tweet.shape[0]-1:
            transmission = [count_transistion.get(key) for key in [(prev_state,'Stop') for prev_state in order]]
            emission = np.ones(len(order)) #Emission in STOP state = 1
            intermediate = pi + np.log(transmission) + np.log(emission) #elementwise log addition
                        
            new_prob_max = intermediate.max()
            new_arg_max = order[intermediate.argmax()] #Best previous state
        
        # Recursive case (len(new_argmax) = len(new_max) = # of possible states)
        else:
            emission_all = [0 if i is None else i for i in [count_emission.get(key)\
                             for key in [(state,tweet[level]) for state in order]]]
            for i in range(len(order)):
                transmission = [count_transistion.get(key) for key in [(prev_state,order[i]) for prev_state in order]]
                emission = np.full(7,emission_all[i])
                intermediate = pi + np.log(transmission) + np.log(emission) #elementwise log addition

                new_prob_max.append(intermediate.max())
                new_arg_max.append(order[intermediate.argmax()]) #Best previous state
            
            new_prob_max = np.asarray(new_prob_max)
            new_arg_max = np.asarray(new_arg_max)
            
            new_prob_max += getWordWeights(tweet[level], order, global_weights) #update with global weights
            
            if level >= 3:
                state_trans_a = {a_:b_ for a_,b_ in zip(order,arg_max)}
                state_trans_b = {a_:b_ for a_,b_ in zip(order,new_arg_max)}
                tristate_list = []
                for c in list(state_trans_b.keys()):
                    b = state_trans_b[c]
                    a = state_trans_a[b]
                    tristate_list.append([a,b,c])
                new_prob_max += getTrigramWeights(tristate_list, global_trigram)
            
    # Final case
    if level == tweet.shape[0]-1: #Recursion termination (reached nth term)
        return ([arg_max[np.where(order == new_arg_max)[0].item()],new_arg_max])
    
    else: #Need further recursion, increment level(layer)
        
        #Forward Propagation
        path = Viterbi_W (global_trigram, global_weights, tweet, df, word_list, count_emission, count_transistion, order, new_prob_max, new_arg_max, level+1, train)
        
        #Backward Propagation
        #When backward propagation finishes, append stop
        if len(np.where(order == path[0])[0]) == 0: 
            return path +['Stop']
        
        #If yet to finish, concat newly discovered state to path
        else:
            return [arg_max[np.where(order == path[0])[0].item()]] + path


#### Creation of Trigrams & tag-word pairs

In [11]:
# Create Trigram objects
def initTagTrigram(order):
    order_permute = np.array(np.meshgrid(order, order, order)).T.reshape(-1,3)
    return {' '.join(word): 0 for word in order_permute}

# Create tag-word pairs
def initTagWordPair(order, words):
    order_permute = np.array(np.meshgrid(order, words)).T.reshape(-1,2)
    tagWordPair = {' '.join(word): 0 for word in order_permute}
    return tagWordPair

#### Updating of Trigrams & tag-word pairs

In [12]:
# Update trigrams
def updateTagTrigram(actual_states, predicted_states, tagTrigram, learning_rate = 1):
    assert(len(predicted_states) == len(actual_states)), "Actual and predicted states are of different length"
    
    # For actual states
    for i in range(len(actual_states) - 2):
        trigram = actual_states[i:i+3]
        trigram = ' '.join(trigram)
        tagTrigram[trigram] += learning_rate
    
    # For predicted states
    for i in range(len(predicted_states) - 2):
        trigram = predicted_states[i:i+3]
        trigram = ' '.join(trigram)
        tagTrigram[trigram] -= learning_rate
    return tagTrigram


def updateTagWordPair(tweet, actual_states, predicted_states, tagWordPair, learning_rate = 1):
    assert(len(actual_states) == len(predicted_states) == len(tweet)),"Length of states/words are not equal %d, %d, %d"
    
    # iterate through each actual state and words
    actual_pairs = ["{} {}".format(a_, b_) for a_, b_ in zip(actual_states, tweet)]

    # iterate through each predicted state and words
    predicted_pairs = ["{} {}".format(a_, b_) for a_, b_ in zip(predicted_states, tweet)]

    # Increment score by 1 if actual tagword pair
    for x in actual_pairs:
        tagWordPair[x] += learning_rate
        
    # Decrement score by 1 if actual tagword pair
    for x in predicted_pairs:
        tagWordPair[x] -= learning_rate
    
    return tagWordPair

#### Obtaining trigram weights & word weights

In [13]:
def getTrigramWeights(tristate_list, global_trigram):
    order_permute = [" ".join(x) for x in tristate_list]
    return np.asarray([global_trigram[key] for key in order_permute])


def getWordWeights(word, order, weights):
    order_permute = ["{} {}".format(a_, b_) for a_, b_ in zip(order, [word]*7)]
    return np.asarray([weights[key] for key in order_permute])

### Modified Perceptron

In [14]:
def Perceptron(tweet, label, global_trigram, global_weights, df, word_list, count_emission, count_transistion, order, iteration=20, trigram_rate=1, word_rate=1):
    tweet = tweet.tolist()
    for i in range(1,len(tweet)-1):
        if tweet[i] not in word_list: #These are Start Stop states containing None
            tweet[i] = '#UNK#'
    tweet = np.asarray(tweet)
    count=0
    for i in range(iteration):
        predicted_label = Viterbi_W(global_trigram, global_weights, tweet, df, word_list, count_emission, count_transistion, order)[1:-1]
        count= count + 1
        if not np.array_equal(np.asarray(predicted_label), label):
            global_trigram = updateTagTrigram(label, predicted_label, global_trigram, trigram_rate)
            global_weights = updateTagWordPair(tweet[1:-1], label, predicted_label, global_weights, word_rate)
        else:
            break
    return global_trigram, global_weights


def TrainPerceptron(lang):
    np.seterr(divide='ignore') #Ignore log zero warnings
    createCleanTrain(lang, '/train')
    createCleanDevIn(lang, '/dev.in')
    df = createDf(lang,'/train_clean')
    df_test = createDfDevin(lang,'/dev_clean.in')
    
    count_state = Count_State(df)
    count_emission = Replace_With_Unk(Count_Emission(df), 2)
    word_list = count_emission.reset_index().Observation.values
    count_transistion = Count_Transistion(df)

    count_transistion_swap = count_transistion.swaplevel(i = 'State', j = 'J').sort_index()
    order = Count_State(df).reset_index().as_matrix()[:-1,0]
    
    count_state = count_state.to_dict()['Observation']
    count_emission = count_emission.to_dict()['emission']
    count_transistion = count_transistion.to_dict()['aij']
    count_transistion_swap = count_transistion_swap.to_dict()['aij']
    
    global_weights = initTagWordPair(order, word_list)
    global_trigram = initTagTrigram(order)
    print('Training Perceptron...')
    df_train_size = df_test.reset_index().Tweet.max()+1
    s_time = time.time()
    
    # Use training set to train Perceptron
    for x in range(df_train_size):
        if x%100==0:
            print('Tweet number:',x,'time:',time.time()-s_time)
        tweet_x = np.asarray(getTweet(df,x))
        tweet_y = np.asarray(getTweetLabel(df, x))
        parameters = {'EN':{'iteration':200 , 'trigram_rate':0.00001, 'word_rate':0.0001 },'FR':{'iteration':50 , 'trigram_rate':0.001, 'word_rate':0.00001}}

        global_trigram, global_weights = Perceptron(tweet_x, tweet_y, global_trigram, global_weights, df, word_list, count_emission, count_transistion, order, iteration=parameters.get(lang).get('iteration'), trigram_rate=parameters.get(lang).get('trigram_rate'), word_rate=parameters.get(lang).get('word_rate'))
    
    # Use dev.in to predict the labels
    df_test_size = df_test.reset_index().Tweet.max()+1
    predictions = []
    pred_list = []
    s_time = time.time()
    
    # Iterations for prediction
    print('Predicting...')
    for x in range(df_test_size):
        if x%100==0:
            print('Tweet number:',x,'time:',time.time()-s_time)
        tweet = np.asarray(getTweet(df_test,x))
        tweet = tweet.tolist()
        for i in range(1,len(tweet)-1):
            if tweet[i] not in word_list: #These are Start Stop states containing None
                tweet[i] = '#UNK#'
        tweet = np.asarray(tweet)
        observations = Viterbi_W(global_trigram, global_weights, tweet, df, word_list, count_emission, count_transistion, order, train = False)[1:-1]
        predictions = predictions + ["{} {}\n".format(a_, b_) for a_, b_ in zip(tweet[1:-1],observations)] + ['\n']

        pred_list = pred_list + [x for x in zip(tweet[1:-1],observations)]
        
    with open('%s/dev.p5.out'%lang, 'w',encoding='utf8') as f:
        f.write(''.join(predictions))
        print("Saved.")
    return global_weights, global_trigram, predictions, pred_list

In [16]:
def __main__(language):
    global_weights, global_trigram, predictions, pred_list = TrainPerceptron(language)
    
language = 'FR'
__main__(language)

Training dataframe created.
Testing dataframe created.
Emission Dataframe created.
Transistion Dataframe created.
Training Perceptron...
Tweet number: 0 time: 0.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Tweet number: 100 time: 15.412178039550781
Tweet number: 200 time: 31.302725791931152
Predicting...
Tweet number: 0 time: 0.0009999275207519531
Tweet number: 100 time: 0.799354076385498
Tweet number: 200 time: 1.4084093570709229
Saved.
