# Part 2

### Recall that the HMM discussed in class is defined as follows:

<img src="images/hmm_eqn.jpg">

#### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

def createDf(language):
    '''
    Load the labelled training file of one language into a DataFrame.

    language = 'CN' , 'EN', 'FR', 'SG' (the file read is ./<language>/train)

    Every line made of exactly two space-separated fields is kept as an
    (Observation, State) row; any other non-blank line is skipped. A blank
    line closes the current tweet, and a ('None', 'Start') row is placed in
    front of the first kept line of every tweet.

    Returns a DataFrame with columns 'Observation' and 'State', indexed by
    ('Tweet', 'Word').
    '''
    rows = []
    tweet_idx = 0
    position = 0  # index of the next word inside the current tweet

    with open('./' + language + '/train', encoding='utf8') as handle:
        for raw in handle:
            # A blank line marks a tweet boundary
            if raw == '\n' or raw == '\r\n':
                if position:
                    # Only tweets that actually held words advance the counter
                    tweet_idx += 1
                position = 0
                continue

            fields = raw.strip().split(" ")
            if len(fields) != 2:
                continue

            if not position:
                # First word of a tweet: emit the artificial Start row first
                rows.append([tweet_idx, position, 'None', 'Start'])
                position = 1
            rows.append([tweet_idx, position, *fields])
            position += 1

    frame = pd.DataFrame(rows, columns=['Tweet', 'Word', 'Observation', 'State'])
    return frame.set_index(['Tweet', 'Word'])

In [2]:
def createDfDevin(language, file):
    '''
    Load an unlabelled input file of one language into a DataFrame.

    language = 'CN' , 'EN', 'FR', 'SG'
    file = path suffix appended directly to './<language>', so it has to
           carry its own leading separator (e.g. '/dev.in')

    Every line made of exactly one space-free field is kept as an
    Observation row; lines containing a space are skipped. A blank line
    closes the current tweet, and a 'None' row is placed in front of the
    first kept line of every tweet.

    Returns a DataFrame with the single column 'Observation', indexed by
    ('Tweet', 'Word').
    '''
    rows = []
    tweet_idx = 0
    position = 0  # index of the next word inside the current tweet

    with open('./' + language + file, encoding='utf8') as handle:
        for raw in handle:
            # A blank line marks a tweet boundary
            if raw == '\n' or raw == '\r\n':
                if position:
                    # Only tweets that actually held words advance the counter
                    tweet_idx += 1
                position = 0
                continue

            fields = raw.strip().split(" ")
            if len(fields) != 1:
                continue

            if not position:
                # First word of a tweet: emit the artificial start row first
                rows.append([tweet_idx, position, 'None'])
                position = 1
            rows.append([tweet_idx, position, *fields])
            position += 1

    frame = pd.DataFrame(rows, columns=['Tweet', 'Word', 'Observation'])
    return frame.set_index(['Tweet', 'Word'])

In [3]:
def getTweet(df, tweetNumber):
    """
    Extract the observation sequence of a single tweet.

    Inputs:
    df: dataframe of all tweets, indexed by ('Tweet', 'Word')
    tweetNumber: value of the 'Tweet' index level to select

    Output:
    obs_list: the tweet's observations in row order, followed by a trailing
              'None' that stands for the 'Stop' state
    """
    flat = df.reset_index()
    selected = flat[flat['Tweet'] == tweetNumber]

    # After reset_index the two index levels come first, so the
    # observations sit in the third column.
    obs_list = selected.iloc[:, 2].tolist()
    obs_list.append('None')

    return obs_list

#### Obtain count of labels

In [4]:
def Count_State(df):
    '''
    Count how many rows carry each state label, i.e. Count(i) / Count(j).

    Returns a dataframe indexed by 'State' in which every remaining column
    of df holds the number of non-null entries for that state.
    '''
    rows_by_state = df.groupby('State')
    return rows_by_state.count()

In [5]:
def Count_Transistion(df, count_state):
    """
    Estimate the transition parameters a(i, j) = Count(i -> j) / Count(i).

    Inputs:
    df: dataframe of all tweets with a 'State' column; rows must be in
        reading order and every tweet must begin with a 'Start' row
    count_state: per-state counts indexed by 'State' with an 'Observation'
                 column (the table Count_State returns)

    Output:
    dataframe indexed by ('State', 'J') that covers every (from, to) pair,
    including pairs never seen in df, with the columns
    'Observation_trans' (Count(i -> j)), 'Observation_state' (Count(i))
    and 'aij' (their ratio)
    """
    transistion = df.copy()

    # J is the state of the following row. A following 'Start' means the
    # current tweet has ended, and the very last row has no successor at
    # all (NaN after the shift); both are transitions into 'Stop'.
    next_state = transistion['State'].shift(-1)
    ends_tweet = (next_state == 'Start') | next_state.isnull()
    transistion['J'] = next_state.mask(ends_tweet, 'Stop')
    count_transistion = transistion.groupby(['State', 'J']).count()

    # Full table of (from, to) permutations: 'Start' can only be left and
    # 'Stop' can only be entered. 'Start' is excluded by name instead of
    # relying on it sorting last among the labels.
    labels = [s for s in sorted(df['State'].dropna().unique()) if s != 'Start']
    all_pairs = pd.MultiIndex.from_product(
        [['Start'] + labels, labels + ['Stop']], names=['State', 'J'])
    # Unseen pairs come back as NaN from the reindex and count as zero.
    count_transistion = count_transistion.reindex(all_pairs).fillna(0)

    # Compute transistion probabilities
    count_transistion = count_transistion.join(count_state, lsuffix='_trans', rsuffix='_state')
    count_transistion['aij'] = count_transistion['Observation_trans'] / count_transistion['Observation_state']

    return count_transistion

### (5 pts) Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation):

<img src="images/MLE.jpg">

In [6]:
def Count_Emission(df):
    """
    Estimate the emission parameters e(x | y) = Count(y -> x) / Count(y).

    Input:
    df: dataframe of all tweets with 'State' and 'Observation' columns;
        it is left unmodified

    Output:
    dataframe indexed by 'State' with one row per (state, observation)
    pair and the columns 'Observation', 'Count' (Count(y -> x)) and
    'emission' (the MLE estimate)
    """
    # Work on a copy so the caller's dataframe does not silently gain a
    # 'Count' column as a side effect of this call.
    tagged = df.copy()
    tagged["Count"] = 1

    # Count(y -> x): number of rows per (state, observation) pair
    emission_count = (
        tagged.groupby(['State', 'Observation']).count()
        .reset_index(level='Observation')
    )

    # Count(y): number of rows per state, aligned on the 'State' index
    state_totals = tagged.groupby('State')['Count'].count().rename('Count_State')
    emission_count = emission_count.join(state_totals)

    emission_count["emission"] = emission_count['Count'] / emission_count['Count_State']
    emission_count = emission_count.drop('Count_State', axis=1)

    return emission_count

### (10 pts) During the testing phase, if a word does not appear in the “modified training set”, we replace that word with #UNK# as well. Set k to 3 and implement this fix in your function for computing the emission parameters.

In [7]:
def Replace_With_Unk(df, k):
    """
    Merge rare words into the '#UNK#' token.

    Inputs:
    df: emission table indexed by 'State' with the columns 'Observation',
        'Count' and 'emission'; it is left unmodified
    k: a word whose 'Count' summed over all states is smaller than k is
       replaced with '#UNK#'

    Output:
    dataframe indexed by ('State', 'Observation') in which 'Count' and
    'emission' are summed over the rows that were merged together
    """
    emission_count = df.copy()

    # Total occurrences of every word, regardless of state
    word_totals = emission_count.groupby('Observation')['Count'].sum()
    rare_words = word_totals.index[word_totals < k]

    # A single .loc[rows, column] write is used because a chained
    # `frame[col].loc[mask] = value` does not reach the frame once pandas
    # runs with Copy-on-Write. The mask is passed as a plain array since the
    # 'State' index holds duplicates and must not be used for alignment.
    is_rare = emission_count['Observation'].isin(rare_words).values
    emission_count.loc[is_rare, 'Observation'] = '#UNK#'

    emission_count = emission_count.groupby(['State', 'Observation']).sum()

    return emission_count

### (10 pts) Implement a simple sentiment analysis system that produces the tag $y^* = \arg\max_y e(x \mid y)$ for each word $x$ in the sequence

In [9]:
def convertToDataFrame(df):
    """
    Pick, for every observation, the state with the largest emission value.

    Input:
    df: emission table indexed by ('Observation', 'State'), in that level
        order, with an 'emission' column

    Output:
    tags: dataframe indexed by 'Observation' whose single column 'State'
          holds the arg-max state of that observation
    """
    # idxmax returns the full index label of the best row in each group,
    # i.e. one (Observation, State) tuple per observation. The score column
    # is selected by name rather than by position.
    best_rows = df.groupby("Observation")['emission'].idxmax()

    tags = pd.DataFrame(list(best_rows), columns=['Observation', 'State']).set_index('Observation')

    return tags

In [10]:
def predictOutput(language, tags):
    """
    Tag every word of ./<language>/dev.in and write ./<language>/dev.p2.out.

    Inputs:
    language: 'CN' , 'EN', 'FR', 'SG'
    tags: dataframe indexed by observation whose last column holds the
          predicted state; it needs a '#UNK#' row, which is used for every
          word that is not in the index

    Each output line is '<word> <state>' and blank input lines are written
    with an empty word and an empty state. The output file is overwritten on
    every call, so running the prediction again does not append a second
    copy of the results.

    Raises KeyError if an unknown word is met and tags has no '#UNK#' row.
    """
    with open('./' + language + '/dev.in', encoding='utf8') as test:
        test_lines = [line.strip("\n") for line in test]

    # Build the word -> state mapping once; a plain dict lookup per word
    # replaces a dataframe lookup per word.
    best_tag = tags.iloc[:, -1].to_dict()

    test_pred = []
    for word in test_lines:
        if word == '':
            # Blank line separating two tweets: keep it blank in the output
            label = ''
        elif word in best_tag:
            label = best_tag[word]
        else:
            label = best_tag["#UNK#"]
        test_pred.append([word, label])

    tests = pd.DataFrame(test_pred, columns=['Observation', "Predictions"])
    tests.to_csv('./' + language + '/dev.p2.out', header=False, index=False, sep=' ', encoding='utf8')

In [11]:
def __main__(language):
    """
    Run the Part 2 pipeline for one language.

    language = 'CN' , 'EN', 'FR', 'SG'

    Reads the training data, estimates the emission parameters with
    rare words (fewer than 3 occurrences) merged into '#UNK#', derives the
    arg-max state of every observation and writes the predictions for the
    language's dev.in file.
    """
    df = createDf(language)
    emission_count = Count_Emission(df)
    emission_count = Replace_With_Unk(emission_count, 3)

    # Put 'Observation' first so the rows can be grouped per word.
    # sort_index(level=0) replaces DataFrame.sortlevel(), which no longer
    # exists in pandas 1.0 and later.
    emission_count = emission_count.swaplevel(i='Observation', j='State').sort_index(level=0)
    tags = convertToDataFrame(emission_count)

    predictOutput(language, tags)
    print("File is saved")
    

In [12]:
# Turn off pandas' chained-assignment warning message
pd.set_option('mode.chained_assignment', None)

# Languages: 'CN' , 'EN', 'FR', 'SG'
# Change the language below accordingly
language = 'EN'
__main__(language)


  import sys


File is saved
