# Part 2

### Recall that the HMM discussed in class is defined as follows:

<img src="images/hmm_eqn.jpg">

#### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

# Containers for the states and observations
tweets = []          # A list of all the tweets (Each tweet is a list)
word_count, tweet_count = 0, 0

# Import training data
with open('./SG/train', encoding='utf8') as f:
    training_lines = f.readlines()
    
    # For each line in the file
    for line in training_lines: 
        
        # If line is empty (i.e. we enter a new tweet)
        if line in['\n', '\r\n']: # Initialize a new tweet, reset word count
            if word_count != 0: #If the previous tweet was not empty, increase tweet count
                tweet_count += 1
            word_count = 0
        
        else:
            # Remove the spaces in each line
            stripped = line.strip().split(" ")
            if len(stripped) == 2:
                if word_count == 0:
                    tweets.append([tweet_count, word_count,'None','Start'])
                    word_count += 1
                tweets.append([tweet_count, word_count] + stripped)
                word_count += 1

#### Obtain count of labels

In [2]:
# Get Count(i) / Count(j)
df = pd.DataFrame(tweets,columns=['Tweet', 'Word', 'Observation', 'State'])
df = df.set_index(['Tweet', 'Word'])
states_counter = df.groupby('State').count().reset_index()
count_i_array = states_counter.as_matrix()
print(count_i_array)


[['B-negative' 1299]
 ['B-neutral' 5722]
 ['B-positive' 2613]
 ['I-negative' 443]
 ['I-neutral' 5272]
 ['I-positive' 1653]
 ['O' 91753]
 ['Start' 7094]]


In [3]:
transistion = df
transistion['J'] = transistion['State']
for Tweet, new_df in df.groupby(level=0):

    step = transistion.loc[Tweet,'State'].shift(-1).as_matrix()
    step.put(-1,'Stop')
    transistion.loc[Tweet,'J'] = step
transistion

Unnamed: 0_level_0,Unnamed: 1_level_0,Observation,State,J
Tweet,Word,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,,Start,O
0,1,I'm,O,O
0,2,at,O,B-neutral
0,3,VivoCity,B-neutral,O
0,4,in,O,B-neutral
0,5,Singapore,B-neutral,O
0,6,https://t.co/naeH7jxBhE,O,Stop
1,0,,Start,O
1,1,I,O,O
1,2,miss,O,B-positive


In [4]:
count_transistion = transistion.groupby(['State','J']).count()
count_transistion['aij'] = count_transistion['Observation']
for i_state,i_sum in count_i_array:
    count_transistion.loc[i_state,'aij'] = count_transistion.loc[i_state,'aij'].as_matrix()/float(i_sum)

count_transistion

Unnamed: 0_level_0,Unnamed: 1_level_0,Observation,aij
State,J,Unnamed: 2_level_1,Unnamed: 3_level_1
B-negative,B-negative,2,0.00154
B-negative,B-positive,1,0.00077
B-negative,I-negative,353,0.271747
B-negative,O,913,0.702848
B-negative,Stop,30,0.023095
B-neutral,B-negative,2,0.00035
B-neutral,B-neutral,32,0.005592
B-neutral,B-positive,4,0.000699
B-neutral,I-neutral,2673,0.467144
B-neutral,O,2908,0.508214


### (5 pts) Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation):

<img src="images/MLE.jpg">

In [5]:
df["Count"] = 1
state_obs_counter = df.groupby(['State','Observation'],).count().reset_index()
count_y_o = state_obs_counter.as_matrix()

# Print State, Observation, Count
print(count_y_o)

[['B-negative' '#1MDB' 1 1]
 ['B-negative' '#AHS6' 1 1]
 ['B-negative' '#AustonMatthews' 2 2]
 ..., 
 ['O' '️' 226 226]
 ['O' '＠' 1 1]
 ['Start' 'None' 7094 7094]]


In [6]:
e = []
count_i_1d = count_i_array.flatten()
count_i_1d_list = count_i_1d.tolist()

for i in range(0, len(count_y_o)):
    y = count_y_o[i][0]
    index_y = count_i_1d_list.index(y)
    count_y = count_i_1d_list[index_y + 1]
    count_y_x = count_y_o[i][2]
    e_x_y = count_y_x / count_y
    e.append(e_x_y)
    
state_obs_counter["e(x|y)"] = e

In [7]:
print(state_obs_counter)

            State                      Observation     J  Count    e(x|y)
0      B-negative                            #1MDB     1      1  0.000770
1      B-negative                            #AHS6     1      1  0.000770
2      B-negative                  #AustonMatthews     2      2  0.001540
3      B-negative                          #Beatty     1      1  0.000770
4      B-negative                      #Blackhawks     1      1  0.000770
5      B-negative                             #C51     1      1  0.000770
6      B-negative                             #CNN     1      1  0.000770
7      B-negative                            #CSIS     1      1  0.000770
8      B-negative                           #China     1      1  0.000770
9      B-negative                         #Clinton     1      1  0.000770
10     B-negative                  #CrookedHillary     1      1  0.000770
11     B-negative                       #DailyMail     1      1  0.000770
12     B-negative                    #

### (10 pts) During the testing phase, if the word does not appear in the “modified training set”, we replace that word with #UNK# as well. Set k to 3, implement this fix into your function for computing the emission parameters.

In [8]:
k = 3

# Replace observation words with #UNK# for observations with count <= 3
state_obs_counter['Observation'].loc[state_obs_counter['Count'] < k] = '#UNK#'

print(state_obs_counter)

            State      Observation     J  Count    e(x|y)
0      B-negative            #UNK#     1      1  0.000770
1      B-negative            #UNK#     1      1  0.000770
2      B-negative            #UNK#     2      2  0.001540
3      B-negative            #UNK#     1      1  0.000770
4      B-negative            #UNK#     1      1  0.000770
5      B-negative            #UNK#     1      1  0.000770
6      B-negative            #UNK#     1      1  0.000770
7      B-negative            #UNK#     1      1  0.000770
8      B-negative            #UNK#     1      1  0.000770
9      B-negative            #UNK#     1      1  0.000770
10     B-negative            #UNK#     1      1  0.000770
11     B-negative            #UNK#     1      1  0.000770
12     B-negative            #UNK#     1      1  0.000770
13     B-negative            #UNK#     1      1  0.000770
14     B-negative            #UNK#     1      1  0.000770
15     B-negative            #UNK#     1      1  0.000770
16     B-negat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


### (10 pts) Implement a simple sentiment analysis system that produces the tag y* = arg max e(x|y) for each word x in the sequence