# Part 2

### Recall that the HMM discussed in class is defined as follows:

<img src="images/hmm_eqn.jpg">

#### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

# Containers for the states and observations
tweets = []          # A list of all the tweets (Each tweet is a list)
word_count, tweet_count = 0, 0

# Import training data
with open('./SG/train', encoding='utf8') as f:
    training_lines = f.readlines()
    
    # For each line in the file
    for line in training_lines: 
        
        # If line is empty (i.e. we enter a new tweet)
        if line in['\n', '\r\n']: # Initialize a new tweet, reset word count
            if word_count != 0: #If the previous tweet was not empty, increase tweet count
                tweet_count += 1
            word_count = 0
        
        else:
            # Remove the spaces in each line
            stripped = line.strip().split(" ")
            if len(stripped) == 2:
                if word_count == 0:
                    tweets.append([tweet_count, word_count,'None','Start'])
                    word_count += 1
                tweets.append([tweet_count, word_count] + stripped)
                word_count += 1

#### Obtain count of labels

In [2]:
# Get Count(i) / Count(j)
df = pd.DataFrame(tweets,columns=['Tweet', 'Word', 'Observation', 'State'])
df = df.set_index(['Tweet', 'Word'])
states_counter=df.groupby('State').count().reset_index()
count_i_array=states_counter.as_matrix()
print(count_i_array)


[['B-negative' 1299]
 ['B-neutral' 5722]
 ['B-positive' 2613]
 ['I-negative' 443]
 ['I-neutral' 5272]
 ['I-positive' 1653]
 ['O' 91753]
 ['Start' 7094]]


### (5 pts) Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation):

<img src="images/MLE.jpg">

In [3]:
df["Count"] = 1
state_obs_counter = df.groupby(['State','Observation'],).count().reset_index()
count_y_o = state_obs_counter.as_matrix()

# Print State, Observation, Count
print(count_y_o)

[['B-negative' '#1MDB' 1]
 ['B-negative' '#AHS6' 1]
 ['B-negative' '#AustonMatthews' 2]
 ..., 
 ['O' '️' 226]
 ['O' '＠' 1]
 ['Start' 'None' 7094]]


In [4]:
e = []
count_i_1d = count_i_array.flatten()
count_i_1d_list = count_i_1d.tolist()

for i in range(0, len(count_y_o)):
    y = count_y_o[i][0]
    index_y = count_i_1d_list.index(y)
    count_y = count_i_1d_list[index_y + 1]
    count_y_x = count_y_o[i][2]
    e_x_y = count_y_x / count_y
    e.append(e_x_y)
    
state_obs_counter["e(x|y)"] = e

In [5]:
print(state_obs_counter)

            State                      Observation  Count    e(x|y)
0      B-negative                            #1MDB      1  0.000770
1      B-negative                            #AHS6      1  0.000770
2      B-negative                  #AustonMatthews      2  0.001540
3      B-negative                          #Beatty      1  0.000770
4      B-negative                      #Blackhawks      1  0.000770
5      B-negative                             #C51      1  0.000770
6      B-negative                             #CNN      1  0.000770
7      B-negative                            #CSIS      1  0.000770
8      B-negative                           #China      1  0.000770
9      B-negative                         #Clinton      1  0.000770
10     B-negative                  #CrookedHillary      1  0.000770
11     B-negative                       #DailyMail      1  0.000770
12     B-negative                    #DavidCameron      1  0.000770
13     B-negative                            #Da

### (10 pts) During the testing phase, if the word does not appear in the “modified training set”, we replace that word with #UNK# as well. Set k to 3, implement this fix into your function for computing the emission parameters.

In [6]:
k = 3

# Replace observation words with #UNK# for observations with count <= 3
state_obs_counter['Observation'].loc[state_obs_counter['Count'] < k] = '#UNK#'

print(state_obs_counter)

            State      Observation  Count    e(x|y)
0      B-negative            #UNK#      1  0.000770
1      B-negative            #UNK#      1  0.000770
2      B-negative            #UNK#      2  0.001540
3      B-negative            #UNK#      1  0.000770
4      B-negative            #UNK#      1  0.000770
5      B-negative            #UNK#      1  0.000770
6      B-negative            #UNK#      1  0.000770
7      B-negative            #UNK#      1  0.000770
8      B-negative            #UNK#      1  0.000770
9      B-negative            #UNK#      1  0.000770
10     B-negative            #UNK#      1  0.000770
11     B-negative            #UNK#      1  0.000770
12     B-negative            #UNK#      1  0.000770
13     B-negative            #UNK#      1  0.000770
14     B-negative            #UNK#      1  0.000770
15     B-negative            #UNK#      1  0.000770
16     B-negative            #UNK#      1  0.000770
17     B-negative            #UNK#      1  0.000770
18     B-neg

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


### (10 pts) Implement a simple sentiment analysis system that produces the tag y* = arg max e(x|y) for each word x in the sequence