# Part 2 (25 points)

### Just a means to import your train, dev.in files





In [1]:
# from google.colab import drive

# drive.mount('/content/gdrive/')

### Get necessary library imports

In [2]:
import pandas as pd 
import numpy as np
import itertools 

### Establish which language folder you are using

In [3]:
filepath = 'EN'
# filepath = 'SG'
# filepath = 'CN'
# filepath = 'AL'


### Setting up our training data to be used

In [4]:
#Assign Column names
colnames=['word', 'tag'] 

#Import the dataset without blank lines
df  = pd.read_csv(f"{filepath}/train", sep=" ", names=colnames, header=None)
df.dropna(inplace=True)
df

Unnamed: 0,word,tag
0,Municipal,B-NP
1,bonds,I-NP
2,are,B-VP
3,generally,B-ADVP
4,a,B-ADJP
...,...,...
181623,resolved,I-VP
181624,in,B-PP
181625,the,B-NP
181626,West,I-NP


### Find the total counts of each unique word and tag pair

In [5]:
#Find the word count for a given tag
word_tag_count = df.groupby(['word','tag']).size().to_frame('count').reset_index()
word_tag_count

Unnamed: 0,word,tag,count
0,!,O,15
1,#,B-ADJP,3
2,#,B-NP,20
3,#,I-NP,10
4,#,O,1
...,...,...,...
25046,zestfully,B-VP,1
25047,zigzags,I-NP,1
25048,zip,I-NP,1
25049,zones,I-NP,2


### Get the total counts of each word

In [6]:
#Find the count for each word
word_count = df.groupby('word').size().to_frame('count').reset_index()
word_count

Unnamed: 0,word,count
0,!,15
1,#,34
2,$,1476
3,%,1030
4,&,171
...,...,...
18207,zestfully,1
18208,zigzags,1
18209,zip,1
18210,zones,2


### First we filter out words appearing less than k times, then we smooth by replacing the words with #UNK#

In [7]:
#Filter out words with count less than k
k = 3
word_count = word_count.drop(word_count[word_count['count'] < k].index)
word_count

Unnamed: 0,word,count
0,!,15
1,#,34
2,$,1476
3,%,1030
4,&,171
...,...,...
18194,yields,25
18195,you,137
18196,young,17
18197,younger,9


In [8]:
#Replace words that appear less than k times with #UNK#
cols = [col for col in word_tag_count.columns if col == 'word']
word_tag_count.loc[~word_tag_count['word'].isin(word_count['word']), cols ] = '#UNK#'
word_tag_count

Unnamed: 0,word,tag,count
0,!,O,15
1,#,B-ADJP,3
2,#,B-NP,20
3,#,I-NP,10
4,#,O,1
...,...,...,...
25046,#UNK#,B-VP,1
25047,#UNK#,I-NP,1
25048,#UNK#,I-NP,1
25049,#UNK#,I-NP,2


In [9]:
#Combine all the #UNK# for a given tag
word_tag_count = word_tag_count.groupby(['word','tag'])['count'].sum().to_frame().reset_index()
word_tag_count

Unnamed: 0,word,tag,count
0,!,O,15
1,#,B-ADJP,3
2,#,B-NP,20
3,#,I-NP,10
4,#,O,1
...,...,...,...
12166,young,I-NP,7
12167,younger,B-NP,4
12168,younger,I-NP,5
12169,your,B-NP,38


### Find total counts of each tag

In [10]:
#Find the total count for a given tag
tag_count = word_tag_count.groupby('tag')['count'].sum().to_frame().reset_index()
tag_count

Unnamed: 0,tag,count
0,B-ADJP,1751
1,B-ADVP,3565
2,B-CONJP,49
3,B-INTJ,26
4,B-LST,11
5,B-NP,47305
6,B-PP,18387
7,B-PRT,468
8,B-SBAR,1899
9,B-UCP,1


### Our function for finding the emission

In [11]:
#Fucntion to find the emission value of a word for a given tag
def find_emission(tag, w_count):
    return w_count/(tag_count.loc[tag_count['tag'] == tag]['count'])

In [12]:
#Find emission parameters
word_tag_count['e'] = np.vectorize(find_emission)(word_tag_count['tag'].values, word_tag_count['count'].values)
# word_tag_count.to_csv('emission.csv', header = True)
word_tag_count

Unnamed: 0,word,tag,count,e
0,!,O,15,0.000628
1,#,B-ADJP,3,0.001713
2,#,B-NP,20,0.000423
3,#,I-NP,10,0.000183
4,#,O,1,0.000042
...,...,...,...,...
12166,young,I-NP,7,0.000128
12167,younger,B-NP,4,0.000085
12168,younger,I-NP,5,0.000092
12169,your,B-NP,38,0.000803


### Read and open up our dev.in file

In [13]:
#Read the test dataset
f = open(f"{filepath}/dev.in", "r")
f_list = f.read().splitlines()
f_list

['HBO',
 'has',
 'close',
 'to',
 '24',
 'million',
 'subscribers',
 'to',
 'its',
 'HBO',
 'and',
 'Cinemax',
 'networks',
 ',',
 'while',
 'Showtime',
 'and',
 'its',
 'sister',
 'service',
 ',',
 'The',
 'Movie',
 'Channel',
 ',',
 'have',
 'only',
 'about',
 '10',
 'million',
 ',',
 'according',
 'to',
 'Paul',
 'Kagan',
 'Associates',
 ',',
 'a',
 'Carmel',
 ',',
 'Calif.',
 ',',
 'research',
 'firm',
 '.',
 '',
 'WASHINGTON',
 'LIES',
 'LOW',
 'after',
 'the',
 'stock',
 'market',
 "'s",
 'roller-coaster',
 'ride',
 '.',
 '',
 'This',
 'may',
 'seem',
 'to',
 'be',
 'a',
 'preposterous',
 'and',
 'utterly',
 'futile',
 'effort',
 'in',
 'Africa',
 '.',
 '',
 'American',
 'Express',
 'Bank',
 'earnings',
 'fell',
 '50',
 '%',
 'to',
 '$',
 '21.3',
 'million',
 'from',
 '$',
 '42.5',
 'million',
 'despite',
 'a',
 '29',
 '%',
 'revenue',
 'gain',
 '.',
 '',
 'Californians',
 ',',
 'meanwhile',
 ',',
 'tried',
 'to',
 'cope',
 'with',
 'still-limited',
 'services',
 ',',
 'blocked',

### Assign tags to each of the words in our dev.in data, and output the result in dev.p2.out file

In [14]:
# Do a sentiment analysis and save it in a dev.p2.out file
df_unk = df_temp = word_tag_count[word_tag_count['word'] == '#UNK#']
unk_label = df_unk.loc[df_unk['e'].idxmax()]['tag']
l_count = 0
l_len = len(f_list)
with open(f"{filepath}/dev.p2.out", 'w') as out:
    l_count += 1
    for i in f_list:
        temp = ''
        if i != '':
            df_temp = word_tag_count[word_tag_count['word'] == i]
            if len(df_temp) > 0:
                temp = i + ' ' + df_temp.loc[df_temp['e'].idxmax()]['tag']
            else:
                temp = i + ' ' + unk_label
        else:
            temp = '\n'
            
        #writing to output file
        if l_count != l_len and temp != '\n':
            out.write("%s\n" % temp)
        else:
            out.write(temp)    