In [10]:
import numpy as np
import pandas as pd
import time

# 01.112 Machine Learning Design Project

## About the Project

We have 4 datasets in the `/data` folder. For each dataset, there is: 
- a labelled training set train, 
- an unlabelled development set `dev.in`
- a labelled development set `dev.out` 

The labelled data has the format of: `token` `\t` `tag`
- one token per line
- token and tag separated by tab 
- single empty lines that separates sentences

For the labels, they are slightly different for different datasets.
- SG, CN (Entity):
    - B-*: Beginning of entity
    - I-*: Inside of entity
    - O: Outside of any entity
- EN, AL (Phrase):
    - B-VP: Beginning of Verb Phrase
    - I-VP: Inside of Verb Phrase
    - *-NP: Noun Phrase
    - *PP: Propositional Phrase
    - O: Outside of any phrase

*Goal*: Build sequence labelling systems from training data (x) and use it to predict tag sequences for new sentences (y).

## Team members 
- Andri Setiawan Susanto
- Eldon Lim 
- Tey Siew Wen

## Part 1
Already completed individually.

## Part 2

a) Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation):

In [23]:
def emissionPara(xy,y):
    e_x_y= xy/y
    
    return e_x_y

b)

1. Make a modified training set by replacing those words that appear $<k$ times in the training set with a special word token `#UNK#` before training.
2. During testing phase, ifaworddoesnot appear in the modified training set, we also replace that wordwith `#UNK#`.
3. Compute Emission Paramters with the function in (a)

In [8]:
k = 3
replaceWord = "#UNK#" 

In [3]:
def split_into_columns(df_column):
    new = df_column.str.split(" ", n=1, expand=True)
    return new[0], new[1]

In [14]:
def preprocess_training(data,k):
    global replaceWord
    
    start = time.process_time()   
    x_dic = {}
    
    # dropping null value columns e.g. index_col to avoid errors 
    df= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python")
    
    # new data frame with split value columns 
    df["x"], df["y"] = split_into_columns(df["original"])

    # df display: record x value and replace y values with replaceWord when necessary, in respective dictionaries
    uniqueX, uniqueCountX= np.unique(df['x'].astype(str),return_counts=True)
    for i in range(len(uniqueX)):
        x_dic[uniqueX[i]] = uniqueCountX[i]

    for i, text in enumerate(df['x']):
        if x_dic[text] < k:
            df['x'][i] = replaceWord
            df['original'][i]=df['original'][i].replace(text,replaceWord, 1)
    
    y_dic={}
    
    uniqueY, uniqueCountY= np.unique(df['y'].astype(str),return_counts=True)
    for i in range(len(uniqueY)):
        y_dic[uniqueY[i]] = uniqueCountY[i]
        
    xy_dic = {}
    df1= df.copy()
    
    # Get a tuple of unique values & their count from a numpy array
    df1.dropna(inplace = True) 
    uniqueXY, uniqueCountXY= np.unique(df1['original'].astype(str),return_counts=True)

    for i in range(len(uniqueXY)):
        xy_dic[uniqueXY[i]] = uniqueCountXY[i]
    # print('Unique Values : ', uniqueValues)
    
    # print('Count of Unique Values : ', uniqueCount)
    dft = pd.DataFrame([uniqueXY,uniqueCountXY]).T
    dft=dft.rename({0:'x_y',1:'count_x_y'},axis='columns')
    
    dft['count_y']=0
    for i,text in enumerate(dft['x_y']):
        data = text.split(" ")
        dft['count_y'][i]=y_dic[data[1]]
        
    dft['emission']=emissionPara(dft['count_x_y'], dft['count_y'])
    
    # new data frame with split value columns 
    dft1 = dft.copy()
    dft1["x"], dft1["y"] = split_into_columns(dft1["x_y"])
    
    xy_pred_dic = {}

    for word in dft1['x']:
        index = pd.Series.idxmax((dft1.loc[dft1['x'] == word]['emission']).astype(float))
        xy_pred_dic[word]=dft1['y'][index] 
    print("Time taken for train data: ", time.process_time() - start)
    return xy_pred_dic

In [5]:
def preprocess_test(data,k):
    global replaceWord
    
    start = time.process_time()   

    testdf1= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python")
    testdf= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False,skip_blank_lines=False, engine="python")

    x_dic = {}

    uniqueX, uniqueCountX= np.unique(testdf1['original'].astype(str),return_counts=True)
    for i in range(len(uniqueX)):
        x_dic[uniqueX[i]] = uniqueCountX[i]

    testdf['modified']=''
#     print(testdf)
    for i, text in enumerate(testdf['original']):
    #         df['x'][i] = replaceWord
        try:
            if text not in xy_pred_dic:
            
                testdf['modified'][i]=testdf['original'][i].replace(text,replaceWord)
            else:
                testdf['modified'][i]=testdf['original'][i]
        except:
            continue
    testdf['predict_label']=''
    for index, word in enumerate(testdf['modified']):
#     print(word)
        try:
            testdf['predict_label'][index]= xy_pred_dic[word]
        except:
            continue
    print("Time taken for test data: ",time.process_time() - start)
    return testdf

For all the four datasets EN, AL, CN, and SG, learn these parameters with `train`, and evaluate your
system on the development set `dev.in` for each of the dataset. Write your output to `dev.p2.out`
for the four datasets respectively. Compare your outputs and the gold-standard outputs in `dev.out`
and report the precision, recall and F scores of such a baseline system for each dataset.

In [6]:
data_folders = ["AL", "EN","CN","SG"]
for x in data_folders:
    print("Performing sentiment analysis for data folder ", x)
    train_data = "./data/{}/train".format(x)
    test_data = "./data/{}/dev.in".format(x)
    test_result = "./data/{}/dev.out".format(x)
    
    xy_pred_dic = preprocess_training(train_data, k)
    testdf = preprocess_test(test_data,k)
    
    testresultdf = pd.read_csv(test_result, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python")
    new = testresultdf["original"].str.split(" ", n=1,expand=True) 

    # making separate first name column from new data frame 
    testresultdf["x"]= new[0] 

    # making separate last name column from new data frame 
    testresultdf["y"]= new[1]
    final = pd.DataFrame()
    
    final['result'] = testdf['modified'] + ' ' + testdf['predict_label']
    print(final.head(3))
    
    print("Writing the final result to dev.out...")
    f = open('./output/{}/dev.p2.out'.format(x) ,'w')
    for word in final['result']:
        f.write(word + '\n')
    f.close()

Performing sentiment analysis for data folder  AL


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Time taken for train data:  26.46875
Time taken for test data:  6.3125
     result
0  杭 B-CITY
1  州 I-CITY
2  市 I-CITY
Writing the final result to dev.out...
Performing sentiment analysis for data folder  EN
Time taken for train data:  58.296875
Time taken for test data:  16.53125
         result
0      HBO B-NP
1      has B-VP
2  close B-ADJP
Writing the final result to dev.out...
Performing sentiment analysis for data folder  CN
Time taken for train data:  104.453125
Time taken for test data:  14.046875
             result
0      一 I-negative
1  #UNK# B-negative
2  #UNK# B-negative
Writing the final result to dev.out...
Performing sentiment analysis for data folder  SG
Time taken for train data:  416.0
Time taken for test data:  42.734375
               result
0     Tour B-positive
1  Scotland B-neutral
2         followers O
Writing the final result to dev.out...


## Part 3

Write a function that estimates the transition parameters from the training set using MLE (maximum likelihood estimation):

In [43]:
from collections import Counter, defaultdict

test = pd.read_csv("./data/EN/train", sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
test.replace(np.nan, None, inplace=True)

def transitionPara(data):
    x, y = split_into_columns(test["original"])
    xy_dic = dict(zip(x, y))
    
    # Get bottom count (Count(yi))
    y_count = Counter(y)
    
    # Get top count (Count(yi-1, yi))
    subseq_count = defaultdict(int)
    for i in range(len(y)-1):
        y1 = y[i]
        y2 = y[i+1]
        if y1 == None or y2 == None:
            continue
        subseq_count[y1,y2] += 1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    
    for i in range(0,len(y)):
        # account for START and STOP
        if i == 0: 
            result[i] = 1
            continue
        elif i == len(y):
            result[i] = 1
            break
        
        # for all other nodes
        y1 = y[i-1]
        y2 = y[i]
        if y1 == None or y2 == None:
            continue
        result[i] = subseq_count[y1,y2] / y_count[y2]        
    
    return result

transitionPara(test)

array([1.00000000e+000, 5.92854267e-001, 4.03351221e-001, ...,
       3.61900222e-001, 6.91002260e-310, 6.91002360e-310])