In [1]:
import glob
import pandas as pd
import numpy as np

In [2]:
# Entity labels that builddict skips when it collects annotations.
not_wanted = {
    "Greater",
    "Equal",
    "Pre",
    "Post",
    "Extrinsic-Pre",
    "Extrinsic-Post",
    "Assign",
    "Less",
    "OperationName",
    "Comparison",
}

In [3]:
# This function builds a dictionary from an .ann file: each key is the start index (`si`) of an
# annotation, and each value is the pair [entity label, annotated text]

def builddict(file):
    """Collect the wanted "T" annotations of an .ann file.

    Only lines that start with "T" (after stripping surrounding whitespace)
    are considered. The first tab-separated field is ignored, the second is
    split on single spaces into ``entity``, ``si`` and ``ei``, and the third
    is the annotated text. Lines whose ``entity`` is in the module-level
    ``not_wanted`` set are skipped.

    Args:
        file: Path of the annotation file to read.

    Returns:
        A dict that maps ``si`` (kept as a string) to the two-element list
        ``[entity, text]``. When several annotations share the same ``si``
        only the first one found in the file is kept.

    Raises:
        IndexError: if a "T" line has no second tab-separated field, or a
            kept annotation has no third one.
        ValueError: if the second field of a wanted annotation does not
            consist of exactly three space-separated parts.
    """
    d = {}
    # The context manager closes the handle even if a malformed line raises;
    # the previous version left the file open.
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line.startswith("T"):
                continue
            fields = line.split("\t")[1:]
            parts = fields[0].split(" ")
            entity = parts[0]
            if entity in not_wanted:
                continue
            # Exactly three parts are expected here: label, start, end.
            _, si, ei = parts
            if si not in d:
                # First annotation at this start index wins.
                d[si] = [str(entity), str(fields[1])]
    return d

In [4]:
# This function builds a list of lists for a given txt file; each inner list holds a piece of text
# from the file and its corresponding tag (Variable/Value, or '0' when it is not annotated)

def build_dataframe(file, d):
    """Walk a text file character by character and label the pieces found.

    Args:
        file: Path of the text file to read.
        d: Dictionary whose keys are character offsets stored as strings and
            whose values are ``[entity, text]`` lists (the shape produced by
            ``builddict``). Presumably the offsets refer to this same text
            file -- confirm against the caller.

    Returns:
        A tuple ``(df, nonzerocount)``. ``df`` is a list of ``[text, tag]``
        lists where ``tag`` is "Variable", "Value" or '0'. ``nonzerocount``
        is the number of "Variable"/"Value" entries that were appended.
    """
    df = []
    with open(file) as f:
        # Offset used for the lookups in d. It is updated only after a
        # character has been handled, so it normally holds the offset of the
        # previously processed character, or the start offset of the span
        # while one is being collected.
        cindex = 0
        # True while the characters of an annotated span are being collected.
        seen = False
        # Total length of the lines already processed, so that end+i is the
        # offset of a character counted from the start of the file.
        end = 0
        nonzerocount = 0
        temp = f.readlines()
        # Becomes True at the first line containing "DESCRIPTION" and is never
        # reset; unannotated text is only emitted once it is set.
        flag = False
        
        for line in temp:
            # The buffer is reset for every line: text still buffered from the
            # previous line is dropped here.
            tempstr = ""
            
            if "DESCRIPTION" in line:
                flag = True
            for i,char in enumerate(line):
                i_inner = end+i
                # NOTE: `and` binds tighter than `or`, so this condition reads
                # `str(cindex) in d or (seen and flag)`.
                if str(cindex) in d or seen and flag:
                    seen = True
                    tempstr += char
                    
                    # The span is complete once the current offset equals the
                    # start offset plus the length of the annotated text.
                    if seen and str(cindex) in d and i_inner == cindex+int(len(d[str(cindex)][1])):
                        seen = False
                        # Add the collected text to df. tempstr[:-1] drops the
                        # character that was appended just above.
                        # Every entity other than "Value" is labelled "Variable".
                        if d[str(cindex)][0] != "Value":
#                             df.append([tempstr[:-1], d[str(cindex)][0]])
                            df.append([tempstr[:-1], "Variable"])
                        else:
                            df.append([tempstr[:-1], d[str(cindex)][0]])
                        tempstr = ""
                        cindex = i_inner
                        nonzerocount += 1

                else:
                    if flag:
                        # Outside a span: a space flushes the buffer as an
                        # untagged ('0') entry, any other character is buffered.
                        if char == " ":
                            df.append([tempstr, '0'])
                            tempstr = ""
                        else:
                            tempstr += char

                    cindex = i_inner

            end += len(line)
    return df,nonzerocount

In [5]:
# This cell generates the training data per file and appends to the data list

import os
from collections import defaultdict

# A sample is processed only when a file with each of these suffixes exists.
EXTENSIONS = {'.ann', '.txt'}

# directory = 'only ann'
directory = 'Annotated files/'

# Maps "directory + file name without suffix" to the number of wanted
# suffixes found for it.
grouped_files = defaultdict(int)

for f in os.listdir(directory):
    name, ext = os.path.splitext(os.path.join(directory, f))
    if ext not in EXTENSIONS:
        continue
    grouped_files[name] += 1

# data collects the [text, tag] rows of every processed file; totalcount sums
# the annotation dictionary sizes and tc2 the tagged-entry counts.
data = []
totalcount = 0
tc2 = 0
for name, found in grouped_files.items():
    if found != len(EXTENSIONS):
        # One of the two files is missing for this name.
        continue
    filetxt = name + '.txt'
    fileann = name + '.ann'
    d = builddict(fileann)
    totalcount += len(d)
    df, nzc = build_dataframe(filetxt, d)
    tc2 += nzc
    data.extend(df)

In [6]:
# Convert the accumulated [text, tag] rows in `data` into a pandas DataFrame
# with the columns 'entity' (the text) and 'tag' (its label).
dataframe = pd.DataFrame(data, columns=['entity', 'tag'])

In [7]:
# Run this cell to get rid of empty entries: empty entity strings are turned
# into NaN and every row containing NaN is then dropped.
# The replaced column is assigned back explicitly. Calling
# replace(..., inplace=True) on dataframe['entity'] operates on an
# intermediate object and does not reach `dataframe` when pandas
# Copy-on-Write is active.
dataframe['entity'] = dataframe['entity'].replace('', np.nan)
dataframe = dataframe.dropna()

In [8]:
# Preview the first rows whose tag is anything other than '0'.
dataframe.loc[dataframe['tag'] != '0'].head()

Unnamed: 0,entity,tag
3,current filehandle,Variable
22,filehandle for its parent directory,Value
26,current filehandle,Variable
33,parent directory,Value
36,NFS4ERR_NOENT,Value


In [9]:
# Writing train data to file
def write_to_csv(dataframe, path="train_data.csv"):
    """Write ``dataframe`` to a CSV file.

    Args:
        dataframe: The pandas DataFrame to save. ``to_csv`` is called with
            its defaults, so the index is written as the first column.
        path: Destination file. Defaults to "train_data.csv", the name that
            used to be hard-coded, so existing one-argument calls behave as
            before.
    """
    dataframe.to_csv(path)
# write_to_csv(dataframe)

In [10]:
# splitting row entry over spaces
import re
import nltk

# One output row per space-separated token: [entity, tag, sentence id, POS].
rows = []
sentence_id = 0

for index, row in dataframe.iterrows():
    for ele in row['entity'].split(" "):
        # First strip brackets and commas only. '<' and '>' are kept at this
        # point so that the "<NULL>" marker can still be recognised.
        marked = re.sub('[(){},]', '', ele)
        # The stored entity and the tagger input have the angle brackets
        # removed as well.
        ele = re.sub('[<>]', '', marked)
        token = nltk.word_tokenize(ele)
        tagged = nltk.pos_tag(token)
        # Tokens that tokenize to nothing (e.g. empty strings) get ".".
        pos_tag = tagged[0][1] if tagged else "."
        rows.append([ele, row['tag'], sentence_id, pos_tag])
        # A token carrying "<NULL>" closes the current sentence. The check
        # must use the text from before '<' and '>' were stripped; testing the
        # fully cleaned token can never match, which left every row in
        # sentence 0.
        if "<NULL>" in marked:
            sentence_id += 1
df2 = pd.DataFrame(rows, columns=['entity', 'tag', 'Sentence #', 'POS'])
df2.to_csv("train_data_3.csv")

In [11]:
# Show the token rows whose tag is anything other than '0'.
df2.loc[df2['tag'] != '0']

Unnamed: 0,entity,tag,Sentence #,POS
2,current,Variable,0,JJ
3,filehandle,Variable,0,NN
21,filehandle,Value,0,NN
22,for,Value,0,IN
23,its,Value,0,PRP$
24,parent,Value,0,NN
25,directory,Value,0,NN
29,current,Variable,0,JJ
30,filehandle,Variable,0,NN
36,parent,Value,0,NN


In [12]:
# Distinct tag labels present in df2.
tags = list({tag for tag in df2["tag"].values})

In [13]:
# Keep the 'entity' column of df2 under a short name.
et = df2['entity']

In [14]:
# Sanity check of the NLTK tokenizer and POS tagger on a sample sentence
import nltk
text = nltk.word_tokenize("We are going out.Just you and me.")
# print(...) with a single argument gives the same output on Python 2 and
# Python 3; the bare print statement is a syntax error on Python 3.
print(nltk.pos_tag(text))

[('We', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('out.Just', 'IN'), ('you', 'PRP'), ('and', 'CC'), ('me', 'PRP'), ('.', '.')]
