# Zone Index

files:

In [1]:
# Names of the five HTML documents to index: d0.html .. d4.html.
files = ['d{}.html'.format(number) for number in range(5)]
files

['d0.html', 'd1.html', 'd2.html', 'd3.html', 'd4.html']

In [2]:
# Read the first document. The context manager closes the handle even when
# read() raises (e.g. on a decoding error), which the manual close() did not.
with open(files[0], 'r', encoding=None) as file:  # utf8 for one file
    content = file.read()

In [3]:
def get_preprocessed_text(text):
    """
    Normalize a piece of text for term counting.

    The text is lower-cased, runs of digits are deleted, every character that
    is neither a word character nor whitespace is deleted, and each run of
    whitespace is collapsed to a single space. Leading/trailing whitespace is
    collapsed but not stripped.

    Args:
        text (string): The text to convert.
    Returns:
        string: The text after preprocessing.
    """
    import re

    # A lone space is returned untouched.
    if text == ' ':
        return text

    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = re.sub(r'[^\w\s]', '', without_digits)
    return re.sub(r"\s+", " ", without_punctuation)

In [4]:
def get_title(document):
    """
    Return the lower-cased text between <title> and </title>.

    Args:
        document (string): Raw HTML of one document.
    Returns:
        string: The title text (surrounding whitespace is kept), or an empty
        string when the document has no complete <title>...</title> element.
    """
    open_tag, close_tag = '<title>', '</title>'
    document = document.lower()

    start = document.find(open_tag)
    if start == -1:
        # Without this guard the -1 from find() would produce a garbage slice.
        return ''
    start += len(open_tag)

    # Only a closing tag that follows the opening tag can end the title.
    end = document.find(close_tag, start)
    if end == -1:
        return ''

    return document[start:end]

In [5]:
# Extract the lower-cased title of the document read into `content`.
t = get_title(content)
t

'\nauditory and visual feedback during eye typing\n'

In [7]:
# Normalize the title: digits and punctuation removed, whitespace collapsed.
get_preprocessed_text(t)

' auditory and visual feedback during eye typing '

In [6]:
def get_abstract(document):
    """
    Return the lower-cased abstract zone of an HTML document.

    The abstract starts right after the first '<b>abstract</b>' marker (or,
    if that is absent, '<b>abstract:</b>') and ends just before the next
    '<p' that follows it. A '<br>' directly after the marker is skipped.

    Args:
        document (string): Raw HTML of one document.
    Returns:
        string: The abstract text, or an empty string when neither marker is
        present. When no '<p' follows the marker, the rest of the document is
        returned.
    """
    document = document.lower()

    for search in ('<b>abstract</b>', '<b>abstract:</b>'):
        index = document.find(search)
        if index != -1:
            break
    else:
        # No marker at all: slicing from a -1 offset would return garbage.
        return ''

    result = document[index + len(search):]
    if result.startswith('<br>'):
        result = result[len('<br>'):]

    end = result.find('<p')  # it is the end of abstract
    if end != -1:
        # Only cut when a terminator exists; result[:-1] would drop a character.
        result = result[:end]

    return result

In [7]:
# Extract the lower-cased abstract zone of the document read into `content`.
a = get_abstract(content)
a

"\nwe describe a study on how auditory and visual feedback affects eye \ntyping. results show that the feedback method influences both text entry\n speed and error rate. in addition, a proper feedback mode facilitates \neye typing by reducing the user's need to switch her gaze between the \non-screen keyboard and the typed text field.\n"

In [8]:
# Normalize the abstract: digits and punctuation removed, whitespace collapsed.
get_preprocessed_text(a)

' we describe a study on how auditory and visual feedback affects eye typing results show that the feedback method influences both text entry speed and error rate in addition a proper feedback mode facilitates eye typing by reducing the users need to switch her gaze between the onscreen keyboard and the typed text field '

In [9]:
def get_introduction(document):
    """
    Return the lower-cased introduction zone of an HTML document.

    The introduction title is always in an <h2> tag. The zone starts right
    after the closing </h2> of the first <h2> heading whose text contains
    'introduction' and ends just before the next <h2>.

    Args:
        document (string): Raw HTML of one document.
    Returns:
        string: The introduction markup (tags are not removed), or an empty
        string when no <h2> heading mentions 'introduction'. When no further
        <h2> follows, the rest of the document is returned.
    """
    open_tag, close_tag = '<h2>', '</h2>'
    document = document.lower()

    # Find the beginning of the zone: walk the <h2> headings one by one.
    # The search position must advance, otherwise the same heading is
    # inspected forever when the first one is not the introduction.
    position = 0
    while True:
        heading_start = document.find(open_tag, position)
        if heading_start == -1:
            return ''
        heading_end = document.find(close_tag, heading_start)
        if heading_end == -1:
            return ''
        body_start = heading_end + len(close_tag)
        if 'introduction' in document[heading_start:heading_end]:
            break
        position = body_start

    # Find the end of the zone: the next <h2> tag, if any.
    next_heading = document.find(open_tag, body_start)
    if next_heading == -1:
        return document[body_start:]
    return document[body_start:next_heading]

In [10]:
# Extract the introduction zone (still containing HTML tags) of `content`.
i = get_introduction(content)
i

'\n<p>\nfor people with severe disabilities their eyes may be the only means for\n communication. even though eye typing has been studied for many years, \nthere is little research on design issues [2]. our goal was to study how\n feedback could facilitate the tedious [1] eye typing task and make \ngaze-based computer-aided communication more practical for those who \nneed it.\n</p><p>\n\n</p><h3>\nfeedback modes\n</h3>\n\n<p>\nduring eye typing the user first focuses on the desired letter. to \nselect the focused letter she continues to fixate on it thus using dwell\n time as an activation command. feedback is given for focus and \nselection. the following four feedback modes were tested. \n</p><p>\n\n<b>visual only.</b> in the visual only mode, the key is highlighted on \nfocus (the 2nd key on left in figure 1) and its symbol shrinks as dwell \ntime elapses. the shrinking draws the attention in, helping the user \nfocus on the center of the key. on selection the letter turns red and 

In [11]:
def _remove_tags_starting_with(text, prefix):
    """Delete every span that runs from `prefix` up to and including the next '>'."""
    start = text.find(prefix)
    while start > -1:
        end = text.find('>', start)
        if end == -1:
            # Unterminated tag: leave it in place instead of looping forever.
            break
        text = text[:start] + text[end + 1:]
        start = text.find(prefix)
    return text


def remove_tags_from_string(text, 
                            toRemove=['<p>', '</p>', '<b>', '</b>', '<br>', '<h3>', '</h3>', '<center>', '</center>', 
            '</table>', '</tbody>','</tr>', '</th>', '</td>'], 
                            removeImages=True, 
                            additionallyTags=['<table', '<tbody', '<tr', '<th', '<td']):
    """
    Lower-case `text` and strip a fixed set of HTML tags from it.

    Args:
        text (string): Markup to clean.
        toRemove (list of string): Exact tag strings deleted wherever they occur.
        removeImages (bool): When True, every '<img ...>' tag is deleted,
            for example: <img src="...">.
        additionallyTags (list of string): Tag prefixes; everything from a
            prefix up to and including the next '>' is deleted, which covers
            opening tags that carry attributes.
    Returns:
        string: The lower-cased text without the listed tags. A tag that is
        never closed by '>' is left in the text.
    """
    text = text.lower()
    for word in toRemove:
        text = text.replace(word, '')

    # remove images
    if removeImages:
        text = _remove_tags_starting_with(text, '<img')

    # remove tables tags
    for tag in additionallyTags:
        text = _remove_tags_starting_with(text, tag)

    return text

In [12]:
# Strip the HTML tags from the introduction; note that `i` is overwritten in place.
i = remove_tags_from_string(i)
i

'\n\nfor people with severe disabilities their eyes may be the only means for\n communication. even though eye typing has been studied for many years, \nthere is little research on design issues [2]. our goal was to study how\n feedback could facilitate the tedious [1] eye typing task and make \ngaze-based computer-aided communication more practical for those who \nneed it.\n\n\n\nfeedback modes\n\n\n\nduring eye typing the user first focuses on the desired letter. to \nselect the focused letter she continues to fixate on it thus using dwell\n time as an activation command. feedback is given for focus and \nselection. the following four feedback modes were tested. \n\n\nvisual only. in the visual only mode, the key is highlighted on \nfocus (the 2nd key on left in figure 1) and its symbol shrinks as dwell \ntime elapses. the shrinking draws the attention in, helping the user \nfocus on the center of the key. on selection the letter turns red and \nthe key goes down. \n\n\n\n\nfigure 1.

In [13]:
# Normalize the tag-free introduction: digits and punctuation removed, whitespace collapsed.
get_preprocessed_text(i)

' for people with severe disabilities their eyes may be the only means for communication even though eye typing has been studied for many years there is little research on design issues our goal was to study how feedback could facilitate the tedious eye typing task and make gazebased computeraided communication more practical for those who need it feedback modes during eye typing the user first focuses on the desired letter to select the focused letter she continues to fixate on it thus using dwell time as an activation command feedback is given for focus and selection the following four feedback modes were tested visual only in the visual only mode the key is highlighted on focus the nd key on left in figure and its symbol shrinks as dwell time elapses the shrinking draws the attention in helping the user focus on the center of the key on selection the letter turns red and the key goes down figure animation for visual only feedback mode speech only the speech only mode did not use vis

# The zone index

In [16]:
import pandas as pd

In [14]:
def create_zone_index(files):
    df = None
    zones = ['title','abstract','intro']
    for i in range(len(files)):
        try:
            f = open(files[i], 'r', encoding=None)
            content = f.read()
        except:
            f = open(files[i], 'r', encoding="utf8")
            content = f.read()

        f.close()

        # title
        text = get_preprocessed_text(get_title(content))
        words = text.split()

        tfs = dict.fromkeys(set(words))
        for w in tfs.keys():
            tfs[w] = words.count(w)

        if df is not None:
            df = pd.concat([df, pd.DataFrame.from_dict(tfs, orient='index', columns=['d'+str(i)+'-title'])],  sort=False, axis=1)
        else:
            df = pd.DataFrame.from_dict(tfs, orient='index', columns=['d'+str(i)+'-'+zones[0]])

        # abstract
        text = get_preprocessed_text(get_abstract(content))
        words = text.split()

        tfs = dict.fromkeys(set(words))
        for w in tfs.keys():
            tfs[w] = words.count(w)

        df = pd.concat([df, pd.DataFrame.from_dict(tfs, orient='index', columns=['d'+str(i)+'-abstract'])],  sort=False, axis=1)

        # introduction
        text = get_preprocessed_text(remove_tags_from_string(get_introduction(content)))
        words = text.split()

        tfs = dict.fromkeys(set(words))
        for w in tfs.keys():
            tfs[w] = words.count(w)

        df = pd.concat([df, pd.DataFrame.from_dict(tfs, orient='index', columns=['d'+str(i)+'-intro'])],  sort=False, axis=1)

    df.fillna(0, inplace=True)
    
    return df

In [17]:
# Build the zone index for all documents, then sort its rows by term.
df = create_zone_index(files)
df = df.sort_index()
df.head()

Unnamed: 0,d0-title,d0-abstract,d0-intro,d1-title,d1-abstract,d1-intro,d2-title,d2-abstract,d2-intro,d3-title,d3-abstract,d3-intro,d4-title,d4-abstract,d4-intro
a,0.0,2.0,2.0,0.0,1.0,18.0,0.0,2.0,7.0,1.0,4.0,7.0,0.0,6.0,9.0
ability,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
able,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
about,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
accelerometer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


## Find the document/documents with term ’eye’ in the title and ’performance’ in the abstract and ’methods’ in introduction.

As one query: 

In [19]:
def find_documents(zoneIndex, andQueryValues, andQueryZones = ['title','abstract','intro']):
    """
    Find the documents that satisfy a conjunctive query with one term per zone.

    Args:
        zoneIndex (pandas.DataFrame): Zone index whose columns are named
            'd<i>-<zone>' and whose rows are terms.
        andQueryValues (list of string): andQueryValues[j] is the term that
            must occur in zone andQueryZones[j].
        andQueryZones (list of string): Zones to check, in the same order as
            the query terms.
    Returns:
        list of string: 'document<n>' labels (numbered from 1) of the
        documents in which every zone contains its term. A term that does not
        occur anywhere in the index matches no document.
    """
    doc_num = len(zoneIndex.columns)//3 # zoneIndex has always 3 zones for each document
    result = []

    for i in range(doc_num):
        # Series.get returns 0 for a term missing from the index, so an
        # unknown term counts as "not found" instead of raising KeyError.
        contains_all = all(
            zoneIndex['d'+str(i)+'-'+zone].get(andQueryValues[j], 0) != 0
            for j, zone in enumerate(andQueryZones)
        )
        if contains_all:
            result.append('document'+str(i+1))

    return result

In [20]:
# The query term for the introduction is 'methods', matching the task statement.
print('In printed result documents are indexed from 1.')
print('Result: ', find_documents(df, ['eye', 'performance', 'methods']))

In printed result documents are indexed from 1.
Result:  ['document2']


As three separate queries:

## Find the document/documents with term ’eye’ in the title

In [21]:
def find_documents(zoneIndex, zone, query):
    """
    Find the documents whose given zone contains the query term.

    NOTE: this definition replaces the multi-zone find_documents defined
    earlier in the notebook, which takes different arguments.

    Args:
        zoneIndex (pandas.DataFrame): Zone index whose columns are named
            'd<i>-<zone>' and whose rows are terms.
        zone (string): Zone to search: 'title', 'abstract' or 'intro'.
        query (string): Term to look for.
    Returns:
        list of string: 'document<n>' labels (numbered from 1) of the
        documents whose zone has a positive frequency for the term. A term
        that does not occur anywhere in the index matches no document.
    """
    result = []
    doc_num = len(zoneIndex.columns)//3 # zoneIndex has always 3 zones for each document

    for i in range(doc_num):
        # Read from the index that was passed in, not from a notebook global.
        column = zoneIndex['d'+str(i)+'-'+zone]
        # A term missing from the index has frequency 0 rather than raising KeyError.
        tf = column.get(query, 0)

        if tf > 0:
            result.append('document'+str(i+1))

    return result

In [22]:
# Documents whose title zone contains 'eye' (labels are numbered from 1).
find_documents(df, 'title', 'eye')

['document1', 'document2', 'document5']

## Find the document/documents with term ’performance’ in the abstract

In [23]:
# Documents whose abstract zone contains 'performance' (labels are numbered from 1).
find_documents(df, 'abstract', 'performance')

['document2', 'document3', 'document4', 'document5']

##  Find the document/documents with term ’methods’ in the introduction

In [24]:
# Documents whose introduction zone contains 'methods' (labels are numbered from 1).
find_documents(df, 'intro', 'methods')

['document2', 'document4']

#  The weighted zone score

In [25]:
def calculate_weighted_zone_score(andQueryValues, docName, weights = [0.45,0.30,0.25], zoneIndex = None):
    """
    Compute the weighted zone score of one document for a conjunctive query.

    Each zone (title, abstract, intro) is checked separately: a zone
    contributes its weight to the score only when it contains every term
    from andQueryValues.

    Args:
        andQueryValues (list of string): Terms that must all occur in a zone.
        docName (string): Column prefix of the document, e.g. 'd0'.
        weights (list of float): Weights of the title, abstract and intro
            zones, in that order.
        zoneIndex (pandas.DataFrame): Zone index to score against; when None,
            the notebook-level `df` is used.
    Returns:
        float: Sum of the weights of the zones that contain all query terms.
        A term that does not occur anywhere in the index is in no zone.
    """
    index = df if zoneIndex is None else zoneIndex
    zones = ['title','abstract','intro']
    score = 0

    for i in range(len(zones)):
        column = index[docName+'-'+zones[i]]

        # check whether the zone fulfills all conditions; Series.get returns 0
        # for a term missing from the index instead of raising KeyError
        contains_all = all(column.get(term, 0) != 0 for term in andQueryValues)

        # after each zone update score (True counts as 1, False as 0)
        score += weights[i] * contains_all

    return score

## The weighted zone score of each document for the boolean query ’eye’ AND ’tracking’

In [26]:
# Terms that must all occur in a zone for that zone's weight to be counted.
andQueryValues = ['eye', 'tracking']

# Score the columns d0..d4; the printed document number is 1-based.
for i in range(5):
    print('score for document',i+1,':', calculate_weighted_zone_score(andQueryValues,'d'+str(i)))

score for document 1 : 0.0
score for document 2 : 0.25
score for document 3 : 0.0
score for document 4 : 0.25
score for document 5 : 1.0
