In [70]:
# read brand names into a list
# Each line of brands.txt becomes one lower-cased entry (used for
# case-insensitive matching). Only the line terminator is removed, so a final
# line that lacks a trailing newline keeps its last character, and the file
# handle is closed as soon as the list is built.
with open('brands.txt', 'r') as brand_file:
    brands = [line.rstrip('\n').lower() for line in brand_file]

In [71]:
import glob
import os

def extract_candidates(text, n=8, min_words=2, max_words=7):
    """Enumerate every run of consecutive words in *text* as a candidate.

    The text is split on whitespace and every window of ``min_words`` to
    ``max_words`` consecutive words is emitted.

    Args:
        text: raw document text.
        n: number of context words kept on each side of the candidate.
        min_words: smallest window size (inclusive).
        max_words: largest window size (inclusive).

    Returns:
        A list of ``[candidate_string, candidate_up, candidate_down,
        candidate_follow, start_pos, end_pos]`` entries where
        ``candidate_up`` / ``candidate_down`` are the up to ``n`` words before
        / after the window joined by single spaces, ``candidate_follow`` is
        the last character of the candidate string, and ``start_pos`` /
        ``end_pos`` are character offsets measured as if the words were joined
        by single spaces.
    """
    words = text.split()
    found = []

    # Offset of words[start] in the single-space-joined text.
    len_of_text_read = 0

    for start, first_word in enumerate(words):
        len_of_target_read = len(first_word)

        for length in range(min_words, max_words + 1):
            stop = start + length
            if stop > len(words):
                # Longer windows would run past the end of the text as well.
                break

            # Grow the span by one separator plus the newly included word.
            len_of_target_read += 1 + len(words[stop - 1])
            candidate_string = ' '.join(words[start:stop])

            # Clamp at 0: a negative slice start would wrap around to the end
            # of the list and yield an empty or wrong upstream context for the
            # first n words of the text.
            candidate_up = ' '.join(words[max(0, start - n):start])
            candidate_down = ' '.join(words[stop:stop + n])
            candidate_follow = candidate_string[-1:]

            start_pos = len_of_text_read
            end_pos = start_pos + len_of_target_read

            found.append([candidate_string, candidate_up, candidate_down,
                          candidate_follow, start_pos, end_pos])

        len_of_text_read += len(first_word) + 1

    return found


file_path = 'selected_texts/unlabelled_test_texts/'
files = glob.glob(file_path + '*.txt')

n = 8
candidates = []

# Collect the candidates of every unlabelled text file into one flat list.
for path in files:
    with open(path, 'r') as handle:
        candidates.extend(extract_candidates(handle.read(), n))
            
# Pre-filter the raw candidates: keep only those that look like a product
# name, i.e. at least half of the words are capitalized AND the first word is
# a known brand name.
test_examples = []

for candidate in candidates:
    tokens = candidate[0].split()

    # criterion 1: is at least half of the words capitalized?
    n_capitalized = sum(1 for token in tokens if token != '' and token[0].isupper())
    capitalized = n_capitalized / len(tokens) >= 0.5

    # criterion 2: does the string start with a brand name (case-insensitive)?
    leading = tokens[0].lower()
    start_with_brand = any(leading == brand.lower() for brand in brands)

    if capitalized and start_with_brand:
        test_examples.append(candidate)

In [73]:
import csv
import re

# Patterns used by the feature extractors, compiled once.
HYPHENATED = re.compile(r'\w+-\w+')
NUMBER = re.compile(r'\d+')
# NOTE(review): this pattern matches 2000-2007 and 2010-2017 but not
# 2008/2009. It is kept unchanged so the features stay comparable with any
# feature table produced by the same rule elsewhere - confirm before widening.
YEAR = re.compile('20[0-1][0-7]')
CAPITALIZED_PART = re.compile('[A-Z][a-z]+')
DEVICE_KEYWORDS = ('laptop', 'desktop', 'tablet', 'notebook')


def flag(condition):
    """Encode a boolean as the '1' / '0' string stored in the feature table."""
    return '1' if condition else '0'


def build_feature_vector(example, brand_names):
    """Compute the feature vector of one candidate product name.

    Args:
        example: ``[candidate_string, text_upstream, text_downstream,
            last_character, start_pos, end_pos]``.
        brand_names: container of lower-cased brand names supporting ``in``.

    Returns:
        A list laid out like the CSV header: the product name, its start and
        end positions, six numeric features, then the binary features encoded
        as the strings '1' / '0'.

    The candidate string is expected to contain at least one word.
    """
    product_name = example[0]
    inst_up = example[1].split(' ')
    inst_down = example[2].split(' ')
    inst_follow = example[3]

    words = product_name.split()
    last_word = words[-1]
    near_down = inst_down[:2]

    # product name first, then its start and end positions
    vector = [product_name, example[4], example[5]]

    # feature 1(a): number of words
    # feature 1(b): total number of characters in the string (excluding blank space)
    # feature 1(c): average word length
    total_len = sum(len(word) for word in words)
    vector += [len(words), total_len, total_len / len(words)]

    # feature 2: What is the fraction of capitalized words (among purely
    #            alphabetic words)?
    # feature 3: How many words are not purely alphabetic?
    alpha_words = [word for word in words if word.isalpha()]
    upper = sum(1 for word in alpha_words if word[0].isupper())
    total = len(alpha_words)
    # Without any alphabetic word the fraction is undefined; report 0.0
    # instead of dividing by zero.
    vector.append(upper / total if total else 0.0)
    vector.append(len(words) - total)

    # feature 4: How many digits are in the string?
    vector.append(sum(1 for word in words for char in word if char.isdigit()))

    # feature 5: Is there a word whose characters are all capitalized
    #            (excluding the leading word)?
    vector.append(flag(any(word.isalpha() and word.isupper() for word in words[1:])))

    # feature 6: Does the string start with a brand name, and is that the only
    #            brand name in the string?
    starts_with_brand = words[0].lower() in brand_names
    brand_later = any(word.lower() in brand_names for word in words[1:])
    vector.append(flag(starts_with_brand and not brand_later))

    # feature 7(a): Does the string contain '(' and ')'?
    # feature 7(b): Does ')' appear in the last word as the last character?
    has_open = any('(' in word for word in words)
    has_close = any(')' in word for word in words)
    vector.append(flag(has_open and has_close))
    vector.append(flag(last_word.endswith(')')))

    # feature 8(a): Does the last word contain alphanumeric runs delimited by '-'?
    # feature 8(b): Does the second word contain alphanumeric runs delimited by '-'?
    #               (reported as '0' when there is no second word)
    # feature 8(c): Does the last word contain a number?
    vector.append(flag(HYPHENATED.search(last_word)))
    vector.append(flag(len(words) > 1 and HYPHENATED.search(words[1])))
    vector.append(flag(NUMBER.search(last_word)))

    # feature 9: Does any word contain a year?
    vector.append(flag(any(YEAR.search(word) for word in words)))

    # feature 10: Does the string contain information on "inches"?
    # ('inches' always contains 'inch', so one substring test covers both.)
    vector.append(flag(any('inch' in word.lower() or '"' in word for word in words)))

    # feature 11: Does the string contain information on "core"?
    vector.append(flag(any('core' in word.lower() for word in words)))

    # feature 12: Is there a word that starts with (lower-case) 'i'?
    vector.append(flag(any(word.startswith('i') for word in words)))

    # feature 13: Are there two capitalized words concatenated into one word?
    vector.append(flag(any(len(CAPITALIZED_PART.findall(word)) > 1 for word in words)))

    # feature 14: Is there a keyword anywhere in the downstream context?
    vector.append(flag(any(keyword in word.lower()
                           for word in inst_down
                           for keyword in DEVICE_KEYWORDS)))

    # feature 15: Does 'is' or 'was' occur within the next two words?
    vector.append(flag(any(word.lower() in ('is', 'was') for word in near_down)))

    # feature 16: Does '(' occur within the next two words?
    vector.append(flag(any('(' in word for word in near_down)))

    # feature 17: Does '$' occur within the next two words?
    vector.append(flag(any('$' in word for word in near_down)))

    # feature 18: Does 'the' occur within the previous two words?
    vector.append(flag(any(word.lower() == 'the' for word in inst_up[-2:])))

    # feature 19: Is the last character of the string ',' or '.'?
    vector.append(flag(inst_follow in (',', '.')))

    return vector


# header of the table
# (raw strings: '\w' and '\d' are not valid escape sequences in ordinary
# string literals; the resulting column names are unchanged)
fields = ['product_name', 'start', 'end', 'num_of_words', 'total_str_len', 'avg_word_len', 'fraction_capitalized', 'num_of_non-Eng_words', 'num_of_digits', 'word(s)_with_uppercase_letters', 'starts_with_brand', 'paranthesis', ')_in_last_word',
          r'\w-\w_end', r'\w-\w_second', r'\d_end', 'contains_year', 'contains_inch_info', 'contains_core_info', 'starts_with_i', 'concatenated_words', 'keywords_downstream',
          'is_was_downstream', '(_downstream', '$_downstream', 'the_upstream', ',_or_._after_string']

fvs = []

# Set membership makes the per-word brand checks O(1).
brand_lookup = set(brands)

# newline='' is what the csv module expects for files it writes; the `with`
# block guarantees the file is closed even if a vector fails to build.
with open('test_feat_vec.csv', 'w', newline='') as csv_f:
    writer = csv.writer(csv_f, delimiter=',')
    writer.writerow(fields)

    for example in test_examples:
        vector = build_feature_vector(example, brand_lookup)
        writer.writerow(vector)
        fvs.append(vector)

In [74]:
import glob, re

# Collect the hand-labelled product names (marked up as <p>name</>) together
# with their character positions expressed relative to the text with all
# markup removed, then dump them to test_product_list.txt.
file_path = 'selected_texts/labelled_test_texts/'
files = glob.glob(file_path + '*.txt')

name_list = []

for labelled_path in files:
    with open(labelled_path, 'r') as f:
        text = f.read()

    # Number of markup characters to subtract for the current match: the
    # opening '<p>' (3) of this name plus 6 for every complete '<p>...</>'
    # pair seen before it in this file.
    markup_chars = 3

    # identify each product name using delimiter <p> and </>
    for match in re.finditer('<p>(.+?)</>', text):
        name_start = match.start() + 3    # first character after '<p>'
        name_end = match.end() - 3        # index of the closing '</>'

        # drop a single trailing blank inside the markup
        if text[name_end - 1] == ' ':
            name_end -= 1

        span = [name_start - markup_chars, name_end - markup_chars]
        name_list.append([text[name_start:name_end], span])
        markup_chars += 6

with open('selected_texts/test_product_list.txt', 'w') as f:
    for name, (span_start, span_end) in name_list:
        print(name, span_start, span_end, file=f)