In [4]:
import csv
import glob
import os
import random
import re
import shutil

In [1]:
# One-off split of the labelled texts into a training set and a test set.
# DO NOT EXECUTE AGAIN!
# Re-running after the source folder has changed, or on top of a split made by
# an earlier unseeded run, copies a different sample into the same destination
# folders, so the training and test sets may end up overlapping.

file_dir = 'selected_texts/labelled_texts/'
# glob returns files in arbitrary order; sort so the seeded sample is repeatable
files = sorted(glob.glob(file_dir + '*.txt'))

train = 250      # number of training texts
test = 125       # number of test texts
split_seed = 0   # fixed seed so a fresh run reproduces the same split

# private generator: the global `random` state used elsewhere stays untouched
rng = random.Random(split_seed)

train_files = rng.sample(files, train)

# keep the sorted order (a set difference would scramble it and defeat the seed)
train_set = set(train_files)
remaining_files = [path for path in files if path not in train_set]
test_files = rng.sample(remaining_files, test)

train_des_dir = 'selected_texts/labelled_training_texts'
test_des_dir = 'selected_texts/labelled_test_texts'


def copy_files(paths, des_dir):
    """Copy every file in `paths` into `des_dir`, keeping the file names.

    `des_dir` is created when it does not exist yet.
    """
    os.makedirs(des_dir, exist_ok=True)
    for path in paths:
        # basename instead of a hard-coded prefix length, so the code keeps
        # working if `file_dir` is ever renamed
        shutil.copy(path, os.path.join(des_dir, os.path.basename(path)))


copy_files(train_files, train_des_dir)
copy_files(test_files, test_des_dir)

In [6]:
file_dir = 'selected_texts/labelled_training_texts/'
files = glob.glob(file_dir + '*.txt')

n = 8      # number of words ahead/after the string (user specified)


def extract_examples(text, n):
    """Collect every product name marked up as <p>...</> in `text`.

    Parameters
    ----------
    text : str
        Full content of one labelled document.
    n : int
        Maximum number of context words kept on each side of the name.

    Returns
    -------
    list of [str, list of str, list of str, str]
        One entry per marked-up name:
        [product name, up to n words upstream, up to n words downstream,
         character immediately after the closing tag ('' at end of text)].
    """
    found = []

    # identify each product name using delimiter <p> and </>
    for inst in re.finditer('<p>(.+?)</>', text):
        start, end = inst.span()
        product_name = inst.group(1)    # text between the two tags

        # One character is skipped on each side of the markup (presumably the
        # separating whitespace). The upstream bound is clamped at 0: for a
        # tag at the very start of the text, `start - 1` would be -1 and
        # text[:-1] would return almost the whole document as "upstream".
        up = text[: max(start - 1, 0)].split()
        down = text[end + 1 :].split()

        # character immediately after the string ('' when the tag ends the text)
        inst_follow = text[end] if end < len(text) else ''

        found.append([product_name, up[-n :], down[: n], inst_follow])

    return found


# positive examples stored as
# [string of product name, list of n words upstream, list of n words downstream, character immediately after the string]
examples = []

# inspect each text
for file in files:
    with open(file, 'r') as f:
        examples.extend(extract_examples(f.read(), n))

In [7]:
# number of marked-up product names collected from the training texts
len(examples)

2439

In [8]:
# read brand names into a list (each line of the file is treated as one brand, lower-cased)
with open('brands.txt', 'r') as brand_file:
    # rstrip('\n') rather than [:-1]: if the last line has no trailing newline,
    # slicing would cut off the final letter of that brand
    brands = [line.rstrip('\n').lower() for line in brand_file]

In [24]:
# generate feature vectors from positive examples

# header of the table
# (raw strings for the names containing a backslash: same text, no invalid-escape warning)
fields = ['product_name', 'num_of_words', 'total_str_len', 'avg_word_len', 'fraction_capitalized', 'num_of_non-Eng_words', 'num_of_digits', 'word(s)_with_uppercase_letters', 'starts_with_brand', 'paranthesis', ')_in_last_word',
          r'\w-\w_end', r'\w-\w_second', r'\d_end', 'contains_year', 'contains_inch_info', 'contains_core_info', 'starts_with_i', 'concatenated_words', 'keywords_downstream',
          'is_was_downstream', '(_downstream', '$_downstream', 'the_upstream', ',_or_._after_string', r'\w-\w_downstream', 'label']


def flag(condition):
    """Encode a boolean feature as the string '1' (true) or '0' (false)."""
    return '1' if condition else '0'


def build_feature_vector(product_name, inst_up, inst_down, inst_follow, brand_names, label='1'):
    """Build one row of the feature table for a candidate product name.

    Parameters
    ----------
    product_name : str
        The candidate string.
    inst_up : list of str
        Words preceding the string (closest word last).
    inst_down : list of str
        Words following the string (closest word first).
    inst_follow : str
        Character immediately after the string, or '' if there is none.
    brand_names : container of str
        Lower-cased brand names; only membership tests are performed.
    label : str
        Value written to the final 'label' column.

    Returns
    -------
    list
        Values in the same order as `fields`: the product name, the numeric
        features 1-4, the '1'/'0' flags for features 5-20, then the label.
    """
    words = product_name.split()
    lowered = [word.lower() for word in words]
    last_word = words[-1] if words else ''
    # single-word names have no second word; treat it as empty instead of
    # raising IndexError
    second_word = words[1] if len(words) > 1 else ''

    # add product name at the front
    vector = [product_name]

    # feature 1(a): number of words
    # feature 1(b): total number of characters in the string (excluding blank space)
    # feature 1(c): average word length (0.0 for a string without words)
    total_len = sum(len(word) for word in words)
    vector.append(len(words))
    vector.append(total_len)
    vector.append(total_len / len(words) if words else 0.0)

    # feature 2: What is the fraction of capitalized words (among purely alphabetic words)?
    # feature 3: How many words are not English words (i.e. not purely alphabetic)?
    alpha_words = [word for word in words if word.isalpha()]
    capitalized = sum(1 for word in alpha_words if word[0].isupper())
    # names such as "XPS-13" contain no alphabetic word; report 0.0 instead of
    # dividing by zero
    vector.append(capitalized / len(alpha_words) if alpha_words else 0.0)
    vector.append(len(words) - len(alpha_words))

    # feature 4: How many digits are in the string?
    vector.append(sum(1 for char in product_name if char.isdigit()))

    # feature 5: Is there a word whose characters are all uppercase (excluding the leading word)?
    vector.append(flag(any(word.isalpha() and word.isupper() for word in words[1:])))

    # feature 6: Does the string start with a brand name, and the brand name only appears once?
    starts_with_brand = bool(lowered) and lowered[0] in brand_names
    brand_repeated = any(word in brand_names for word in lowered[1:])
    vector.append(flag(starts_with_brand and not brand_repeated))

    # feature 7(a): Does the string contain '(' and ')'?
    # feature 7(b): Does ')' appear in the last word as the last character?
    vector.append(flag('(' in product_name and ')' in product_name))
    vector.append(flag(last_word.endswith(')')))

    # feature 8(a): Does the last word contain a string of alphanumeric characters delimited by '-' ?
    # feature 8(b): Does the second word contain a string of alphanumeric characters delimited by '-'?
    # feature 8(c): Does the last word contain a number?
    vector.append(flag(re.search(r'\w+-\w+', last_word)))
    vector.append(flag(re.search(r'\w+-\w+', second_word)))
    vector.append(flag(re.search(r'\d+', last_word)))

    # feature 9: Does any word contain a year (2000-2017)?
    # the previous pattern '20[0-1][0-7]' skipped 2008 and 2009
    vector.append(flag(any(re.search(r'20(?:0[0-9]|1[0-7])', word) for word in words)))

    # feature 10: Does the string contain information on "inches"?
    # ('inch' also covers 'inches'; '"' is the inch symbol)
    vector.append(flag(any('inch' in word or '"' in word for word in lowered)))

    # feature 11: Does the string contain information on "core"?
    vector.append(flag(any('core' in word for word in lowered)))

    # feature 12: Is there a word that starts with (lowercase) 'i'?
    vector.append(flag(any(word.startswith('i') for word in words)))

    # feature 13: Are there two capitalized words concatenated into one word?
    vector.append(flag(any(len(re.findall('[A-Z][a-z]+', word)) > 1 for word in words)))

    # feature 14: Does a keyword occur in any of the downstream words?
    keywords = ('laptop', 'desktop', 'tablet', 'notebook')
    vector.append(flag(any(keyword in word.lower() for word in inst_down for keyword in keywords)))

    # feature 15: Is one of the first two downstream words 'is', 'was', 'has' or 'and'?
    vector.append(flag(any(word.lower() in ('is', 'was', 'has', 'and') for word in inst_down[:2])))

    # feature 16: Does '(' occur in one of the first two downstream words?
    vector.append(flag(any('(' in word for word in inst_down[:2])))

    # feature 17: Does '$' occur immediately downstream?
    vector.append(flag(any('$' in word for word in inst_down[:1])))

    # feature 18: Is one of the last two upstream words 'the'?
    vector.append(flag(any(word.lower() == 'the' for word in inst_up[-2:])))

    # feature 19: Does ',' or '.' occur immediately after the string?
    vector.append(flag(inst_follow in (',', '.')))

    # feature 20: Does a word containing alphanumerical characters delimited by '-' occur right after the string?
    vector.append(flag(any(re.search(r'\w+-\w+', word) for word in inst_down[:1])))

    vector.append(label)
    return vector


brand_set = set(brands)   # constant-time membership tests
fvs = []

# newline='' is what the csv module expects; the with-block closes the file
# even if a row fails
with open('pos_feat_vec.csv', 'w', newline='') as csv_f:
    writer = csv.writer(csv_f, delimiter = ',')
    writer.writerow(fields)

    for product_name, inst_up, inst_down, inst_follow in examples:
        vector = build_feature_vector(product_name, inst_up, inst_down, inst_follow, brand_set)
        writer.writerow(vector)
        fvs.append(vector)