In [101]:
# read brand names into a list (one lower-cased brand per non-blank line)
with open('brands.txt', 'r') as brand_file:
    # strip() instead of [:-1]: slicing would chop the last character of the
    # final brand when the file has no trailing newline, and would leave a
    # stray '\r' on files with Windows line endings.
    brands = [line.strip().lower() for line in brand_file if line.strip()]

In [102]:
# extracting possible negative examples from training documents
import os,sys
import pandas as pd
import re
import glob
import random

file_path = 'selected_texts/labelled_training_texts/'
files = glob.glob(file_path + '*.txt')

output1 = []
output2 = []


def extract_candidates(text, n=8, min_len=2, max_len=7):
    """Enumerate word n-grams found outside the ``<p>...</>`` labels of *text*.

    The text is cut into fragments at every ``<p>...</>`` match; inside each
    fragment every run of ``min_len`` to ``max_len`` consecutive
    whitespace-separated tokens becomes one candidate.

    Args:
        text: full document text, with product names wrapped in ``<p>...</>``.
        n: number of context words kept on each side (must be positive).
        min_len: smallest number of words in a candidate.
        max_len: largest number of words in a candidate.

    Returns:
        A list of ``[candidate_string, words_before, words_after, last_char]``
        lists, where the two context entries are space-joined words with the
        ``<p>`` / ``</>`` markers removed and ``last_char`` is the final
        character of ``candidate_string``.
    """
    # (offset in text, fragment) for each stretch of text between labels
    fragments = []
    cursor = 0
    for label in re.finditer(r'<p>(.+?)</>', text):
        fragments.append((cursor, text[cursor:label.start()]))
        cursor = label.end()
    fragments.append((cursor, text[cursor:]))

    def join_context(tokens):
        # context words may still carry label markers from neighbouring names
        return ' '.join(
            tok.replace('<p>', '').replace('</>', '') for tok in tokens
        )

    candidates = []
    for offset, fragment in fragments:
        # Keep each token's absolute position in `text`. Looking the joined
        # candidate up again with text.find() would return the FIRST
        # occurrence in the document (or -1 when the original whitespace was
        # not a single space), giving the wrong context.
        spans = [
            (tok.group(), offset + tok.start(), offset + tok.end())
            for tok in re.finditer(r'\S+', fragment)
        ]

        for start in range(len(spans)):
            for length in range(min_len, max_len + 1):
                stop = start + length
                if stop > len(spans):
                    break       # longer candidates cannot fit either

                candidate_string = ' '.join(
                    tok for tok, _, _ in spans[start:stop]
                )
                before = spans[start][1]
                after = spans[stop - 1][2]

                # maxsplit keeps the split work proportional to n instead of
                # tokenizing the whole document for every candidate
                words_before = text[:before].rsplit(None, n)[-n:]
                words_after = text[after:].split(None, n)[:n]

                candidates.append([
                    candidate_string,
                    join_context(words_before),
                    join_context(words_after),
                    candidate_string[-1:],
                ])

    return candidates


def classify_candidate(candidate_string, brand_names):
    """Decide which negative-example group a candidate string belongs to.

    Args:
        candidate_string: space-separated candidate words (non-empty).
        brand_names: container of lower-cased brand names.

    Returns:
        ``None`` when the candidate is rejected, ``1`` when it passes
        criteria 1 and 2 but not 3, ``2`` when it passes all three.
    """
    words = candidate_string.split()

    # criterion 1: are at least half of the words alphabetic and capitalized?
    capitalized = sum(1 for w in words if w.isalpha() and w[0].isupper())
    if capitalized / len(words) < 0.5:
        return None

    # criterion 2: does the string start with a brand name?
    if words[0].lower() not in brand_names:
        return None

    # criterion 3: does the last word contain a digit, or alphanumeric
    # characters delimited by '-'?
    last_word = words[-1]
    if re.search(r'\d', last_word) or re.search(r'\w+-\w+', last_word):
        return 2
    return 1


for file in files:
    with open(file, 'r') as f:
        text = f.read()

    # set of lower-cased brands for constant-time membership tests
    known_brands = {brand.lower() for brand in brands}

    for candidate in extract_candidates(text, n=8):
        group = classify_candidate(candidate[0], known_brands)
        if group == 1:
            output1.append(candidate)
        elif group == 2:
            output2.append(candidate)

print(len(output1), len(output2))

8547 2466


In [122]:
# merge both groups of negative candidates into a single pool
output = [*output1, *output2]
print(len(output))

11013


In [123]:
# draw t distinct negative examples at random from the pooled candidates
t = 2000
neg_examples = random.sample(output, k=t)

In [94]:
# partial product names as potential negative examples:
# every proper prefix (2 .. len-1 words) of each labelled product name
special_examples = []
n = 8

for file in files:
    with open(file, 'r') as f:
        text = f.read()

    for m in re.finditer(r'<p>(.+?)</>', text):
        # surrounding text with every label marker removed
        before_name = text[:m.start()].replace('<p>', '').replace('</>', '')
        after_name = text[m.end():].replace('<p>', '').replace('</>', '')

        words = m.group(1).split()          # words of the labelled name
        words_before = before_name.split()
        words_after = after_name.split()

        # the upstream context is the same for every prefix of this name
        candidate_up = ' '.join(words_before[-n:])

        for length in range(2, len(words)):
            candidate_string = ' '.join(words[:length])

            # Downstream context = rest of the name followed by the text
            # after it, capped at n words. Capping the concatenation avoids
            # the negative slice bound that `n - (len(words) - length)`
            # produces when more than n name words remain, which used to
            # pull in almost the whole remainder of the document.
            candidate_down = ' '.join(
                (words[length:] + words_after[:n])[:n]
            )
            candidate_follow = candidate_string[-1:]

            special_examples.append(
                [candidate_string, candidate_up, candidate_down, candidate_follow]
            )

In [95]:
# report how many partial-product-name examples were collected
print(len(special_examples))

3329


In [113]:
# draw t distinct examples at random from the partial-product-name pool
t = 2500
special_neg_examples = random.sample(special_examples, k=t)

In [124]:
# combine the sampled negatives from both sources into one table
total_neg_examples = [*neg_examples, *special_neg_examples]
print(len(total_neg_examples))

4000


In [125]:
# strip a trailing ',', '.' or ';' (or a possessive "'s") from each example
# string and record what was removed in the "follow" slot (index 3)
for row in total_neg_examples:
    name = row[0]
    tail = name[-1]

    if tail in (',', '.', ';'):
        row[3] = tail
        row[0] = name[:-1]
    elif name.endswith("'s"):
        row[3] = "'"
        row[0] = name[:-2]

# randomize the order of the negative examples in place
random.shuffle(total_neg_examples)

In [126]:
# generate feature vectors from negative examples
import csv

# header of the table
fields = ['product_name', 'num_of_words', 'total_str_len', 'avg_word_len', 'fraction_capitalized', 'num_of_non-Eng_words', 'num_of_digits', 'word(s)_with_uppercase_letters', 'starts_with_brand', 'paranthesis', ')_in_last_word',
          r'\w-\w_end', r'\w-\w_second', r'\d_end', 'contains_year', 'contains_inch_info', 'contains_core_info', 'starts_with_i', 'concatenated_words', 'keywords_downstream',
          'is_was_downstream', '(_downstream', '$_downstream', 'the_upstream', ',_or_._after_string', r'\w-\w_downstream', 'label']

# patterns compiled once instead of once per example
HYPHENATED_RE = re.compile(r'\w+-\w+')
DIGIT_RE = re.compile(r'\d')
# NOTE(review): this matches 2000-2007 and 2010-2017 but not 2008/2009.
# Left unchanged so the feature stays comparable with any vectors built
# elsewhere using the same pattern -- confirm whether the gap is intended.
YEAR_RE = re.compile(r'20[0-1][0-7]')
CAPITALIZED_RUN_RE = re.compile(r'[A-Z][a-z]+')


def flag(condition):
    """Encode a truthy/falsy value as the '1'/'0' string used in the table."""
    return '1' if condition else '0'


def build_feature_vector(example, brand_names):
    """Turn one negative example into a row matching ``fields``.

    Args:
        example: ``[string, words_before, words_after, follow_char]``; the
            string must contain at least one word.
        brand_names: container of lower-cased brand names.

    Returns:
        A list: the string itself, six numeric features, nineteen '1'/'0'
        flags and the label '0' (negative example).
    """
    product_name = example[0]
    inst_up = example[1].split(' ')
    inst_down = example[2].split(' ')
    inst_follow = example[3]

    words = product_name.split()
    lowered = [w.lower() for w in words]
    last_word = words[-1]

    # feature 1(a): number of words
    # feature 1(b): total number of characters in the string (excluding blank space)
    # feature 1(c): average word length
    total_len = sum(len(w) for w in words)

    # feature 2: fraction of purely alphabetic words that are capitalized
    # feature 3: number of words that are not purely alphabetic
    alpha_words = [w for w in words if w.isalpha()]
    capitalized = sum(1 for w in alpha_words if w[0].isupper())

    vector = [
        product_name,
        len(words),
        total_len,
        total_len / len(words),
        # a string such as "XPS-13 15.6" has no alphabetic word at all;
        # report 0.0 instead of dividing by zero
        capitalized / len(alpha_words) if alpha_words else 0.0,
        len(words) - len(alpha_words),
        # feature 4: number of digits in the string
        sum(1 for w in words for ch in w if ch.isdigit()),
    ]

    flags = [
        # feature 5: is any word after the first one entirely upper-case letters?
        any(w.isalpha() and w.isupper() for w in words[1:]),
        # feature 6: starts with a brand name and no later word is a brand name
        lowered[0] in brand_names
        and not any(w in brand_names for w in lowered[1:]),
        # feature 7(a): does the string contain both '(' and ')'?
        any('(' in w for w in words) and any(')' in w for w in words),
        # feature 7(b): is ')' the last character of the last word?
        last_word.endswith(')'),
        # feature 8(a): last word has alphanumeric characters delimited by '-'
        HYPHENATED_RE.search(last_word),
        # feature 8(b): same for the second word (absent in a one-word string)
        len(words) > 1 and HYPHENATED_RE.search(words[1]),
        # feature 8(c): last word contains a number
        DIGIT_RE.search(last_word),
        # feature 9: does any word contain a year?
        any(YEAR_RE.search(w) for w in words),
        # feature 10: does the string mention inches ('inch...' or '"')?
        any('inch' in w or '"' in w for w in lowered),
        # feature 11: does the string mention "core"?
        any('core' in w for w in lowered),
        # feature 12: is there a word that starts with a lower-case 'i'?
        any(w.startswith('i') for w in words),
        # feature 13: are two capitalized words concatenated into one word?
        any(len(CAPITALIZED_RUN_RE.findall(w)) > 1 for w in words),
        # feature 14: is a device keyword among the downstream words?
        any(
            keyword in w.lower()
            for w in inst_down
            for keyword in ('laptop', 'desktop', 'tablet', 'notebook')
        ),
        # feature 15: is one of the next two words 'is', 'was', 'has' or 'and'?
        any(w.lower() in ('is', 'was', 'has', 'and') for w in inst_down[:2]),
        # feature 16: does '(' occur in the next two words?
        any('(' in w for w in inst_down[:2]),
        # feature 17: does '$' occur in the next word?
        any('$' in w for w in inst_down[:1]),
        # feature 18: is 'the' one of the two preceding words?
        any(w.lower() == 'the' for w in inst_up[-2:]),
        # feature 19: does ',' or '.' occur immediately after the string?
        inst_follow in (',', '.'),
        # feature 20: does the next word have alphanumerics delimited by '-'?
        any(HYPHENATED_RE.search(w) for w in inst_down[:1]),
    ]
    vector.extend(flag(value) for value in flags)

    # label: every row written here is a negative example
    vector.append('0')
    return vector


brand_set = set(brands)     # constant-time brand lookups
fvs = []

# newline='' is what the csv module expects for files it writes to; the
# with-block closes the file even if an example raises
with open('neg_feat_vec.csv', 'w', newline='') as csv_f:
    writer = csv.writer(csv_f, delimiter=',')
    writer.writerow(fields)

    for example in total_neg_examples:
        vector = build_feature_vector(example, brand_set)
        writer.writerow(vector)
        fvs.append(vector)