*importing libraries*

In [1]:
import pandas as pd
from pandas import DataFrame
import nltk
from nltk.corpus import wordnet

*getting lists of words*

In [2]:
# Load the comma-separated list of material names.
# The context manager closes the file even if reading fails.
with open('materials.txt', 'r') as file:
    materials = file.read().split(',')
materials[:8]

['adobe', 'brick', 'matting', 'sod', 'stone', 'timber', 'wattle', 'aluminum']

In [3]:
# Load the list of shape words (one per line).
# The context manager closes the file even if reading fails.
with open('shapes.txt', 'r') as file:
    shapes = file.read().split('\n')
shapes[:7]

['aerodynamic', 'angular', 'flat', 'asymmetrical', 'bent', 'bulbous', 'chunky']

In [4]:
# Load the list of color words (one per line).
# The context manager closes the file even if reading fails.
with open('colors.txt', 'r') as file:
    colors = file.read().split('\n')
colors[:9]

['black', 'blue', 'cyan', 'gray', 'green', 'grey', 'magenta', 'red', 'white']

In [5]:
# Load the list of size words (one per line).
# The context manager closes the file even if reading fails.
with open('sizes.txt', 'r') as file:
    sizes = file.read().split('\n')
sizes[:8]

['abundant', 'jumbo', 'puny', 'big', 'boned', 'large', 'scrawny', 'chubby']

*open the data* 

In [6]:
# Item catalogue: one row per part, with its free-text description.
df = pd.read_csv("item_mapping.csv")

# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0.1,Unnamed: 0,mbta_part_number,description,mfg,mfg_itm_id
0,0,00118006,"STEEL-5/16"" ROUND POLISHED DRILL ROD",HEATHCOTE,GRADE O-1
1,1,00118009,"DRILL ROD - 1/2"" ROUND POLISHED STEEL",GENERIC,GRADE O-1
2,2,00170003,"ROD-WELDING-CARBON STEEL 1/8"" AWS-A5.18-ER70S-2",WELDERS,OX WELD 7
3,3,00170004,"ROD,EXTRACT ALLOY 3/ 32 (5LB. PACKAGE)",X-ERGON,L004-50000
4,4,00170005,"ROD, EXTRACT ALLOY 1 /8 (5 LB. PACKAGE)",X-ERGON,L004-4-0000
5,5,00170006,WIRE WELDING - 1/8 X .660 NH TM AC/DC ELECTRODE,ALL WELD,#66XHD
6,6,00170009,"ROD-WELDING-MILD STEEL 1/8"" AWS-E6011",COLONY,"1/8"""
7,7,00170010,"ROD-WELDING, MILD STEEL 3/32"" AWS-A5.1-E6011",ALL WELD,#230 ACP 611
8,8,00170011,"ROD-WELDING-CARBON STEEL 3/32 "" AWS-A5.1 - E-7018",DA-VIN CO,MS1120
9,9,00170012,"ROD-WELDING,LOW HY-S TEEL 5/32"" AWS-A5.5 E8016-C1",DA-VIN CO,5/32-MS1140


# main distance function

## (provisional)

In [7]:
def words(s):
    """Split a description into tokens, treating ',' and '-' as whitespace.

    Returns a list of the non-empty whitespace-separated tokens of `s`.
    """
    spaced = s
    for separator in (',', '-'):
        spaced = spaced.replace(separator, ' ')
    return spaced.split()

In [8]:
def dist(i,j):
    """Return a word-overlap distance between the descriptions of rows i and j.

    Reads the module-level DataFrame `df` and tokenises with `words()`.

    The value is ``1 - 2 * common / (len(a_words) + len(b_words))`` where
    `common` counts words matched one-to-one between the two descriptions.
    Matching one-to-one (rather than counting every word of `a` that occurs
    anywhere in `b`) keeps the result symmetric and non-negative even when a
    description repeats a word.

    Returns 0 when both rows carry the same `mbta_part_number`, and 1.0 when
    neither description contains any word (nothing to compare).
    """
    # if same item, return 0
    if df["mbta_part_number"][i]==df["mbta_part_number"][j]:
        return 0

    a_words = words(df["description"][i])
    b_words = words(df["description"][j])

    total = len(a_words) + len(b_words)
    if total == 0:
        # Two empty descriptions: avoid dividing by zero.
        return 1.0

    # Each word of b can be matched by at most one word of a.
    unmatched = list(b_words)
    nb_common_words = 0
    for word in a_words:
        if word in unmatched:
            unmatched.remove(word)
            nb_common_words += 1

    return 1 - 2*nb_common_words/total
                


**tests**

In [9]:
def test(i,j):
    """Print the distance between rows i and j, then both descriptions."""
    distance = dist(i, j)
    print('\t', i, '-', j, "distance:", distance)
    descriptions = df["description"]
    for row in (i, j):
        print(descriptions[row])
    print()


In [10]:
# Spot-check the distance on a few pairs of rows.
for first, second in [(0, 1), (0, 2), (0, 3), (1, 2), (15, 16)]:
    test(first, second)

	 0 - 1 distance: 0.16666666666666663
STEEL-5/16" ROUND POLISHED DRILL ROD
DRILL ROD - 1/2" ROUND POLISHED STEEL

	 0 - 2 distance: 0.7333333333333334
STEEL-5/16" ROUND POLISHED DRILL ROD
ROD-WELDING-CARBON STEEL 1/8" AWS-A5.18-ER70S-2

	 0 - 3 distance: 0.8461538461538461
STEEL-5/16" ROUND POLISHED DRILL ROD
ROD,EXTRACT ALLOY 3/ 32 (5LB. PACKAGE)

	 1 - 2 distance: 0.7333333333333334
DRILL ROD - 1/2" ROUND POLISHED STEEL
ROD-WELDING-CARBON STEEL 1/8" AWS-A5.18-ER70S-2

	 15 - 16 distance: 0
ELECTRODE-STAIN STEEL 1/8" AWS-A5.15-EN1CRFE-5
ELECTRODE-STAIN STEEL 1/8" AWS-A5.15-EN1CRFE-5



# Deeper analysis

**defining functions**

In [11]:
def word_analyse(word):
    """Classify a single token of a description.

    Returns:
        'material', 'color', 'shape' or 'size' when WordNet knows the token
        and its lower-cased form is in the corresponding word list (checked
        in that order); otherwise the NLTK part-of-speech tag of the
        lower-cased token.
        When WordNet does not know the token: 'sub' if at least one of its
        '-' / ',' / space separated pieces is known to WordNet, else 'none'.
    """
    if not wordnet.synsets(word):
        pieces = word.replace('-', ' ').replace(',', ' ').split(' ')
        if any(wordnet.synsets(piece) for piece in pieces):
            return 'sub'
        return 'none'

    lowered = word.lower()
    categories = (
        ('material', materials),
        ('color', colors),
        ('shape', shapes),
        ('size', sizes),
    )
    for label, vocabulary in categories:
        if lowered in vocabulary:
            return label

    # Not in any word list: fall back to the POS tag of the first token.
    tokens = nltk.word_tokenize(lowered)
    return nltk.pos_tag(tokens)[0][1]

def printed_word_analyse(word):
    """Run word_analyse() on `word`, print a readable verdict, return the tag."""
    ana = word_analyse(word)
    # Only the two "not a word" results need a dedicated sentence; every
    # other tag is reported as "<word> is a <tag>".
    not_a_word = {
        'sub': ' is not a word, but has subword(s)',
        'none': ' is not a word, and has no subword',
    }
    print(word + not_a_word.get(ana, ' is a ' + ana))

    return ana

# Try the classifier on a handful of tokens taken from real descriptions.
for sample in (
    'STEEL',
    'POLISHED',
    'DRILL',
    'WELDING',
    'CARBON',
    '1/8"',
    'AWS-A5.18-ER70S-2',
    '2',
    'A5.18',
    'ELECTRODE',
    'STAIN',
    'ELECTRODE-STAIN',
):
    printed_word_analyse(sample)
print('\n\t...some tests')

STEEL is a material
POLISHED is a NNS
DRILL is a shape
WELDING is a VBG
CARBON is a NN
1/8" is not a word, and has no subword
AWS-A5.18-ER70S-2 is not a word, but has subword(s)
2 is a CD
A5.18 is not a word, and has no subword
ELECTRODE is a NN
STAIN is a NN
ELECTRODE-STAIN is not a word, but has subword(s)

	...some tests


In [12]:
def norm(ch):
    """Return `ch` without its leading and trailing space characters.

    Only the space character ' ' is removed (tabs and newlines are kept).
    Empty and all-space strings yield '' instead of raising IndexError.
    """
    return ch.strip(' ')


# Show norm() on strings padded in different ways.
for sample in ('     test    ', '    test test    ', 'test', '    test', 'test '):
    print('"'+norm(sample)+'"')

"test"
"test test"
"test"
"test"
"test"


**get_main: get the main word of the description, i.e. what it is**

*e.g.:* WIRE WELDING - 1/8 X .660 NH TM AC/DC ELECTRODE      =>     WIRE

In [13]:
def get_main(descr):
    """Pick the main word of a description, i.e. what the item is.

    The first segment of `descr` (cut at ',', '-' or 'WITH') is the initial
    candidate. If it is a multi-word segment, the last of its words tagged
    'NN' or 'NNS' replaces it. If the candidate is still not a usable noun
    (a material, color, shape, size, or not a word), fall back to the first
    space-separated token of `descr` when there is a single segment, and to
    the last token otherwise.
    """
    segments = descr.replace(',', '-').replace('WITH', '-').split('-')
    main = segments[0]

    if word_analyse(main) == 'sub':
        for candidate in norm(main).split(' '):
            if word_analyse(candidate) in ('NN', 'NNS'):
                main = candidate

    unusable = ('material', 'color', 'shape', 'size', 'none', 'sub')
    if word_analyse(main) not in unusable:
        return main

    tokens = descr.split(' ')
    return tokens[0] if len(segments) == 1 else tokens[-1]



def printed_get_main(descr):
    """Print `descr` and its main word (see get_main), then return that word."""
    print('\t' + descr)
    found = get_main(descr)
    print('main word: ' + found)
    return found

# Demonstrate main-word extraction on one catalogue row.
a = df["description"][5]
print("getting main word:\n")
printed_get_main(a)
print()

getting main word:

	WIRE WELDING - 1/8 X .660 NH TM AC/DC ELECTRODE
main word: WIRE



**=> this doesn't look too bad to me ;)**

In [14]:
def split_words(descr):
    """Split `descr` on spaces, then break up compound tokens.

    A token that word_analyse() reports as 'sub' is replaced in place by its
    '-' / ',' separated pieces. Empty strings produced by the splitting are
    kept in the returned list.
    """
    tokens = descr.split(' ')
    position = 0
    while position < len(tokens):
        current = tokens[position]
        if word_analyse(current) == 'sub':
            pieces = current.replace('-', ' ').replace(',', ' ').split(' ')
            # Substitute the compound token with its pieces.
            tokens[position:position + 1] = pieces
        position += 1
    return tokens


# Demonstrate token splitting on one catalogue row.
a = df["description"][15000]

print(a)
print("\n\tsplitting into words:\n")
pieces = split_words(a)
print(pieces)

LOCK LEVER, ISLAND HOUSING, VACUUM ISLAND FAREBOX

	splitting into words:

['LOCK', 'LEVER', '', 'ISLAND', 'HOUSING', '', 'VACUUM', 'ISLAND', 'FAREBOX']


In [15]:
def full_analyse(descr):
    """Group the words of a description by the category word_analyse() gives.

    Returns a dict with:
        'main': the main word chosen by get_main();
        'ref':  only when `descr` contains 'AWS' - everything from the first
                'AWS' to the end of the description;
        one list per category/tag returned by word_analyse(), holding the
        words found before the reference part.
    Empty tokens, lone '-' and '"' tokens, and the main word are left out.
    """
    main = get_main(descr)

    # Cut at the FIRST 'AWS' only, so the reference keeps its full tail even
    # if 'AWS' appears again later in the description.
    head, marker, tail = descr.partition('AWS')
    dic = {'main': main}
    if marker:
        dic['ref'] = marker + tail

    ignored = ('', '-', '"', main)
    for w in split_words(head):
        if w in ignored:
            continue
        dic.setdefault(word_analyse(w), []).append(w)
    return dic


def printed_full_analyse(descr):
    """Print `descr` and every entry of its full_analyse() dict; return the dict."""
    print('\t' + descr)
    analysis = full_analyse(descr)
    for key, value in analysis.items():
        print(key, ':', value)
    return analysis




# Demonstrate the full analysis on one catalogue row.
a = df["description"][5]

print("full analysing:\n")
printed_full_analyse(a)
print()

full analysing:

	WIRE WELDING - 1/8 X .660 NH TM AC/DC ELECTRODE
main : WIRE
NN : ['X', 'NH', 'TM', 'ELECTRODE']
none : ['1/8', '.660', 'AC/DC']
VBG : ['WELDING']



In [16]:
# some useful functions:
def has_nb(inputString):
    """Tell whether `inputString` contains at least one digit character."""
    for char in inputString:
        if char.isdigit():
            return True
    return False
def has_no_nb(inputString):
    """Tell whether `inputString` contains no digit character at all."""
    return all(not char.isdigit() for char in inputString)
def is_short(inputString):
    """Tell whether `inputString` has fewer than three characters."""
    return len(inputString) <= 2
def is_long(inputString):
    """Tell whether `inputString` has three characters or more."""
    return len(inputString) > 2

In [17]:
# another useful function
def put_together(dic, new_key, keys, delete=False, condition=lambda a:True):
    """Collect into dic[new_key] the words stored under several other keys.

    `dic` is modified in place; nothing is returned.

    Args:
        dic: dict whose values under `keys` are iterables of words.
        new_key: key receiving the collected list. Any value already stored
            under it is replaced by a fresh list.
        keys: keys to read from, in order; keys absent from `dic` are skipped.
        delete: when True, each key that was read is removed from `dic`.
        condition: predicate; only words for which it is truthy are kept.
    """
    dic[new_key] = []
    for k in keys:
        # Explicit membership test rather than a bare except around dic[k],
        # so unrelated errors are not silently swallowed.
        if k not in dic:
            continue
        dic[new_key].extend(w for w in dic[k] if condition(w))
        if delete:
            del dic[k]

In [18]:
def short_analyse(descr):
    """Condense the full_analyse() result of `descr` into a few buckets.

    Returns a dict with 'main' (and 'ref' when present) from full_analyse()
    plus three lists:
        'size':       size words found by full_analyse(), followed by the
                      'CD' and 'none' words that contain a digit;
        'adjectives': remaining categorised words of three characters or more;
        'units':      'none' words without digits, then the remaining
                      categorised words shorter than three characters.
    """
    dic=full_analyse(descr)

    # put_together() resets dic['size'], so set aside the size words that
    # full_analyse() already found; otherwise they would be discarded.
    size_words = dic.pop('size', [])
    put_together(dic, 'size', ['CD', 'none'], False, has_nb)
    dic['size'] = size_words + dic['size']
    dic.pop('CD', None)

    # Every remaining category feeds either 'adjectives' or 'units'.
    other_keys = [k for k in dic if k not in ('size', 'none', 'main', 'ref')]
    put_together(dic, 'adjectives', other_keys, False, is_long)

    put_together(dic, 'unit1', ['none'], True, has_no_nb)
    put_together(dic, 'unit2', other_keys, True, is_short)
    put_together(dic, 'units', ['unit1', 'unit2'], True)

    return dic

def printed_short_analyse(descr):
    """Print `descr` and every entry of its short_analyse() dict; return the dict."""
    print('\t' + descr)
    summary = short_analyse(descr)
    for key, value in summary.items():
        print(key, ':', value)
    return summary

# Compare the full and the condensed analysis on one catalogue row.
a = df["description"][1]

print("analysing: *full*\n")
printed_full_analyse(a)

print("\n\nanalysing: *short*\n")
printed_short_analyse(a)
print()

analysing: *full*

	DRILL ROD - 1/2" ROUND POLISHED STEEL
main : ROD
NNS : ['POLISHED']
none : ['1/2"']
material : ['STEEL']
shape : ['DRILL', 'ROUND']


analysing: *short*

	DRILL ROD - 1/2" ROUND POLISHED STEEL
units : []
size : ['1/2"']
main : ROD
adjectives : ['DRILL', 'ROUND', 'POLISHED', 'STEEL']



## tests!

In [19]:
# Run the condensed analysis on rows spread across the catalogue.
for i in (0, 1, 2, 4, 5, 10, 50, 100, 500, 1000, 5000, 10000, 15000, 20000, 25000, 30000):
    print('\t' + str(i))
    a = df["description"][i]
    printed_short_analyse(a)
    print()


	0
	STEEL-5/16" ROUND POLISHED DRILL ROD
units : []
size : ['5/16"']
main : ROD
adjectives : ['ROUND', 'DRILL', 'POLISHED', 'STEEL']

	1
	DRILL ROD - 1/2" ROUND POLISHED STEEL
units : []
size : ['1/2"']
main : ROD
adjectives : ['DRILL', 'ROUND', 'POLISHED', 'STEEL']

	2
	ROD-WELDING-CARBON STEEL 1/8" AWS-A5.18-ER70S-2
units : []
adjectives : ['CARBON', 'STEEL', 'WELDING']
main : ROD
ref : AWS-A5.18-ER70S-2
size : ['1/8"']

	4
	ROD, EXTRACT ALLOY 1 /8 (5 LB. PACKAGE)
units : ['LB.', 'PACKAGE)']
size : ['1', '/8', '(5']
main : ROD
adjectives : ['EXTRACT', 'ALLOY']

	5
	WIRE WELDING - 1/8 X .660 NH TM AC/DC ELECTRODE
units : ['AC/DC', 'X', 'NH', 'TM']
main : WIRE
adjectives : ['ELECTRODE', 'WELDING']
size : ['1/8', '.660']

	10
	ROD-WELDING, COPPER 1/8" AWS-A5.27-RCUZN-B
units : []
adjectives : ['WELDING', 'COPPER']
main : ROD
ref : AWS-A5.27-RCUZN-B
size : ['1/8"']

	50
	ROD-WELDING ALUMINUM 3/32" AWS-A5.3 E4043
units : []
adjectives : ['WELDING', 'ALUMINUM']
main : ROD
ref : AWS-A5.3 E4

### other stuff - unused for now

how to get definition & example of use of a word:

In [20]:
# Show the definition and usage examples of the first WordNet sense, if any.
syn = wordnet.synsets("STEEL")
if syn:
    print(syn[0].definition())
    print(syn[0].examples())
else:
    # No synset found for this word.
    print('none')

an alloy of iron with small amounts of carbon; widely used in construction; mechanical properties can be varied over a wide range
[]


Idea: use the definitions (if two words have close definitions, they are 'almost' the same word, so add 0.5 to the common-word count)?