In [8]:
import numpy as np
import pandas as pd
import re
pd.set_option('max_rows',200)

In [9]:
accred_df = pd.read_csv('Accreditation_Data.csv')
sample_df = pd.read_csv('sampleTestFile.csv')
test_df = pd.read_csv('testFile.csv')

dfs = [accred_df, sample_df, test_df]

print(len(accred_df))
print(len(sample_df))
print(len(test_df))



4383
100
392


Next step - treatment of URLs

In [10]:
for df in dfs:
    df['URL'] = df['URL'].str.lower()

def strip_url(url):
    stripped_url = url.replace('http://','').replace('https://','').replace('www.','')
    match = re.match("(.*)\.",stripped_url)
    
    if match:
        print(match.groups()[0])
        return match.groups()[0]
    else:
        print('nomatch:',stripped_url)
        return stripped_url
    
for df in dfs:
    df['stripped_url'] = df['URL'].apply(strip_url)

acc-careers
acc-careers
accschool.peedeeworld
acmt
aesa
aihs
akronbeautyschool
akronschools
alexacebeauty
alliedteched
americanbeautyacademy
americancollegeofhair
antiochcollege
antonellic
aoma
artisticbeautycolleges
artisticbeautycolleges
ashdowncollege
ati.ag.ohio-state
aticareertraining
audioschool
avtec.labor.state.ak
awc
bc.inter
beacon
bealcollege
beautycareers
beautyschool
bellevue
bellin
bellinghambeautyschool
beonair
berkeley.peralta
beta.nwiht
blottsalonschools
blue.ab
boe.kana.k12.wv
boe.mono.k12.wv
boe.putn.k12.wv
bramsonort
brc
brewstertech
brioacademy
brownsontechnicalschool
bryman-college
bryman-college
burlingtontech
californiacareerschool
canadacollege
capitol-college
carnegieinstitute
carouselbeauty
carouselbeauty
carouselbeauty
cccua
cci
cci
cci
ccsce
cdtschool
ceitraining
cempr
centralcareer
centralohio.dalecarnegie
century-school.bizhosting
cetcleveland
cfinstitute
charlesstuartschool
chase
nomatch: clank85462
classact1cosmetology
coastline.cccd
collegeofhairdesign

In [11]:
## this function does several things at once:
## 1) we now also split on hyphens. Webpages may be hyphenated, but this way we'll still pick them up instead of missing
## them if they drop the hyphen in URL.
## 2) apostrophes aren't allowed in URLs, so we delete all apostrophes.
## 3) any generic words like College, School etc. are omitted, because they are not specific enough to lead to a match.

def get_keywords(x):
    possible_words = re.split(r'\s|-', x) ## split on either space or hyphen
    omit_words = ['college','institute','school','schools','university','the','of',',','-','and']
    possible_words = [ word.lower() for word in possible_words ]  ## convert everything to lower case
    resultwords  = [word.replace('\'', '') for word in possible_words if word.lower() not in omit_words]  
    ## remove any apostrophes from words
    return resultwords

for df in dfs:
    df['keywords'] = df['Institution'].apply(get_keywords)

next: many websites are acronyms of the institution name. let's figure out an acronym for each name.

In [14]:
#following function returns potential college acronym in lowercase
def acronym(s):
    return "".join(c.lower() for c in s if c.isupper())

for df in dfs:
    df['acronym'] = df['Institution'].map(acronym)

In [15]:
def make_acronyms(X):
    if len(X['acronym']) >= 3:  ## length limit because one-letter acronyms risk false matches...
        return list([X['acronym'], X['acronym'][1:], X['acronym'][:-1]])
    else:
        return [X['acronym']]

for df in dfs:
    df['potential_acronyms'] = df.apply(make_acronyms, axis=1)
    
## list comprehension that cycles through all potential acronyms and checks if any are in url.
def check_any_acronym(x):
    in_url = ([ ac in x['URL'] for ac in x['potential_acronyms'] ])
    return(any(in_url))

for df in dfs:
    df['potential_acronym_in_url'] = df.apply(check_any_acronym, axis=1)

In [40]:
accred_df

Unnamed: 0,Institution,URL,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra
0,American Commercial College of Texas,acc-careers.com,acc-careers,"[american, commercial, texas]",acct,"[acct, cct, acc]",True,
1,American Commercial College - Abilene,acc-careers.com,acc-careers,"[american, commercial, , , abilene]",acca,"[acca, cca, acc]",True,
2,Anson College of Cosmetology,accschool.peedeeworld.net,accschool.peedeeworld,"[anson, cosmetology]",acc,"[acc, cc, ac]",True,"[acc, cc, ac, acoc]"
3,American College of Medical Technology,acmt.ac/,acmt,"[american, medical, technology]",acmt,"[acmt, cmt, acm]",True,
4,Aviation and Electronic Schools of America - C...,aesa.com,aesa,"[aviation, electronic, america, , , colfax]",aesac,"[aesac, esac, aesa]",True,
5,American University of Health Sciences,aihs.edu,aihs,"[american, health, sciences]",auhs,"[auhs, uhs, auh]",False,
6,Gerber Akron Beauty School,akronbeautyschool.com,akronbeautyschool,"[gerber, akron, beauty]",gabs,"[gabs, abs, gab]",False,
7,Adult Vocational Services,akronschools.com,akronschools,"[adult, vocational, services]",avs,"[avs, vs, av]",False,"[avs, vs, av, avs]"
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,alexacebeauty,"[alexandria, academy, beauty, culture]",aabc,"[aabc, abc, aab]",False,
9,Fortis Institute - Scranton,alliedteched.edu,alliedteched,"[fortis, , , scranton]",fis,"[fis, is, fi]",False,"[fis, is, fi, fi-s]"


In [47]:
## testing out a different version of make_acronyms that tries out a couple of other possible acronyms - will see if
## increases probability of false match too much.

def make_acronyms_extra(X):
    if len(X['acronym']) >= 3:  ## length limit because one-letter acronyms risk false matches...
        
        acs = list([X['acronym'], X['acronym'][1:], X['acronym'][:-1]])
        
        ## one additional acronym try - the first letter of every word, even filler words like of, the etc.
        acronym_allwords = "".join(word[0].lower() for word in X['Institution'].split())
        acs.append(acronym_allwords.replace('-',''))
        
        if len(X['acronym']) >= 4:
            acs.append(X['acronym'][:-2])
        
        print(acs)
        return acs
    
    else:
        print(X['acronym'])
        return [X['acronym']]

for df in dfs:
    df['potential_acronyms_extra'] = df.apply(make_acronyms_extra, axis=1)
    
## list comprehension that cycles through all potential acronyms and checks if any are in url.
def check_any_acronym_extra(x):
    in_url = ([ ac in x['URL'] for ac in x['potential_acronyms_extra'] ])
    return(any(in_url))

for df in dfs:
    df['potential_acronym_in_url_extra'] = df.apply(check_any_acronym_extra, axis=1)

['acct', 'cct', 'acc', 'accot', 'ac']
['acca', 'cca', 'acc', 'acca', 'ac']
['acc', 'cc', 'ac', 'acoc']
['acmt', 'cmt', 'acm', 'acomt', 'ac']
['aesac', 'esac', 'aesa', 'aaesoac', 'aes']
['auhs', 'uhs', 'auh', 'auohs', 'au']
['gabs', 'abs', 'gab', 'gabs', 'ga']
['avs', 'vs', 'av', 'avs']
['aabc', 'abc', 'aab', 'aaobc', 'aa']
['fis', 'is', 'fi', 'fis']
['abaw', 'baw', 'aba', 'abaw', 'ab']
['achcr', 'chcr', 'achc', 'acohcr', 'ach']
ac
['ach', 'ch', 'ac', 'ach']
['aoma', 'oma', 'aom', 'aoomaa', 'ao']
['ebsg', 'bsg', 'ebs', 'ebsg', 'eb']
['ebsw', 'bsw', 'ebs', 'ebsw', 'eb']
['achs', 'chs', 'ach', 'acohs', 'ac']
['osuati', 'suati', 'osuat', 'osuati', 'osua']
['atictcm', 'tictcm', 'atictc', 'actcm', 'atict']
['iar', 'ar', 'ia', 'ioar']
['avtc', 'vtc', 'avt', 'avtc', 'av']
['awc', 'wc', 'aw', 'awc']
['iauprb', 'auprb', 'iaupr', 'iauoprb', 'iaup']
bu
bc
['cab', 'ab', 'ca', 'caob']
['cahd', 'ahd', 'cah', 'caohd', 'ca']
bu
['bcbhsrt', 'cbhsrt', 'bcbhsr', 'bcbhsort', 'bcbhs']
['bbs', 'bs', 'bb', 'b

Unnamed: 0,Institution,URL,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra
0,American Commercial College of Texas,acc-careers.com,acc-careers,"[american, commercial, texas]",acct,"[acct, cct, acc]",True,"[acct, cct, acc, accot, ac]",True
1,American Commercial College - Abilene,acc-careers.com,acc-careers,"[american, commercial, , , abilene]",acca,"[acca, cca, acc]",True,"[acca, cca, acc, acca, ac]",True
2,Anson College of Cosmetology,accschool.peedeeworld.net,accschool.peedeeworld,"[anson, cosmetology]",acc,"[acc, cc, ac]",True,"[acc, cc, ac, acoc]",True
3,American College of Medical Technology,acmt.ac/,acmt,"[american, medical, technology]",acmt,"[acmt, cmt, acm]",True,"[acmt, cmt, acm, acomt, ac]",True
4,Aviation and Electronic Schools of America - C...,aesa.com,aesa,"[aviation, electronic, america, , , colfax]",aesac,"[aesac, esac, aesa]",True,"[aesac, esac, aesa, aaesoac, aes]",True
