In [1]:
import numpy as np
import pandas as pd
import re
pd.set_option('max_rows',200)

In [2]:
accred_df = pd.read_csv('Accreditation_Data.csv')
sample_df = pd.read_csv('sampleTestFile.csv')
test_df = pd.read_csv('testFile.csv')

dfs = [accred_df, sample_df, test_df]

print(len(accred_df))
print(len(sample_df))
print(len(test_df))



4383
100
392


Next step - treatment of URLs

In [3]:
for df in dfs:
    df['URL'] = df['URL'].str.lower()

def strip_url(url):
    stripped_url = url.replace('http://','').replace('https://','').replace('www.','')
    match = re.match("(.*)\.",stripped_url)
    
    if match:
        print(match.groups()[0])
        return match.groups()[0]
    else:
        print('nomatch:',stripped_url)
        return stripped_url
    
for df in dfs:
    df['stripped_url'] = df['URL'].apply(strip_url)

acc-careers
acc-careers
accschool.peedeeworld
acmt
aesa
aihs
akronbeautyschool
akronschools
alexacebeauty
alliedteched
americanbeautyacademy
americancollegeofhair
antiochcollege
antonellic
aoma
artisticbeautycolleges
artisticbeautycolleges
ashdowncollege
ati.ag.ohio-state
aticareertraining
audioschool
avtec.labor.state.ak
awc
bc.inter
beacon
bealcollege
beautycareers
beautyschool
bellevue
bellin
bellinghambeautyschool
beonair
berkeley.peralta
beta.nwiht
blottsalonschools
blue.ab
boe.kana.k12.wv
boe.mono.k12.wv
boe.putn.k12.wv
bramsonort
brc
brewstertech
brioacademy
brownsontechnicalschool
bryman-college
bryman-college
burlingtontech
californiacareerschool
canadacollege
capitol-college
carnegieinstitute
carouselbeauty
carouselbeauty
carouselbeauty
cccua
cci
cci
cci
ccsce
cdtschool
ceitraining
cempr
centralcareer
centralohio.dalecarnegie
century-school.bizhosting
cetcleveland
cfinstitute
charlesstuartschool
chase
nomatch: clank85462
classact1cosmetology
coastline.cccd
collegeofhairdesign

In [4]:
## this function does several things at once:
## 1) we now also split on hyphens. Webpages may be hyphenated, but this way we'll still pick them up instead of missing
## them if they drop the hyphen in URL.
## 2) apostrophes aren't allowed in URLs, so we delete all apostrophes.
## 3) any generic words like College, School etc. are omitted, because they are not specific enough to lead to a match.

def get_keywords(x):
    possible_words = re.split(r'\s|-', x) ## split on either space or hyphen
    omit_words = ['college','institute','school','schools','university','the','of',',','-','and']
    possible_words = [ word.lower() for word in possible_words ]  ## convert everything to lower case
    resultwords  = [word.replace('\'', '') for word in possible_words if word.lower() not in omit_words]  
    ## remove any apostrophes from words
    resultwords = list(filter(None, resultwords)) # fastest way to remove any new empty strings
    return resultwords

for df in dfs:
    df['keywords'] = df['Institution'].apply(get_keywords)

next: many websites are acronyms of the institution name. let's figure out an acronym for each name.

In [5]:
#following function returns potential college acronym in lowercase
#one quick tweak from staring at data - some places start with The (uppercase T); let's 
def acronym(s):
    s = s.replace('The ','') #specifically handles initial The (which tends to mess up acronyms)
    return "".join(c.lower() for c in s if c.isupper())

for df in dfs:
    df['acronym'] = df['Institution'].map(acronym)

In [6]:
def make_acronyms(X):
    
    if len(X['acronym']) >= 3:  ## length limit because one-letter acronyms risk false matches...
        return list([X['acronym'], X['acronym'][1:], X['acronym'][:-1]])
    else:
        return [X['acronym']]

for df in dfs:
    df['potential_acronyms'] = df.apply(make_acronyms, axis=1)
    
## list comprehension that cycles through all potential acronyms and checks if any are in url.
def check_any_acronym(x):
    in_url = ([ ac in x['stripped_url'] for ac in x['potential_acronyms'] ])
    return(any(in_url))

for df in dfs:
    df['potential_acronym_in_url'] = df.apply(check_any_acronym, axis=1)

In [7]:
accred_df

Unnamed: 0,Institution,URL,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url
0,American Commercial College of Texas,acc-careers.com,acc-careers,"[american, commercial, texas]",acct,"[acct, cct, acc]",True
1,American Commercial College - Abilene,acc-careers.com,acc-careers,"[american, commercial, abilene]",acca,"[acca, cca, acc]",True
2,Anson College of Cosmetology,accschool.peedeeworld.net,accschool.peedeeworld,"[anson, cosmetology]",acc,"[acc, cc, ac]",True
3,American College of Medical Technology,acmt.ac/,acmt,"[american, medical, technology]",acmt,"[acmt, cmt, acm]",True
4,Aviation and Electronic Schools of America - C...,aesa.com,aesa,"[aviation, electronic, america, colfax]",aesac,"[aesac, esac, aesa]",True
5,American University of Health Sciences,aihs.edu,aihs,"[american, health, sciences]",auhs,"[auhs, uhs, auh]",False
6,Gerber Akron Beauty School,akronbeautyschool.com,akronbeautyschool,"[gerber, akron, beauty]",gabs,"[gabs, abs, gab]",False
7,Adult Vocational Services,akronschools.com,akronschools,"[adult, vocational, services]",avs,"[avs, vs, av]",False
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,alexacebeauty,"[alexandria, academy, beauty, culture]",aabc,"[aabc, abc, aab]",False
9,Fortis Institute - Scranton,alliedteched.edu,alliedteched,"[fortis, scranton]",fis,"[fis, is, fi]",False


In [8]:
## testing out a different version of make_acronyms that tries out a couple of other possible acronyms - will see if
## increases probability of false match too much.

def make_acronyms_extra(X):
    
    ## one initial acronym try - the first letter of every word, even filler words like of, the etc.
    ## low success rate, but does happen sometimes and no other way of catching such cases
    acronym_allwords = "".join(word[0].lower() for word in X['Institution'].split())
    
    if len(X['acronym']) >= 3:  ## length limit because one-letter acronyms risk false matches...
        
        acs = list([X['acronym'], X['acronym'][1:], X['acronym'][:-1]])
        
        acs.append(acronym_allwords.replace('-',''))
        
        if len(X['acronym']) >= 4:
            acs.append(X['acronym'][:-2])
        
        print(acs)
        return acs
    
    else:
        acs = [X['acronym'],acronym_allwords]
        print(acs)
        return acs

for df in dfs:
    df['potential_acronyms_extra'] = df.apply(make_acronyms_extra, axis=1)
    
## list comprehension that cycles through all potential acronyms and checks if any are in url.
def check_any_acronym_extra(x):
    in_url = ([ ac in x['stripped_url'] for ac in x['potential_acronyms_extra'] ])
    return(any(in_url))

for df in dfs:
    df['potential_acronym_in_url_extra'] = df.apply(check_any_acronym_extra, axis=1)

['acct', 'cct', 'acc', 'accot', 'ac']
['acca', 'cca', 'acc', 'acca', 'ac']
['acc', 'cc', 'ac', 'acoc']
['acmt', 'cmt', 'acm', 'acomt', 'ac']
['aesac', 'esac', 'aesa', 'aaesoac', 'aes']
['auhs', 'uhs', 'auh', 'auohs', 'au']
['gabs', 'abs', 'gab', 'gabs', 'ga']
['avs', 'vs', 'av', 'avs']
['aabc', 'abc', 'aab', 'aaobc', 'aa']
['fis', 'is', 'fi', 'fis']
['abaw', 'baw', 'aba', 'abaw', 'ab']
['achcr', 'chcr', 'achc', 'acohcr', 'ach']
['ac', 'ac']
['ach', 'ch', 'ac', 'ach']
['aoma', 'oma', 'aom', 'aoomaa', 'ao']
['ebsg', 'bsg', 'ebs', 'ebsg', 'eb']
['ebsw', 'bsw', 'ebs', 'ebsw', 'eb']
['achs', 'chs', 'ach', 'acohs', 'ac']
['osuati', 'suati', 'osuat', 'osuati', 'osua']
['atictcm', 'tictcm', 'atictc', 'actcm', 'atict']
['iar', 'ar', 'ia', 'ioar']
['avtc', 'vtc', 'avt', 'avtc', 'av']
['awc', 'wc', 'aw', 'awc']
['iauprb', 'auprb', 'iaupr', 'iauoprb', 'iaup']
['bu', 'bu']
['bc', 'bc']
['cab', 'ab', 'ca', 'caob']
['cahd', 'ahd', 'cah', 'caohd', 'ca']
['bu', 'bu']
['bcbhsrt', 'cbhsrt', 'bcbhsr', 'bc

In [9]:
sample_df['correct_prediction_1'] = sample_df['isWrong'] == ~sample_df['potential_acronym_in_url']
sample_df['correct_prediction_2'] = sample_df['isWrong'] == ~sample_df['potential_acronym_in_url_extra']

print(sample_df['correct_prediction_1'].value_counts())
print(sample_df['correct_prediction_2'].value_counts())

True     67
False    33
Name: correct_prediction_1, dtype: int64
True     68
False    32
Name: correct_prediction_2, dtype: int64


In [10]:
sample_df[~sample_df['correct_prediction_1']]

Unnamed: 0,Institution,URL,isWrong,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra,correct_prediction_1,correct_prediction_2
3,Suburban Technical School,suburbantech.com,0,suburbantech,"[suburban, technical]",sts,"[sts, ts, st]",False,"[sts, ts, st, sts]",False,False,False
7,Baker College,www.baker.edu,0,baker,[baker],bc,[bc],False,"[bc, bc]",False,False,False
9,Baruch College of the City University of New York,www.baruch.cuny.edu,0,baruch.cuny,"[baruch, city, new, york]",bccuny,"[bccuny, ccuny, bccun]",False,"[bccuny, ccuny, bccun, bcotcuony, bccu]",False,False,False
10,Bay de Noc Community College,www.baycollege.edu,0,baycollege,"[bay, de, noc, community]",bncc,"[bncc, ncc, bnc]",False,"[bncc, ncc, bnc, bdncc, bn]",False,False,False
13,Benedictine University,www.ben.edu,0,ben,[benedictine],bu,[bu],False,"[bu, bu]",False,False,False
14,SUNY at Binghamton,www.binghamton.edu,0,binghamton,"[suny, at, binghamton]",sunyb,"[sunyb, unyb, suny]",False,"[sunyb, unyb, suny, sab, sun]",False,False,False
19,SUNY College of Technology at Canton,www.canton.edu,0,canton,"[suny, technology, at, canton]",sunyctc,"[sunyctc, unyctc, sunyct]",False,"[sunyctc, unyctc, sunyct, scotac, sunyc]",False,False,False
21,New York City College of Technology of the Cit...,www.citytech.cuny.edu,0,citytech.cuny,"[new, york, city, technology, city, new, york]",nycctcuny,"[nycctcuny, ycctcuny, nycctcun]",False,"[nycctcuny, ycctcuny, nycctcun, nyccototcuony,...",False,False,False
23,SUNY College of Agriculture and Technology at ...,www.cobleskill.edu/,0,cobleskill,"[suny, agriculture, technology, at, cobleskill]",sunycatc,"[sunycatc, unycatc, sunycat]",False,"[sunycatc, unycatc, sunycat, scoaatac, sunyca]",False,False,False
29,Sacramento Job Corps Center,www.jcdc.jobcorps.org,0,jcdc.jobcorps,"[sacramento, job, corps, center]",sjcc,"[sjcc, jcc, sjc]",False,"[sjcc, jcc, sjc, sjcc, sj]",False,False,False


In [11]:
accred_df

Unnamed: 0,Institution,URL,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra
0,American Commercial College of Texas,acc-careers.com,acc-careers,"[american, commercial, texas]",acct,"[acct, cct, acc]",True,"[acct, cct, acc, accot, ac]",True
1,American Commercial College - Abilene,acc-careers.com,acc-careers,"[american, commercial, abilene]",acca,"[acca, cca, acc]",True,"[acca, cca, acc, acca, ac]",True
2,Anson College of Cosmetology,accschool.peedeeworld.net,accschool.peedeeworld,"[anson, cosmetology]",acc,"[acc, cc, ac]",True,"[acc, cc, ac, acoc]",True
3,American College of Medical Technology,acmt.ac/,acmt,"[american, medical, technology]",acmt,"[acmt, cmt, acm]",True,"[acmt, cmt, acm, acomt, ac]",True
4,Aviation and Electronic Schools of America - C...,aesa.com,aesa,"[aviation, electronic, america, colfax]",aesac,"[aesac, esac, aesa]",True,"[aesac, esac, aesa, aaesoac, aes]",True
5,American University of Health Sciences,aihs.edu,aihs,"[american, health, sciences]",auhs,"[auhs, uhs, auh]",False,"[auhs, uhs, auh, auohs, au]",False
6,Gerber Akron Beauty School,akronbeautyschool.com,akronbeautyschool,"[gerber, akron, beauty]",gabs,"[gabs, abs, gab]",False,"[gabs, abs, gab, gabs, ga]",False
7,Adult Vocational Services,akronschools.com,akronschools,"[adult, vocational, services]",avs,"[avs, vs, av]",False,"[avs, vs, av, avs]",False
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,alexacebeauty,"[alexandria, academy, beauty, culture]",aabc,"[aabc, abc, aab]",False,"[aabc, abc, aab, aaobc, aa]",False
9,Fortis Institute - Scranton,alliedteched.edu,alliedteched,"[fortis, scranton]",fis,"[fis, is, fi]",False,"[fis, is, fi, fis]",False


In [12]:
print(accred_df['potential_acronym_in_url'].value_counts(normalize = True))
print(accred_df['potential_acronym_in_url_extra'].value_counts(normalize = True))

False    0.624686
True     0.375314
Name: potential_acronym_in_url, dtype: float64
False    0.57974
True     0.42026
Name: potential_acronym_in_url_extra, dtype: float64


adding the extra possible acronyms increases the hit rate by another 12%. We can do some testing with an AUC-ROC model to figure out if we're having issues with false positives, but the limited training set is problematic.

In [13]:
def check_any_keyword(x):
    in_url = ([ ac in x['URL'] for ac in x['keywords'] ])
    return(any(in_url))

for df in dfs:
    df['keyword_in_url'] = df.apply(check_any_keyword, axis=1)

In [14]:
for df in dfs:
    df['matched_url'] = df['keyword_in_url'] | df['potential_acronym_in_url']
    df['matched_url_extra'] = df['keyword_in_url'] | df['potential_acronym_in_url_extra']
    
sample_df['correct_prediction'] = sample_df['isWrong'] == ~sample_df['matched_url']
sample_df['correct_prediction_extra'] = sample_df['isWrong'] == ~sample_df['matched_url_extra']

print(sample_df['correct_prediction'].value_counts())
print(sample_df['correct_prediction_extra'].value_counts())

True     95
False     5
Name: correct_prediction, dtype: int64
True     96
False     4
Name: correct_prediction_extra, dtype: int64


In [15]:
sample_df[sample_df['correct_prediction'] == False]

Unnamed: 0,Institution,URL,isWrong,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra,correct_prediction_1,correct_prediction_2,keyword_in_url,matched_url,matched_url_extra,correct_prediction,correct_prediction_extra
13,Benedictine University,www.ben.edu,0,ben,[benedictine],bu,[bu],False,"[bu, bu]",False,False,False,False,False,False,False,False
42,New Mexico Institute of Mining and Technology,www.nmt.edu,0,nmt,"[new, mexico, mining, technology]",nmimt,"[nmimt, mimt, nmim]",False,"[nmimt, mimt, nmim, nmiomat, nmi]",False,False,False,False,False,False,False,False
58,The General Theological Seminary,www.remingtoncollege.edu,1,remingtoncollege,"[general, theological, seminary]",gts,"[gts, ts, gt]",True,"[gts, ts, gt, tgts]",True,False,False,False,True,True,False,False
68,Sunstate Academy,www.sandiegojobcorps.org,1,sandiegojobcorps,"[sunstate, academy]",sa,[sa],True,"[sa, sa]",True,False,False,False,True,True,False,False
94,Teachers College of Columbia University,www.tc.edu,0,tc,"[teachers, columbia]",tccu,"[tccu, ccu, tcc]",False,"[tccu, ccu, tcc, tcocu, tc]",True,False,True,False,False,True,False,True


Pretty cool - we already hit a 97% success rate on the trial data.

In [16]:
print(accred_df['matched_url'].value_counts(normalize = True))
print(accred_df['matched_url_extra'].value_counts(normalize = True))

True     0.865617
False    0.134383
Name: matched_url, dtype: float64
True     0.889802
False    0.110198
Name: matched_url_extra, dtype: float64


Performance on training data may exaggerate success - looking at much bigger database of accredited schools, fit rate is only 89% or so

In [17]:
accred_df[accred_df['matched_url_extra'] == False]

Unnamed: 0,Institution,URL,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra,keyword_in_url,matched_url,matched_url_extra
5,American University of Health Sciences,aihs.edu,aihs,"[american, health, sciences]",auhs,"[auhs, uhs, auh]",False,"[auhs, uhs, auh, auohs, au]",False,False,False,False
7,Adult Vocational Services,akronschools.com,akronschools,"[adult, vocational, services]",avs,"[avs, vs, av]",False,"[avs, vs, av, avs]",False,False,False,False
9,Fortis Institute - Scranton,alliedteched.edu,alliedteched,"[fortis, scranton]",fis,"[fis, is, fi]",False,"[fis, is, fi, fis]",False,False,False,False
27,Career Academy of Hair Design,beautyschool.edu,beautyschool,"[career, academy, hair, design]",cahd,"[cahd, ahd, cah]",False,"[cahd, ahd, cah, caohd, ca]",False,False,False,False
31,Ohio Center for Broadcasting - Colorado Campus,beonair.com,beonair,"[ohio, center, for, broadcasting, colorado, ca...",ocbcc,"[ocbcc, cbcc, ocbc]",False,"[ocbcc, cbcc, ocbc, ocfbcc, ocb]",False,False,False,False
34,Ohio State School of Cosmetology,blottsalonschools.com,blottsalonschools,"[ohio, state, cosmetology]",ossc,"[ossc, ssc, oss]",False,"[ossc, ssc, oss, ossoc, os]",False,False,False,False
36,Ben Franklin Career & Technical Education Center,boe.kana.k12.wv.us/,boe.kana.k12.wv,"[ben, franklin, career, &, technical, educatio...",bfctec,"[bfctec, fctec, bfcte]",False,"[bfctec, fctec, bfcte, bfc&tec, bfct]",False,False,False,False
37,Monongalia County Technical Education Center,boe.mono.k12.wv.us/montec,boe.mono.k12.wv,"[monongalia, county, technical, education, cen...",mctec,"[mctec, ctec, mcte]",False,"[mctec, ctec, mcte, mctec, mct]",False,False,False,False
38,Putnam Career and Technical Center,boe.putn.k12.wv.us/pctc,boe.putn.k12.wv,"[putnam, career, technical, center]",pctc,"[pctc, ctc, pct]",False,"[pctc, ctc, pct, pcatc, pc]",False,False,False,False
42,Marinello School of Beauty,brioacademy.com,brioacademy,"[marinello, beauty]",msb,"[msb, sb, ms]",False,"[msb, sb, ms, msob]",False,False,False,False


I feel pretty OK with practically all of the misses, except that it bothers me that my algorithm misses some fairly obvious matches like University of Wisconsin - Madison -> wisc and Washington College -> washcoll. I'm going to make one last variable that checks if potential abbreviations are included in the URL name.

In [18]:
def make_abbreviations(keywords):
    ## use the first three letters of each keyword as a potential abbreviation used in url
    return [ s[:3] for s in keywords ]

for df in dfs:
    df['potential_abbreviations'] = df['keywords'].apply(make_abbreviations)

def check_any_abbrev(x):
    in_url = ([ ab in x['stripped_url'] for ab in x['potential_abbreviations'] ])
    return(any(in_url))

for df in dfs:
    df['abbrev_in_url'] = df.apply(check_any_abbrev, axis=1)

In [19]:
accred_df

Unnamed: 0,Institution,URL,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra,keyword_in_url,matched_url,matched_url_extra,potential_abbreviations,abbrev_in_url
0,American Commercial College of Texas,acc-careers.com,acc-careers,"[american, commercial, texas]",acct,"[acct, cct, acc]",True,"[acct, cct, acc, accot, ac]",True,False,True,True,"[ame, com, tex]",False
1,American Commercial College - Abilene,acc-careers.com,acc-careers,"[american, commercial, abilene]",acca,"[acca, cca, acc]",True,"[acca, cca, acc, acca, ac]",True,False,True,True,"[ame, com, abi]",False
2,Anson College of Cosmetology,accschool.peedeeworld.net,accschool.peedeeworld,"[anson, cosmetology]",acc,"[acc, cc, ac]",True,"[acc, cc, ac, acoc]",True,False,True,True,"[ans, cos]",False
3,American College of Medical Technology,acmt.ac/,acmt,"[american, medical, technology]",acmt,"[acmt, cmt, acm]",True,"[acmt, cmt, acm, acomt, ac]",True,False,True,True,"[ame, med, tec]",False
4,Aviation and Electronic Schools of America - C...,aesa.com,aesa,"[aviation, electronic, america, colfax]",aesac,"[aesac, esac, aesa]",True,"[aesac, esac, aesa, aaesoac, aes]",True,False,True,True,"[avi, ele, ame, col]",False
5,American University of Health Sciences,aihs.edu,aihs,"[american, health, sciences]",auhs,"[auhs, uhs, auh]",False,"[auhs, uhs, auh, auohs, au]",False,False,False,False,"[ame, hea, sci]",False
6,Gerber Akron Beauty School,akronbeautyschool.com,akronbeautyschool,"[gerber, akron, beauty]",gabs,"[gabs, abs, gab]",False,"[gabs, abs, gab, gabs, ga]",False,True,True,True,"[ger, akr, bea]",True
7,Adult Vocational Services,akronschools.com,akronschools,"[adult, vocational, services]",avs,"[avs, vs, av]",False,"[avs, vs, av, avs]",False,False,False,False,"[adu, voc, ser]",False
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,alexacebeauty,"[alexandria, academy, beauty, culture]",aabc,"[aabc, abc, aab]",False,"[aabc, abc, aab, aaobc, aa]",False,True,True,True,"[ale, aca, bea, cul]",True
9,Fortis Institute - Scranton,alliedteched.edu,alliedteched,"[fortis, scranton]",fis,"[fis, is, fi]",False,"[fis, is, fi, fis]",False,False,False,False,"[for, scr]",False


In [20]:
for df in dfs:
    df['matched_url_abbrev'] = df['keyword_in_url'] | df['potential_acronym_in_url_extra'] | df['abbrev_in_url']
    
sample_df['correct_prediction_abbrev'] = sample_df['isWrong'] == ~sample_df['matched_url_abbrev']

print(sample_df['correct_prediction'].value_counts())
print(sample_df['correct_prediction_extra'].value_counts())
print(sample_df['correct_prediction_abbrev'].value_counts())

True     95
False     5
Name: correct_prediction, dtype: int64
True     96
False     4
Name: correct_prediction_extra, dtype: int64
True     96
False     4
Name: correct_prediction_abbrev, dtype: int64


In [21]:
sample_df[sample_df['correct_prediction_abbrev'] == False]

Unnamed: 0,Institution,URL,isWrong,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra,...,correct_prediction_2,keyword_in_url,matched_url,matched_url_extra,correct_prediction,correct_prediction_extra,potential_abbreviations,abbrev_in_url,matched_url_abbrev,correct_prediction_abbrev
39,Newbridge College - El Cajon,www.newpaltz.edu,1,newpaltz,"[newbridge, el, cajon]",ncec,"[ncec, cec, nce]",False,"[ncec, cec, nce, ncec, nc]",False,...,True,False,False,False,True,True,"[new, el, caj]",True,True,False
42,New Mexico Institute of Mining and Technology,www.nmt.edu,0,nmt,"[new, mexico, mining, technology]",nmimt,"[nmimt, mimt, nmim]",False,"[nmimt, mimt, nmim, nmiomat, nmi]",False,...,False,False,False,False,False,False,"[new, mex, min, tec]",False,False,False
58,The General Theological Seminary,www.remingtoncollege.edu,1,remingtoncollege,"[general, theological, seminary]",gts,"[gts, ts, gt]",True,"[gts, ts, gt, tgts]",True,...,False,False,True,True,False,False,"[gen, the, sem]",False,True,False
68,Sunstate Academy,www.sandiegojobcorps.org,1,sandiegojobcorps,"[sunstate, academy]",sa,[sa],True,"[sa, sa]",True,...,False,False,True,True,False,False,"[sun, aca]",False,True,False


pretty interesting - with abbreviations, all of a sudden Newbridge College - El Cajon could reasonably have a URL of newpaltz.edu since new shared by both. Abbreviations may also add to the problem of false matching by guessing on too many potential acronyms.

In [24]:
print(accred_df['matched_url'].value_counts(normalize = True))
print(accred_df['matched_url_extra'].value_counts(normalize = True))
print(accred_df['matched_url_abbrev'].value_counts(normalize = True))

True     0.865617
False    0.134383
Name: matched_url, dtype: float64
True     0.889802
False    0.110198
Name: matched_url_extra, dtype: float64
True     0.913301
False    0.086699
Name: matched_url_abbrev, dtype: float64


Nice to see that adding the abbreviations in paid off - bump to 91.3% success rate with accredited institutions is pretty good. From visual inspection many of the remaining misses would require a great deal of extra effort to account for (e.g. Aviation Institute of Maintenance - Dallas -> fixthatplane.com)

In [25]:
accred_df[accred_df['matched_url_abbrev'] == False]

Unnamed: 0,Institution,URL,stripped_url,keywords,acronym,potential_acronyms,potential_acronym_in_url,potential_acronyms_extra,potential_acronym_in_url_extra,keyword_in_url,matched_url,matched_url_extra,potential_abbreviations,abbrev_in_url,matched_url_abbrev
5,American University of Health Sciences,aihs.edu,aihs,"[american, health, sciences]",auhs,"[auhs, uhs, auh]",False,"[auhs, uhs, auh, auohs, au]",False,False,False,False,"[ame, hea, sci]",False,False
7,Adult Vocational Services,akronschools.com,akronschools,"[adult, vocational, services]",avs,"[avs, vs, av]",False,"[avs, vs, av, avs]",False,False,False,False,"[adu, voc, ser]",False,False
9,Fortis Institute - Scranton,alliedteched.edu,alliedteched,"[fortis, scranton]",fis,"[fis, is, fi]",False,"[fis, is, fi, fis]",False,False,False,False,"[for, scr]",False,False
27,Career Academy of Hair Design,beautyschool.edu,beautyschool,"[career, academy, hair, design]",cahd,"[cahd, ahd, cah]",False,"[cahd, ahd, cah, caohd, ca]",False,False,False,False,"[car, aca, hai, des]",False,False
31,Ohio Center for Broadcasting - Colorado Campus,beonair.com,beonair,"[ohio, center, for, broadcasting, colorado, ca...",ocbcc,"[ocbcc, cbcc, ocbc]",False,"[ocbcc, cbcc, ocbc, ocfbcc, ocb]",False,False,False,False,"[ohi, cen, for, bro, col, cam]",False,False
34,Ohio State School of Cosmetology,blottsalonschools.com,blottsalonschools,"[ohio, state, cosmetology]",ossc,"[ossc, ssc, oss]",False,"[ossc, ssc, oss, ossoc, os]",False,False,False,False,"[ohi, sta, cos]",False,False
36,Ben Franklin Career & Technical Education Center,boe.kana.k12.wv.us/,boe.kana.k12.wv,"[ben, franklin, career, &, technical, educatio...",bfctec,"[bfctec, fctec, bfcte]",False,"[bfctec, fctec, bfcte, bfc&tec, bfct]",False,False,False,False,"[ben, fra, car, &, tec, edu, cen]",False,False
42,Marinello School of Beauty,brioacademy.com,brioacademy,"[marinello, beauty]",msb,"[msb, sb, ms]",False,"[msb, sb, ms, msob]",False,False,False,False,"[mar, bea]",False,False
44,Everest College - Los Angeles,bryman-college.com,bryman-college,"[everest, los, angeles]",ecla,"[ecla, cla, ecl]",False,"[ecla, cla, ecl, ecla, ec]",False,False,False,False,"[eve, los, ang]",False,False
45,Everest College - Torrance,bryman-college.com,bryman-college,"[everest, torrance]",ect,"[ect, ct, ec]",False,"[ect, ct, ec, ect]",False,False,False,False,"[eve, tor]",False,False


Probably the next step would be to feed in names of major state university systems (e.g. SUNY) or state abbreviations as something to look out for, but that's beyond what's needed for the current problem.