In [1]:
import pandas as pd
# Use the fully qualified option name: the abbreviated pattern 'max_rows' is
# ambiguous in pandas versions that also define 'styler.render.max_rows',
# where set_option raises OptionError instead of setting the display limit.
pd.set_option('display.max_rows', 200)

In [2]:
# Read the three input tables from the working directory:
# the accreditation list, the labelled sample file and the test file.
accred_df, sample_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in ('Accreditation_Data.csv', 'sampleTestFile.csv', 'testFile.csv')
]

# Keep all three frames in one list so each later step can be applied to every table.
dfs = [accred_df, sample_df, test_df]

In [3]:
# Print the row count of each table, one per line, in load order.
for frame in (accred_df, sample_df, test_df):
    print(len(frame))

4383
100
392


In [4]:
# Preview the first 100 rows of the accreditation table.
accred_df.head(100)

Unnamed: 0,Institution,URL
0,American Commercial College of Texas,acc-careers.com
1,American Commercial College - Abilene,acc-careers.com
2,Anson College of Cosmetology,accschool.peedeeworld.net
3,American College of Medical Technology,acmt.ac/
4,Aviation and Electronic Schools of America - C...,aesa.com
5,American University of Health Sciences,aihs.edu
6,Gerber Akron Beauty School,AkronBeautySchool.com
7,Adult Vocational Services,akronschools.com
8,Alexandria Academy of Beauty Culture,alexacebeauty.com
9,Fortis Institute - Scranton,alliedteched.edu


In [5]:
# Preview the first 100 rows of the labelled sample table.
sample_df.head(100)

Unnamed: 0,Institution,URL,isWrong
0,Barton College,bestcarehealth.com,1
1,Northern New Mexico College,nnmcc.edu,0
2,Northern Virginia Community College,southeasterncareercollege.edu,1
3,Suburban Technical School,suburbantech.com,0
4,Stratford University,valleycareercollege.com,1
5,Saddleback Valley U.S.D. - Adult Education,www.ais.edu,1
6,Remington College-Fort Worth Campus,www.babel.edu,1
7,Baker College,www.baker.edu,0
8,Nazareth College of Rochester,www.barton.edu,1
9,Baruch College of the City University of New York,www.baruch.cuny.edu,0


In [6]:
# Preview the first 100 rows of the test table.
test_df.head(100)

Unnamed: 0,Institution,URL,Prediction
0,Paul Mitchell The School - Danbury,americanacademyofcosmetology.com,
1,Colleen O'Hara's Beauty Academy,californiabeautyschool.com,
2,Capri Cosmetology Learning Center,Caprinow.com,
3,Career Beauty College,careerbeautycollege.com,
4,Career Colleges of America - South Gate,CAREERCOLLEGES.ORG,
5,Charleston Cosmetology Institute,careerta.edu,
6,Carsten Institute of Cosmetology,carsteninstitute.com,
7,C.C.B. School of Atlanta,ccbenglish.com/,
8,Charlotte Technical Center,charlottetechcenter.ccps.k12.fl.us,
9,Claremore Beauty College,claremorebeautycollege.com,


Immediate pre-processing step: convert all URLs to lowercase so that later string comparisons are consistent.

In [7]:
# Lowercase the URL column of every table in place.
for df in dfs:
    lowered_urls = df['URL'].str.lower()
    df['URL'] = lowered_urls

Dealing with strings is hard, so we clearly need to engineer some features. The first ones that come to mind: 1) do any words in the institution name match the URL? 2) is the institution's acronym contained in the website name? 3) is the name a perfect match (a super-good fit)?

Let's start with the acronym.

In [8]:
def acronym(s):
    """Build a candidate acronym from the uppercase letters of ``s``.

    Every character of ``s`` that is uppercase is kept, in order, and
    lowercased; all other characters are dropped.

    Returns the resulting lowercase string (empty if ``s`` has no uppercase
    characters).
    """
    initials = []
    for ch in s:
        if ch.isupper():
            initials.append(ch.lower())
    return "".join(initials)

# Quick sanity check on a mixed-case string.
print(acronym("HeLLo WorLD"))

hllwld


In [9]:
# Derive the acronym of every institution name and store it in a new column.
for df in dfs:
    institution_acronyms = df['Institution'].map(acronym)
    df['acronym'] = institution_acronyms

In [10]:
# Inspect the first 150 accreditation rows with the new 'acronym' column.
accred_df.head(150)

Unnamed: 0,Institution,URL,acronym
0,American Commercial College of Texas,acc-careers.com,acct
1,American Commercial College - Abilene,acc-careers.com,acca
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc
3,American College of Medical Technology,acmt.ac/,acmt
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac
5,American University of Health Sciences,aihs.edu,auhs
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs
7,Adult Vocational Services,akronschools.com,avs
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc
9,Fortis Institute - Scranton,alliedteched.edu,fis


Let's see how good a first try this is.

In [11]:
def check(x):
    """Return True when the row's acronym occurs as a substring of its URL.

    Parameters
    ----------
    x : mapping-like row
        Must provide the keys ``'acronym'`` and ``'URL'`` (for example a row
        handed over by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    bool
        True only if both values are strings, the acronym is non-empty and
        the acronym is contained in the URL; False otherwise.
    """
    acr, url = x['acronym'], x['URL']
    # A missing value (NaN is a float) cannot be searched with `in`;
    # treat it as "no match" instead of raising TypeError.
    if not isinstance(acr, str) or not isinstance(url, str):
        return False
    # '' is a substring of every string, so an institution name without any
    # uppercase letter would otherwise always be reported as a match.
    if not acr:
        return False
    return acr in url

# Evaluate `check` row by row on every table and store the flag as a new column.
for df in dfs:
    exact_hits = df.apply(check, axis=1)
    df['acronym_in_url'] = exact_hits

In [12]:
# Inspect the first 20 accreditation rows with the new 'acronym_in_url' flag.
accred_df.head(20)

Unnamed: 0,Institution,URL,acronym,acronym_in_url
0,American Commercial College of Texas,acc-careers.com,acct,False
1,American Commercial College - Abilene,acc-careers.com,acca,False
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc,True
3,American College of Medical Technology,acmt.ac/,acmt,True
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac,False
5,American University of Health Sciences,aihs.edu,auhs,False
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs,False
7,Adult Vocational Services,akronschools.com,avs,False
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc,False
9,Fortis Institute - Scranton,alliedteched.edu,fis,False


In [13]:
# Fraction of accreditation rows whose exact acronym does / does not occur in the URL.
exact_match_flags = accred_df['acronym_in_url']
exact_match_flags.value_counts(normalize=True)

False    0.71937
True     0.28063
Name: acronym_in_url, dtype: float64

Sweet - the exact acronym already appears in the URL for 28% of accredited institutions. Let's see if we can improve on this by being more flexible with the acronyms.

In [14]:
# Prototype on the first row's acronym: the acronym itself, the acronym
# without its first letter, and the acronym without its last letter.
qq = accred_df['acronym'].loc[0]
rr = [qq, qq[1:], qq[:-1]]
print(rr)

['acct', 'cct', 'acc']


In [15]:
def make_acronyms(X):
    """List the acronym variants to try for one row.

    ``X`` must provide the key ``'acronym'``. For an acronym shorter than
    three characters only the acronym itself is returned. Otherwise the list
    holds the acronym, the acronym without its first character and the
    acronym without its last character, in that order.
    """
    base = X['acronym']
    # Too short to trim: keep it as the only candidate.
    if len(base) < 3:
        return [base]
    return [base, base[1:], base[:-1]]

# Build the list of acronym variants for every row of every table.
for df in dfs:
    variant_lists = df.apply(make_acronyms, axis=1)
    df['potential_acronyms'] = variant_lists

In [16]:
# Display accred_df to inspect the newly added 'potential_acronyms' column.
accred_df

Unnamed: 0,Institution,URL,acronym,acronym_in_url,potential_acronyms
0,American Commercial College of Texas,acc-careers.com,acct,False,"[acct, cct, acc]"
1,American Commercial College - Abilene,acc-careers.com,acca,False,"[acca, cca, acc]"
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc,True,"[acc, cc, ac]"
3,American College of Medical Technology,acmt.ac/,acmt,True,"[acmt, cmt, acm]"
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac,False,"[aesac, esac, aesa]"
5,American University of Health Sciences,aihs.edu,auhs,False,"[auhs, uhs, auh]"
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs,False,"[gabs, abs, gab]"
7,Adult Vocational Services,akronschools.com,avs,False,"[avs, vs, av]"
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc,False,"[aabc, abc, aab]"
9,Fortis Institute - Scranton,alliedteched.edu,fis,False,"[fis, is, fi]"


In [17]:
## Cycles through all potential acronyms and checks if any are in the url.
def check_any(x):
    """Return True when at least one acronym variant occurs in the row's URL.

    Parameters
    ----------
    x : mapping-like row
        Must provide the keys ``'URL'`` and ``'potential_acronyms'`` (an
        iterable of candidate strings).

    Returns
    -------
    bool
        True if any non-empty string candidate is a substring of the URL;
        False otherwise, including when the URL is not a string.
    """
    url = x['URL']
    # A missing URL (NaN is a float) cannot be searched with `in`;
    # report "no match" rather than raising TypeError.
    if not isinstance(url, str):
        return False
    # Skip empty candidates: '' is a substring of every string and would
    # turn every row without uppercase letters into a false positive.
    # The generator lets any() stop at the first hit.
    return any(
        isinstance(ac, str) and ac != '' and ac in url
        for ac in x['potential_acronyms']
    )

In [22]:
# Evaluate `check_any` row by row on every table and store the flag as a new column.
for df in dfs:
    variant_hits = df.apply(check_any, axis=1)
    df['potential_acronym_in_url'] = variant_hits

In [26]:
# Inspect the first 100 sample rows with all derived columns.
sample_df.head(100)

Unnamed: 0,Institution,URL,isWrong,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url
0,Barton College,bestcarehealth.com,1,bc,False,[bc],False
1,Northern New Mexico College,nnmcc.edu,0,nnmc,True,"[nnmc, nmc, nnm]",True
2,Northern Virginia Community College,southeasterncareercollege.edu,1,nvcc,False,"[nvcc, vcc, nvc]",False
3,Suburban Technical School,suburbantech.com,0,sts,False,"[sts, ts, st]",False
4,Stratford University,valleycareercollege.com,1,su,False,[su],False
5,Saddleback Valley U.S.D. - Adult Education,www.ais.edu,1,svusdae,False,"[svusdae, vusdae, svusda]",False
6,Remington College-Fort Worth Campus,www.babel.edu,1,rcfwc,False,"[rcfwc, cfwc, rcfw]",False
7,Baker College,www.baker.edu,0,bc,False,[bc],False
8,Nazareth College of Rochester,www.barton.edu,1,ncr,False,"[ncr, cr, nc]",False
9,Baruch College of the City University of New York,www.baruch.cuny.edu,0,bccuny,False,"[bccuny, ccuny, bccun]",False


In [28]:
# Print the match / no-match shares for the exact-acronym flag first,
# then for the flexible-acronym flag.
for flag_column in ('acronym_in_url', 'potential_acronym_in_url'):
    print(accred_df[flag_column].value_counts(normalize=True))

False    0.71937
True     0.28063
Name: acronym_in_url, dtype: float64
False    0.618982
True     0.381018
Name: potential_acronym_in_url, dtype: float64


Picked up an additional 10 percentage points of accredited institutions (from about 28% to about 38%) - cool.

In [None]:
# NOTE(review): unexecuted scratch cell; presumably generic institution words
# to be handled by a later feature - confirm intent or remove.
['College','Institute','School','University']