In [1]:
import numpy as np
import pandas as pd
# Use the fully qualified option key. Option names are resolved by pattern
# search, and the bare 'max_rows' is ambiguous on pandas versions that also
# define 'styler.render.max_rows', where it raises OptionError.
pd.set_option('display.max_rows', 200)

In [2]:
# Read the three input tables (paths are relative to the working directory).
accred_df, sample_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Accreditation_Data.csv', 'sampleTestFile.csv', 'testFile.csv')
)

# Keep the frames together so later cells can add the same columns to each one.
dfs = [accred_df, sample_df, test_df]

In [3]:
# Row counts of the accreditation, sample and test tables, one per line.
for frame in (accred_df, sample_df, test_df):
    print(len(frame))

4383
100
392


In [4]:
accred_df[:100]

Unnamed: 0,Institution,URL
0,American Commercial College of Texas,acc-careers.com
1,American Commercial College - Abilene,acc-careers.com
2,Anson College of Cosmetology,accschool.peedeeworld.net
3,American College of Medical Technology,acmt.ac/
4,Aviation and Electronic Schools of America - C...,aesa.com
5,American University of Health Sciences,aihs.edu
6,Gerber Akron Beauty School,AkronBeautySchool.com
7,Adult Vocational Services,akronschools.com
8,Alexandria Academy of Beauty Culture,alexacebeauty.com
9,Fortis Institute - Scranton,alliedteched.edu


In [5]:
sample_df[:100]

Unnamed: 0,Institution,URL,isWrong
0,Barton College,bestcarehealth.com,1
1,Northern New Mexico College,nnmcc.edu,0
2,Northern Virginia Community College,southeasterncareercollege.edu,1
3,Suburban Technical School,suburbantech.com,0
4,Stratford University,valleycareercollege.com,1
5,Saddleback Valley U.S.D. - Adult Education,www.ais.edu,1
6,Remington College-Fort Worth Campus,www.babel.edu,1
7,Baker College,www.baker.edu,0
8,Nazareth College of Rochester,www.barton.edu,1
9,Baruch College of the City University of New York,www.baruch.cuny.edu,0


In [6]:
test_df[:100]

Unnamed: 0,Institution,URL,Prediction
0,Paul Mitchell The School - Danbury,americanacademyofcosmetology.com,
1,Colleen O'Hara's Beauty Academy,californiabeautyschool.com,
2,Capri Cosmetology Learning Center,Caprinow.com,
3,Career Beauty College,careerbeautycollege.com,
4,Career Colleges of America - South Gate,CAREERCOLLEGES.ORG,
5,Charleston Cosmetology Institute,careerta.edu,
6,Carsten Institute of Cosmetology,carsteninstitute.com,
7,C.C.B. School of Atlanta,ccbenglish.com/,
8,Charlotte Technical Center,charlottetechcenter.ccps.k12.fl.us,
9,Claremore Beauty College,claremorebeautycollege.com,


First pre-processing step: convert every URL to lowercase so that the substring comparisons below are not affected by case.

In [7]:
# Replace each table's URL column with its lowercase form.
for frame in dfs:
    frame['URL'] = frame['URL'].str.lower()

Dealing with strings is hard, so we clearly need to engineer some variables. The first ones that come to mind: 1) do any words from the institution name appear in the URL? 2) is the institution's acronym contained in the website name? 3) is the name a perfect match (a very strong signal)?

Let's start with the acronym.

In [8]:
def acronym(s):
    """Return the uppercase characters of ``s``, in order, each lowercased and joined.

    Used as the candidate acronym of an institution name,
    e.g. "HeLLo WorLD" -> "hllwld". A string with no uppercase
    characters yields the empty string.
    """
    letters = []
    for ch in s:
        if ch.isupper():
            letters.append(ch.lower())
    return "".join(letters)

print(acronym("HeLLo WorLD"))

hllwld


In [9]:
# Derive the candidate acronym for every institution name in each table.
for frame in dfs:
    frame['acronym'] = frame['Institution'].map(acronym)

In [10]:
accred_df[:150]

Unnamed: 0,Institution,URL,acronym
0,American Commercial College of Texas,acc-careers.com,acct
1,American Commercial College - Abilene,acc-careers.com,acca
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc
3,American College of Medical Technology,acmt.ac/,acmt
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac
5,American University of Health Sciences,aihs.edu,auhs
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs
7,Adult Vocational Services,akronschools.com,avs
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc
9,Fortis Institute - Scranton,alliedteched.edu,fis


Let's see how good a first try this is.

In [11]:
def check(x):
    """Return True when the row's acronym occurs as a substring of its URL.

    ``x`` is a row (or any mapping) with string entries 'acronym' and 'URL'.
    An empty acronym, which ``acronym`` produces for a name with no uppercase
    letters, is never treated as a match: ``'' in url`` is True for every
    URL and would otherwise be counted as a hit.
    """
    candidate = x['acronym']
    return bool(candidate) and candidate in x['URL']

# Flag, row by row, whether the exact acronym appears in the URL.
for frame in dfs:
    frame['acronym_in_url'] = frame.apply(check, axis=1)

In [12]:
accred_df[:20]

Unnamed: 0,Institution,URL,acronym,acronym_in_url
0,American Commercial College of Texas,acc-careers.com,acct,False
1,American Commercial College - Abilene,acc-careers.com,acca,False
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc,True
3,American College of Medical Technology,acmt.ac/,acmt,True
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac,False
5,American University of Health Sciences,aihs.edu,auhs,False
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs,False
7,Adult Vocational Services,akronschools.com,avs,False
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc,False
9,Fortis Institute - Scranton,alliedteched.edu,fis,False


In [13]:
accred_df['acronym_in_url'].value_counts(normalize = True)

False    0.71937
True     0.28063
Name: acronym_in_url, dtype: float64

Sweet - the exact acronym already appears in the URL for 28% of accredited institutions. Let's see if we can improve on this by being more flexible with acronyms.

In [14]:
# Scratch check: the first institution's acronym plus its two trimmed variants
# (first character dropped, last character dropped).
qq = accred_df.loc[0, 'acronym']
rr = [qq, qq[1:], qq[:-1]]
print(rr)

['acct', 'cct', 'acc']


In [15]:
def make_acronyms(X):
    """List the acronym variants to look for in a row's URL.

    ``X`` is a row (or mapping) with a string 'acronym' entry. Acronyms
    shorter than three characters are returned alone. Longer ones are
    returned together with two trimmed forms: one without the first
    character and one without the last.
    """
    full = X['acronym']
    if len(full) < 3:
        return [full]
    return [full, full[1:], full[:-1]]

# Store the list of acronym variants for every row of each table.
for frame in dfs:
    frame['potential_acronyms'] = frame.apply(make_acronyms, axis=1)

In [16]:
accred_df

Unnamed: 0,Institution,URL,acronym,acronym_in_url,potential_acronyms
0,American Commercial College of Texas,acc-careers.com,acct,False,"[acct, cct, acc]"
1,American Commercial College - Abilene,acc-careers.com,acca,False,"[acca, cca, acc]"
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc,True,"[acc, cc, ac]"
3,American College of Medical Technology,acmt.ac/,acmt,True,"[acmt, cmt, acm]"
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac,False,"[aesac, esac, aesa]"
5,American University of Health Sciences,aihs.edu,auhs,False,"[auhs, uhs, auh]"
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs,False,"[gabs, abs, gab]"
7,Adult Vocational Services,akronschools.com,avs,False,"[avs, vs, av]"
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc,False,"[aabc, abc, aab]"
9,Fortis Institute - Scranton,alliedteched.edu,fis,False,"[fis, is, fi]"


In [17]:
def check_any_acronym(x):
    """Return True when any entry of x['potential_acronyms'] is a substring of x['URL'].

    ``x`` is a row (or mapping) holding a string 'URL' and a list of
    candidate strings under 'potential_acronyms'. Empty candidates are
    skipped, because ``'' in url`` is True for every URL and would turn a
    name without uppercase letters into a guaranteed match. An empty
    candidate list yields False.
    """
    url = x['URL']
    # Generator rather than a list, so any() stops at the first hit.
    return any(ac and ac in url for ac in x['potential_acronyms'])

In [18]:
# Flag, row by row, whether any acronym variant appears in the URL.
for frame in dfs:
    frame['potential_acronym_in_url'] = frame.apply(check_any_acronym, axis=1)

In [19]:
sample_df[:100]

Unnamed: 0,Institution,URL,isWrong,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url
0,Barton College,bestcarehealth.com,1,bc,False,[bc],False
1,Northern New Mexico College,nnmcc.edu,0,nnmc,True,"[nnmc, nmc, nnm]",True
2,Northern Virginia Community College,southeasterncareercollege.edu,1,nvcc,False,"[nvcc, vcc, nvc]",False
3,Suburban Technical School,suburbantech.com,0,sts,False,"[sts, ts, st]",False
4,Stratford University,valleycareercollege.com,1,su,False,[su],False
5,Saddleback Valley U.S.D. - Adult Education,www.ais.edu,1,svusdae,False,"[svusdae, vusdae, svusda]",False
6,Remington College-Fort Worth Campus,www.babel.edu,1,rcfwc,False,"[rcfwc, cfwc, rcfw]",False
7,Baker College,www.baker.edu,0,bc,False,[bc],False
8,Nazareth College of Rochester,www.barton.edu,1,ncr,False,"[ncr, cr, nc]",False
9,Baruch College of the City University of New York,www.baruch.cuny.edu,0,bccuny,False,"[bccuny, ccuny, bccun]",False


In [20]:
# Share of accredited institutions matched by the exact acronym, then by any variant.
for column in ('acronym_in_url', 'potential_acronym_in_url'):
    print(accred_df[column].value_counts(normalize=True))

False    0.71937
True     0.28063
Name: acronym_in_url, dtype: float64
False    0.618982
True     0.381018
Name: potential_acronym_in_url, dtype: float64


Picked up an additional 10 percentage points of accredited institutions (28% -> 38%) - cool.

### Step 2: check for the presence of whole words

In [21]:
# Candidate stop-word list: school-type words, articles and separator tokens.
# NOTE(review): no visible later cell reads this list; get_keywords and
# get_keywords_noschooltypes each define their own omit list. Confirm it is
# unused before removing it.
omitted_words = ['college','institute','school','university','the','of',',','-']

In [22]:
accred_df.loc[[0]]

Unnamed: 0,Institution,URL,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url
0,American Commercial College of Texas,acc-careers.com,acct,False,"[acct, cct, acc]",True


In [23]:
def get_keywords_noschooltypes(x):
    """Split a name into lowercase words, dropping filler and school-type words.

    The name is lowercased and split on whitespace. Tokens equal to a filler
    word ('the', 'of', 'and'), a stand-alone ',' or '-', or a generic
    school-type word are removed. Punctuation attached to a word stays part
    of that word.
    """
    skipped = {
        'college', 'institute', 'school', 'schools', 'university',
        'the', 'of', ',', '-', 'and',
    }
    return [token for token in x.lower().split() if token not in skipped]

In [24]:
def get_keywords(x):
    """Split a name into lowercase words, dropping only filler words.

    The name is lowercased and split on whitespace. Tokens equal to 'the',
    'of', 'and', or a stand-alone ',' or '-' are removed. School-type words
    such as 'college' are kept.
    """
    fillers = ('the', 'of', ',', '-', 'and')
    kept = []
    for token in x.lower().split():
        if token in fillers:
            continue
        kept.append(token)
    return kept

In [25]:
# Build both keyword lists (with and without school-type words) for each table.
for frame in dfs:
    names = frame['Institution']
    frame['keywords'] = names.apply(get_keywords)
    frame['keywords_noschooltypes'] = names.apply(get_keywords_noschooltypes)

In [26]:
accred_df

Unnamed: 0,Institution,URL,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url,keywords,keywords_noschooltypes
0,American Commercial College of Texas,acc-careers.com,acct,False,"[acct, cct, acc]",True,"[american, commercial, college, texas]","[american, commercial, texas]"
1,American Commercial College - Abilene,acc-careers.com,acca,False,"[acca, cca, acc]",True,"[american, commercial, college, abilene]","[american, commercial, abilene]"
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc,True,"[acc, cc, ac]",True,"[anson, college, cosmetology]","[anson, cosmetology]"
3,American College of Medical Technology,acmt.ac/,acmt,True,"[acmt, cmt, acm]",True,"[american, college, medical, technology]","[american, medical, technology]"
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac,False,"[aesac, esac, aesa]",True,"[aviation, electronic, schools, america, colfax]","[aviation, electronic, america, colfax]"
5,American University of Health Sciences,aihs.edu,auhs,False,"[auhs, uhs, auh]",False,"[american, university, health, sciences]","[american, health, sciences]"
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs,False,"[gabs, abs, gab]",False,"[gerber, akron, beauty, school]","[gerber, akron, beauty]"
7,Adult Vocational Services,akronschools.com,avs,False,"[avs, vs, av]",False,"[adult, vocational, services]","[adult, vocational, services]"
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc,False,"[aabc, abc, aab]",False,"[alexandria, academy, beauty, culture]","[alexandria, academy, beauty, culture]"
9,Fortis Institute - Scranton,alliedteched.edu,fis,False,"[fis, is, fi]",False,"[fortis, institute, scranton]","[fortis, scranton]"


In [27]:
## DataFrame.apply(axis=1) hands each function a whole row, so both checkers
## take the row and differ only in which keyword column they read.
def _any_listed_word_in_url(row, column):
    """Return True when at least one string in row[column] is a substring of row['URL']."""
    hits = [word in row['URL'] for word in row[column]]
    return any(hits)

def check_any_keyword(x):
    """Return True when any word in x['keywords'] occurs in x['URL']."""
    return _any_listed_word_in_url(x, 'keywords')

def check_any_keyword_noschooltypes(x):
    """Return True when any word in x['keywords_noschooltypes'] occurs in x['URL']."""
    return _any_listed_word_in_url(x, 'keywords_noschooltypes')

In [28]:
# Flag, row by row, whether any keyword from each list appears in the URL.
for frame in dfs:
    frame['keyword_in_url'] = frame.apply(check_any_keyword, axis=1)
    frame['keyword_in_url_noschooltypes'] = frame.apply(check_any_keyword_noschooltypes, axis=1)

In [29]:
accred_df

Unnamed: 0,Institution,URL,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url,keywords,keywords_noschooltypes,keyword_in_url,keyword_in_url_noschooltypes
0,American Commercial College of Texas,acc-careers.com,acct,False,"[acct, cct, acc]",True,"[american, commercial, college, texas]","[american, commercial, texas]",False,False
1,American Commercial College - Abilene,acc-careers.com,acca,False,"[acca, cca, acc]",True,"[american, commercial, college, abilene]","[american, commercial, abilene]",False,False
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc,True,"[acc, cc, ac]",True,"[anson, college, cosmetology]","[anson, cosmetology]",False,False
3,American College of Medical Technology,acmt.ac/,acmt,True,"[acmt, cmt, acm]",True,"[american, college, medical, technology]","[american, medical, technology]",False,False
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac,False,"[aesac, esac, aesa]",True,"[aviation, electronic, schools, america, colfax]","[aviation, electronic, america, colfax]",False,False
5,American University of Health Sciences,aihs.edu,auhs,False,"[auhs, uhs, auh]",False,"[american, university, health, sciences]","[american, health, sciences]",False,False
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs,False,"[gabs, abs, gab]",False,"[gerber, akron, beauty, school]","[gerber, akron, beauty]",True,True
7,Adult Vocational Services,akronschools.com,avs,False,"[avs, vs, av]",False,"[adult, vocational, services]","[adult, vocational, services]",False,False
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc,False,"[aabc, abc, aab]",False,"[alexandria, academy, beauty, culture]","[alexandria, academy, beauty, culture]",True,True
9,Fortis Institute - Scranton,alliedteched.edu,fis,False,"[fis, is, fi]",False,"[fortis, institute, scranton]","[fortis, scranton]",False,False


In [30]:
# Share of accredited institutions matched by keywords, then by acronym variants.
for column in ('keyword_in_url_noschooltypes', 'potential_acronym_in_url'):
    print(accred_df[column].value_counts(normalize=True))

True     0.550764
False    0.449236
Name: keyword_in_url_noschooltypes, dtype: float64
False    0.618982
True     0.381018
Name: potential_acronym_in_url, dtype: float64


In [31]:
# A URL counts as matching when either the keyword check or the acronym check fired.
for frame in dfs:
    keyword_hit = frame['keyword_in_url_noschooltypes']
    acronym_hit = frame['potential_acronym_in_url']
    frame['matching_url'] = keyword_hit | acronym_hit

In [32]:
accred_df

Unnamed: 0,Institution,URL,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url,keywords,keywords_noschooltypes,keyword_in_url,keyword_in_url_noschooltypes,matching_url
0,American Commercial College of Texas,acc-careers.com,acct,False,"[acct, cct, acc]",True,"[american, commercial, college, texas]","[american, commercial, texas]",False,False,True
1,American Commercial College - Abilene,acc-careers.com,acca,False,"[acca, cca, acc]",True,"[american, commercial, college, abilene]","[american, commercial, abilene]",False,False,True
2,Anson College of Cosmetology,accschool.peedeeworld.net,acc,True,"[acc, cc, ac]",True,"[anson, college, cosmetology]","[anson, cosmetology]",False,False,True
3,American College of Medical Technology,acmt.ac/,acmt,True,"[acmt, cmt, acm]",True,"[american, college, medical, technology]","[american, medical, technology]",False,False,True
4,Aviation and Electronic Schools of America - C...,aesa.com,aesac,False,"[aesac, esac, aesa]",True,"[aviation, electronic, schools, america, colfax]","[aviation, electronic, america, colfax]",False,False,True
5,American University of Health Sciences,aihs.edu,auhs,False,"[auhs, uhs, auh]",False,"[american, university, health, sciences]","[american, health, sciences]",False,False,False
6,Gerber Akron Beauty School,akronbeautyschool.com,gabs,False,"[gabs, abs, gab]",False,"[gerber, akron, beauty, school]","[gerber, akron, beauty]",True,True,True
7,Adult Vocational Services,akronschools.com,avs,False,"[avs, vs, av]",False,"[adult, vocational, services]","[adult, vocational, services]",False,False,False
8,Alexandria Academy of Beauty Culture,alexacebeauty.com,aabc,False,"[aabc, abc, aab]",False,"[alexandria, academy, beauty, culture]","[alexandria, academy, beauty, culture]",True,True,True
9,Fortis Institute - Scranton,alliedteched.edu,fis,False,"[fis, is, fi]",False,"[fortis, institute, scranton]","[fortis, scranton]",False,False,False


In [34]:
# Match rate of the combined flag in the accreditation, sample and test tables.
for frame in (accred_df, sample_df, test_df):
    print(frame['matching_url'].value_counts(normalize=True))

True     0.85535
False    0.14465
Name: matching_url, dtype: float64
False    0.51
True     0.49
Name: matching_url, dtype: float64
False    0.502551
True     0.497449
Name: matching_url, dtype: float64


In [35]:
# Accredited institutions whose URL matched neither a keyword nor an acronym variant.
accred_df[~accred_df['matching_url']]

Unnamed: 0,Institution,URL,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url,keywords,keywords_noschooltypes,keyword_in_url,keyword_in_url_noschooltypes,matching_url
5,American University of Health Sciences,aihs.edu,auhs,False,"[auhs, uhs, auh]",False,"[american, university, health, sciences]","[american, health, sciences]",False,False,False
7,Adult Vocational Services,akronschools.com,avs,False,"[avs, vs, av]",False,"[adult, vocational, services]","[adult, vocational, services]",False,False,False
9,Fortis Institute - Scranton,alliedteched.edu,fis,False,"[fis, is, fi]",False,"[fortis, institute, scranton]","[fortis, scranton]",False,False,False
27,Career Academy of Hair Design,beautyschool.edu,cahd,False,"[cahd, ahd, cah]",False,"[career, academy, hair, design]","[career, academy, hair, design]",False,False,False
31,Ohio Center for Broadcasting - Colorado Campus,beonair.com,ocbcc,False,"[ocbcc, cbcc, ocbc]",False,"[ohio, center, for, broadcasting, colorado, ca...","[ohio, center, for, broadcasting, colorado, ca...",False,False,False
34,Ohio State School of Cosmetology,blottsalonschools.com,ossc,False,"[ossc, ssc, oss]",False,"[ohio, state, school, cosmetology]","[ohio, state, cosmetology]",True,False,False
36,Ben Franklin Career & Technical Education Center,boe.kana.k12.wv.us/,bfctec,False,"[bfctec, fctec, bfcte]",False,"[ben, franklin, career, &, technical, educatio...","[ben, franklin, career, &, technical, educatio...",False,False,False
37,Monongalia County Technical Education Center,boe.mono.k12.wv.us/montec,mctec,False,"[mctec, ctec, mcte]",False,"[monongalia, county, technical, education, cen...","[monongalia, county, technical, education, cen...",False,False,False
42,Marinello School of Beauty,brioacademy.com,msb,False,"[msb, sb, ms]",False,"[marinello, school, beauty]","[marinello, beauty]",False,False,False
44,Everest College - Los Angeles,bryman-college.com,ecla,False,"[ecla, cla, ecl]",False,"[everest, college, los, angeles]","[everest, los, angeles]",True,False,False


In [36]:
sample_df

Unnamed: 0,Institution,URL,isWrong,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url,keywords,keywords_noschooltypes,keyword_in_url,keyword_in_url_noschooltypes,matching_url
0,Barton College,bestcarehealth.com,1,bc,False,[bc],False,"[barton, college]",[barton],False,False,False
1,Northern New Mexico College,nnmcc.edu,0,nnmc,True,"[nnmc, nmc, nnm]",True,"[northern, new, mexico, college]","[northern, new, mexico]",False,False,True
2,Northern Virginia Community College,southeasterncareercollege.edu,1,nvcc,False,"[nvcc, vcc, nvc]",False,"[northern, virginia, community, college]","[northern, virginia, community]",True,False,False
3,Suburban Technical School,suburbantech.com,0,sts,False,"[sts, ts, st]",False,"[suburban, technical, school]","[suburban, technical]",True,True,True
4,Stratford University,valleycareercollege.com,1,su,False,[su],False,"[stratford, university]",[stratford],False,False,False
5,Saddleback Valley U.S.D. - Adult Education,www.ais.edu,1,svusdae,False,"[svusdae, vusdae, svusda]",False,"[saddleback, valley, u.s.d., adult, education]","[saddleback, valley, u.s.d., adult, education]",False,False,False
6,Remington College-Fort Worth Campus,www.babel.edu,1,rcfwc,False,"[rcfwc, cfwc, rcfw]",False,"[remington, college-fort, worth, campus]","[remington, college-fort, worth, campus]",False,False,False
7,Baker College,www.baker.edu,0,bc,False,[bc],False,"[baker, college]",[baker],True,True,True
8,Nazareth College of Rochester,www.barton.edu,1,ncr,False,"[ncr, cr, nc]",False,"[nazareth, college, rochester]","[nazareth, rochester]",False,False,False
9,Baruch College of the City University of New York,www.baruch.cuny.edu,0,bccuny,False,"[bccuny, ccuny, bccun]",False,"[baruch, college, city, university, new, york]","[baruch, city, new, york]",True,True,True


Thoughts: mostly pretty good. Possible improvements:
- add some partial string matching
- expand the list of partial acronyms a little (include lowercase letters, drop the last two letters)
- strip apostrophes from names before matching

In [37]:
sample_df[['matching_url','isWrong']]

Unnamed: 0,matching_url,isWrong
0,False,1
1,True,0
2,False,1
3,True,0
4,False,1
5,False,1
6,False,1
7,True,0
8,False,1
9,True,0


In [42]:
# Correlate only the numeric and boolean columns. The frame also holds string
# and list columns, and pandas >= 2.0 no longer drops those silently in corr();
# it raises instead. Selecting the columns explicitly works on every version.
sample_df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,isWrong,acronym_in_url,potential_acronym_in_url,keyword_in_url,keyword_in_url_noschooltypes,matching_url
isWrong,1.0,-0.311873,-0.429969,-0.595477,-0.659082,-0.861662
acronym_in_url,-0.311873,1.0,0.707283,-0.227969,-0.201431,0.394366
potential_acronym_in_url,-0.429969,0.707283,1.0,-0.112872,-0.069279,0.557578
keyword_in_url,-0.595477,-0.227969,-0.112872,1.0,0.914659,0.598453
keyword_in_url_noschooltypes,-0.659082,-0.201431,-0.069279,0.914659,1.0,0.699854
matching_url,-0.861662,0.394366,0.557578,0.598453,0.699854,1.0


In [49]:
sample_df

Unnamed: 0,Institution,URL,isWrong,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url,keywords,keywords_noschooltypes,keyword_in_url,keyword_in_url_noschooltypes,matching_url,correct_prediction
0,Barton College,bestcarehealth.com,1,bc,False,[bc],False,"[barton, college]",[barton],False,False,False,True
1,Northern New Mexico College,nnmcc.edu,0,nnmc,True,"[nnmc, nmc, nnm]",True,"[northern, new, mexico, college]","[northern, new, mexico]",False,False,True,False
2,Northern Virginia Community College,southeasterncareercollege.edu,1,nvcc,False,"[nvcc, vcc, nvc]",False,"[northern, virginia, community, college]","[northern, virginia, community]",True,False,False,True
3,Suburban Technical School,suburbantech.com,0,sts,False,"[sts, ts, st]",False,"[suburban, technical, school]","[suburban, technical]",True,True,True,False
4,Stratford University,valleycareercollege.com,1,su,False,[su],False,"[stratford, university]",[stratford],False,False,False,True
5,Saddleback Valley U.S.D. - Adult Education,www.ais.edu,1,svusdae,False,"[svusdae, vusdae, svusda]",False,"[saddleback, valley, u.s.d., adult, education]","[saddleback, valley, u.s.d., adult, education]",False,False,False,True
6,Remington College-Fort Worth Campus,www.babel.edu,1,rcfwc,False,"[rcfwc, cfwc, rcfw]",False,"[remington, college-fort, worth, campus]","[remington, college-fort, worth, campus]",False,False,False,True
7,Baker College,www.baker.edu,0,bc,False,[bc],False,"[baker, college]",[baker],True,True,True,False
8,Nazareth College of Rochester,www.barton.edu,1,ncr,False,"[ncr, cr, nc]",False,"[nazareth, college, rochester]","[nazareth, rochester]",False,False,False,True
9,Baruch College of the City University of New York,www.baruch.cuny.edu,0,bccuny,False,"[bccuny, ccuny, bccun]",False,"[baruch, college, city, university, new, york]","[baruch, city, new, york]",True,True,True,False


In [53]:
# Predict "wrong" exactly when the URL matched neither check, then compare
# that prediction with the labelled isWrong flag.
predicted_wrong = ~sample_df['matching_url']
sample_df['correct_prediction'] = sample_df['isWrong'].eq(predicted_wrong)

In [55]:
sample_df['correct_prediction'].value_counts()

True     93
False     7
Name: correct_prediction, dtype: int64

In [58]:
# Sample rows where the prediction disagrees with the isWrong label.
sample_df[~sample_df['correct_prediction']]

Unnamed: 0,Institution,URL,isWrong,acronym,acronym_in_url,potential_acronyms,potential_acronym_in_url,keywords,keywords_noschooltypes,keyword_in_url,keyword_in_url_noschooltypes,matching_url,correct_prediction
13,Benedictine University,www.ben.edu,0,bu,False,[bu],False,"[benedictine, university]",[benedictine],False,False,False,False
33,The Master's College and Seminary,www.masters.edu,0,tmcs,False,"[tmcs, mcs, tmc]",False,"[master's, college, seminary]","[master's, seminary]",False,False,False,False
42,New Mexico Institute of Mining and Technology,www.nmt.edu,0,nmimt,False,"[nmimt, mimt, nmim]",False,"[new, mexico, institute, mining, technology]","[new, mexico, mining, technology]",False,False,False,False
53,Randolph-Macon Woman's College,www.randolphcollege.edu,0,rmwc,False,"[rmwc, mwc, rmw]",False,"[randolph-macon, woman's, college]","[randolph-macon, woman's]",True,False,False,False
63,Best Care College,www.rop.cc,1,bcc,False,"[bcc, cc, bc]",True,"[best, care, college]","[best, care]",False,False,True,False
68,Sunstate Academy,www.sandiegojobcorps.org,1,sa,True,[sa],True,"[sunstate, academy]","[sunstate, academy]",False,False,True,False
94,Teachers College of Columbia University,www.tc.edu,0,tccu,False,"[tccu, ccu, tcc]",False,"[teachers, college, columbia, university]","[teachers, columbia]",False,False,False,False
