In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')
mp.jupyter.init()


In [2]:
#pkg = mp.jupyter.open_package()
pkg = mp.jupyter.open_source_package()
pkg

In [3]:
from pathlib import Path


name_rows = []

for f in (Path(pkg.path).parent/'data'/'FR ENR').glob('*.csv'):
    t =f.stem.replace('FR ENR','');
    year = int(t.split()[-1])
    school = t.replace(str(year),'')
    name_rows.append({'year':year, 'campus':school, 'path':f})
    
paths_df = pd.DataFrame(name_rows)

# Check that all of the records are there. 
assert all(paths_df.groupby('year').count().campus == 10)
assert all(paths_df.groupby('campus').count().year == 7)

In [4]:
frames = []


for idx, r in paths_df.iterrows():
    
    df = pd.read_csv(r.path, delimiter='\t', encoding='utf-16').assign(year=r.year, campus=r.campus)

        
    frames.append(df)
    
t = pd.concat(frames).reset_index(drop=True)
t = t.drop(columns=['Calculation1'])
t.columns = ['school', 'city', 'region', 'count', 'all', 'black', 'aian', 'hips', 'nhpi', 
              'asian', 'white', 'na', 'international', 'year', 'campus']
uc_df = t[['year', 'school',  'campus', 'city', 'region', 'count', 'all', 'black', 'aian', 'hips', 'nhpi', 
              'asian', 'white', 'na', 'international']]

def strip_words(v):
    import re
    
    v = v.lower()
    v = re.sub('school$','',v)
    v = v.replace('high','').replace('senior','').replace('hs','')
    v = re.sub('\s+',' ',v)
    
    return v

uc_df['match_school'] = uc_df.school.apply(strip_words)

uc_df.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uc_df['match_school'] = uc_df.school.apply(strip_words)


Unnamed: 0,year,school,campus,city,region,count,all,black,aian,hips,nhpi,asian,white,na,international,match_school
106263,2017,DEL LAGO ACADEMY,Meced,Escondido,San Diego,App,17.0,,,13.0,,,,,,del lago academy
156347,2017,ALISO NIGUEL HIGH SCHOOL,Los Angeles,Aliso Viejo,Orange,Enr,16.0,,,,,6.0,7.0,,,aliso niguel
171512,2019,RESEDA HIGH SCHOOL,Berkeley,Reseda,Los Angeles,Enr,,,,,,,,,,reseda
91200,2018,LOWELL HIGH SCHOOL,Meced,San Francisco,San Francisco,App,84.0,,,16.0,,55.0,5.0,,,lowell
72487,2019,HIGHLAND HIGH SCHOOL,Davis,Bakersfield,Kern,Adm,9.0,,,6.0,,,,,,land
154851,2020,LEADERSHIP PUBLIC SCH HAYWARD,Berkeley,Hayward,Alameda,App,39.0,,,19.0,,16.0,,,,leadership public sch hayward
15029,2020,SCOTTS VALLEY HIGH SCHOOL,Los Angeles,Scotts Valley,Santa Cruz,Enr,7.0,,,,,,4.0,,,scotts valley
69480,2017,FAIRFAX HIGH SCHOOL,San Diego,Los Angeles,Los Angeles,App,68.0,9.0,,23.0,,30.0,5.0,,,fairfax
23575,2022,ANIMO PAT BROWN CHARTER HS,Los Angeles,Los Angeles,Los Angeles,Adm,3.0,,,,,,,,,animo pat brown charter
23930,2022,CENTRAL EAST HIGH SCHOOL,Los Angeles,Fresno,Fresno,Enr,3.0,,,,,,,,,central east


In [5]:
sch_pkg = mp.open_package('http://library.metatab.org/cde.ca.gov-schools-2.1.1.csv')

# Create Dataframes
public_schools_df = sch_pkg.resource('public_schools').dataframe()
pubhs_df = public_schools_df[public_schools_df.eilcode == "HS"]


In [6]:
# Build a dict that maps names to CDS codes
from thefuzz import process
from thefuzz import fuzz

match_map = {}

for county, g in pubhs_df.groupby('county'):
    match_map[county.lower()] = {}
    for idx, r in g.iterrows():
        
        school = strip_words(r.school.lower())
        
        match_map[county.lower()][school] = r.cdscode

In [7]:
def find_match(region, school):
    
    region = region.lower()
    school == school.lower()
    
    name_map = match_map[region]
    
    match  = process.extractOne(school, name_map.keys(), scorer=fuzz.partial_token_sort_ratio)
    
    return {
        'score': match[1],
        'qscore': match[1]//10,
        'match_school': school,
        'match': match[0],
        'region': region,
        'cdscode': name_map[match[0]]
    }

In [8]:
# Use fuzzy matching to find the CDScode from the UC high schools names
from tqdm.auto import tqdm

matches = []
for idx, r in tqdm(list(uc_df[['region','match_school']].drop_duplicates().iterrows())):
    matches.append(find_match(r.region,r.match_school))
    
match_df = pd.DataFrame(matches)

match_df.head()

  0%|          | 0/1675 [00:00<?, ?it/s]

Unnamed: 0,score,qscore,match_school,match,region,cdscode
0,80,8,a b miller,fontana a. b. miller,san bernardino,36677103630555
1,100,10,abraham lincoln,lincoln (abraham),san francisco,38684783833241
2,100,10,abraham lincoln,abraham lincoln,santa clara,43696664333795
3,71,7,academia avance charter,arcadia,los angeles,19642611930288
4,81,8,academic leadership community,animo leadership,los angeles,19647091996313


In [9]:
# Report on what percentage of the matches will be used. 80% look like a good cut off, but there are
# certainly a few false positivies & negatives. 
match_df.groupby(match_df.score >= 80).qscore.count()/len(match_df)

score
False    0.172537
True     0.827463
Name: qscore, dtype: float64

In [10]:
t = uc_df.merge(match_df, on='match_school')
t = t.rename(columns={'school':'uc_school'}).merge(pubhs_df, on='cdscode')

cols = ['year', 'cdscode', 'ncesdist', 'ncesschool',  'school', 'uc_school', 'campus', 
        'city_x', 'city_y', 'region_x', 'county', 'district',
        'count', 'all', 'black', 'aian', 'hips', 'nhpi', 'asian', 'white', 'na', 'international',  
        'region_y',  'charter', 'virtual', 'magnet', 'eilname',  'gsserved',
         'zip', 'latitude', 'longitude' ]

df = t[cols].copy().rename(columns={'city_x':'uc_city','city_y':'cde_city', 'region_x':'county'})

df.sample(10).head()

Unnamed: 0,year,cdscode,ncesdist,ncesschool,school,uc_school,campus,uc_city,cde_city,county,...,international,region_y,charter,virtual,magnet,eilname,gsserved,zip,latitude,longitude
149322,2019,19647331937838,622710,3330,San Pedro Senior High,SAN PEDRO HIGH SCHOOL,Meced,San Pedro,San Pedro,Los Angeles,...,,los angeles,N,C,Y,High School,9-12,90731-3925,33.73059,-118.29906
44194,2022,1612590132688,628050,4255,Dewey Academy High,EAST BAY INNOVATION ACADEMY,Meced,Oakland,Oakland,Alameda,...,,alameda,N,N,N,High School,10-12,94606-2285,37.796705,-122.25817
10399,2016,19647330111583,622710,11636,Animo Jackie Robinson High,ANIMO JACKIE ROBINSON CHRT SCH,Davis,Los Angeles,Los Angeles,Los Angeles,...,,los angeles,Y,N,N,High School,9-12,90007-4333,34.018395,-118.27561
176269,2016,43694274335428,611820,1306,William C. Overfelt High,W C OVERFELT HIGH SCHOOL,Universitywide,San Jose,San Jose,Santa Clara,...,,santa clara,N,C,Y,High School,9-12,95122-1712,37.330593,-121.82838
131351,2019,50712175036256,630030,4703,Patterson High,PATTERSON HIGH SCHOOL,Irvine,Patterson,Patterson,Stanislaus,...,,stanislaus,N,N,N,High School,9-12,95363-2215,37.471617,-121.13662
