In [1]:
import pandas as pd
%matplotlib inline
import pylab as plt
import numpy as np
import scipy as sc
import scipy.stats

In [2]:
from difflib import SequenceMatcher

# Finding school CDS numbers

Unfortunately, our main dataset came without California CDS numbers, which uniquely identify high schools. In this file we will match schools with their CDS numbers using their names.

The `packed` data frame is the main dataset we are trying to extend.
The `ap_scores` data frame contains the CDS numbers with school names. 

In [3]:
# Main dataset that this notebook extends with school numbers.
packed = pd.read_csv('data/distances.csv')
# AP score data; provides the school_name / school_num pairs used for matching.
ap_scores = pd.read_csv('data/test_scores/ap/processed.csv')

First we will extract the school names from both datasets.

In [4]:
# Restrict the main dataset to California rows; only these are matched to AP names.
cal_schools = packed[packed['state'] == 'California']
# Distinct California school names in sorted order. Missing names are dropped
# so that sorting never has to compare NaN with strings.
main_df_school_names = sorted(cal_schools['school'].dropna().unique())
# Distinct AP school names in order of first appearance. dropna() removes every
# missing value, whereas the identity test `name is not np.nan` only catches the
# np.nan singleton and lets None or other NaN objects through.
ap_df_school_names = list(ap_scores['school_name'].dropna().unique())

In [5]:
display(packed.head(1))
display(ap_scores.head(1))

Unnamed: 0,campus,year,school,school_num,city,county,state,country,region,ethnicity,app_num,adm_num,enr_num,app_gpa,adm_gpa,enr_gpa,distance
0,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,51520.0,Los Angeles,Los Angeles,California,USA,Los Angeles,All,14.0,,,3.62,,,601648.0


Unnamed: 0,ap_num_scr1,ap_num_scr2,ap_num_scr3,ap_num_scr4,ap_num_scr5,ap_num_test_takers,enroll12,school_name,school_num,year
0,0.0,0.0,0.0,0.0,0.0,0.0,224.0,Juvenile Hall/Court,130401.0,1998.0


Next, we will use the built-in Python `SequenceMatcher` to find which school names from `packed` correspond with which school names from `ap_scores`.

Note:
 - We converted all the names to lowercase, since the matcher is case sensitive
 - We removed the words *high* and *school* since they appear on and off randomly in the school names.
 - Our script outputs the currently processed school name every 50 school names. This is used to monitor the progress of the script

In [7]:
# Best AP-name match found for each main-dataset school name, keyed by that name.
matchings = {}
# Count of school names processed so far; used for progress reporting.
i = 0
# Total number of school names to process.
count = len(main_df_school_names)

def preprocess_school_name(school_name):
    """Normalise a school name before similarity scoring.

    The name is lowercased and every occurrence of the substrings "high"
    and "school" is removed. Removal is by substring, not by whole word,
    and surrounding whitespace is left untouched.
    """
    normalized = school_name.lower()
    for filler in ("high", "school"):
        normalized = normalized.replace(filler, "")
    return normalized

# Preprocess every AP name once up front instead of once per (school, candidate)
# pair. The strings being compared, and therefore the scores, are unchanged.
ap_candidates = [(name, preprocess_school_name(name)) for name in ap_df_school_names]

for i, hs_name in enumerate(main_df_school_names, start=1):
    hs_key = preprocess_school_name(hs_name)
    highest_score = -1
    chosen = None
    for match_with, match_key in ap_candidates:
        # Argument order stays (main name, AP name): ratio() may depend on it.
        similarity_score = SequenceMatcher(None, hs_key, match_key).ratio()
        # Strict '>' keeps the first candidate among equally scored ones.
        if similarity_score > highest_score:
            highest_score = similarity_score
            chosen = match_with
    matchings[hs_name] = {'match': chosen, 'score': highest_score}
    # Progress report every 50 school names.
    if i % 50 == 0:
        print(i, count, hs_name, chosen, highest_score)

matchings

50 860 ALLIANCE TED K TAJIMA HIGH SCH Alliance Dr. Olga Mohan High 0.56
100 860 APPLIED TECHNOLOGY CENTER Applied Technology Center 1.0
150 860 BALDWIN PARK HIGH SCHOOL Baldwin Park High 0.9629629629629629
200 860 BRANHAM HIGH SCHOOL Branham High 0.9411764705882353
250 860 CALVARY MURRIETA CHRISTIAN SCH Murrieta Mesa High 0.5454545454545454
300 860 CESAR CHAVEZ HIGH SCHOOL Cesar Chavez High 0.9629629629629629
350 860 COAST UNION HIGH SCHOOL Coast Union High 0.96
400 860 CRISTO REY HS-SACRAMENTO West Sacramento School Fo 0.6046511627906976
450 860 DIXON HIGH SCHOOL Dixon High 0.9230769230769231
500 860 EL CAMINO FUNDAMENTAL HS El Camino Fundamental High 0.9565217391304348
550 860 EXCELSIOR CHARTER SCHOOLS Excelsior Charter 0.9444444444444444
600 860 FRONTIER HIGH SCHOOL Frontier High 0.9473684210526315
650 860 GRAUER SCHOOL Grace High 0.7692307692307693
700 860 HESPERIA CHRISTIAN SCHOOL Hesperia High School 0.6896551724137931
750 860 INDERKUM HIGH SCHOOL Inderkum High 0.9473684210526315

{'A B MILLER HIGH SCHOOL': {'match': 'Fontana A. B. Miller High',
  'score': 0.6666666666666666},
 'ABC CHRISTIAN SCHOOL': {'match': 'ACE Charter High',
  'score': 0.6153846153846154},
 'ABRAHAM LINCOLN HIGH SCHOOL': {'match': 'Abraham Lincoln High',
  'score': 0.9696969696969697},
 'ACACIAWOOD SCHOOL': {'match': 'Arcadia High', 'score': 0.631578947368421},
 'ACAD FOR ACADEMIC EXCELLENCE': {'match': 'Academy for Academic Excellence',
  'score': 0.9491525423728814},
 'ACADEMIA AVANCE CHARTER SCHOOL': {'match': 'Academia Avance Charter',
  'score': 0.9787234042553191},
 'ACADEMIC LEADERSHIP COMMUNITY': {'match': 'Academic Leadership Community',
  'score': 1.0},
 'ACADEMIES EDUCATION/EMPOWERMNT': {'match': 'Academies of Education and Empowerm',
  'score': 0.8307692307692308},
 'ACADEMY MED HEALTH SCI AT RHS': {'match': 'Academy of Medical & Health Sciences at',
  'score': 0.7352941176470589},
 'ACADEMY MULTILINGUAL ARTS SCI': {'match': 'Academy for Multiligual Arts and Sc',
  'score': 0.8

We will filter out all the matches with a score of 0.75 or lower.

In [8]:
# Keep only matches scoring strictly above 0.75; a score of exactly 0.75 is dropped.
good_matches = {k: v for k, v in matchings.items() if v['score'] > .75}

Only about 2/3 of the California schools were properly matched

In [9]:
len(good_matches)

617

Next, we will perform a visual inspection to filter out wrong matches that passed our filtering

In [10]:
{k: v['match'] for k,v in good_matches.items()}

{'ABRAHAM LINCOLN HIGH SCHOOL': 'Abraham Lincoln High',
 'ACAD FOR ACADEMIC EXCELLENCE': 'Academy for Academic Excellence',
 'ACADEMIA AVANCE CHARTER SCHOOL': 'Academia Avance Charter',
 'ACADEMIC LEADERSHIP COMMUNITY': 'Academic Leadership Community',
 'ACADEMIES EDUCATION/EMPOWERMNT': 'Academies of Education and Empowerm',
 'ACADEMY MULTILINGUAL ARTS SCI': 'Academy for Multiligual Arts and Sc',
 'ACADEMY OF CAREERS/EXPLORATION': 'Academy of Careers & Exploration',
 'ACADEMY OF MEDICAL ARTS AT CARSON': 'Academy of Medical Arts at Carson High',
 'ACADEMY OF THE CANYONS': 'Academy of the Canyons',
 'ACADEMY OF THE REDWOODS': 'Academy of the Redwoods',
 'ACADEMY SCIENCE & ENGINEERING': 'Academy of Science and Engineering',
 'ACALANES HIGH SCHOOL': 'Acalanes High',
 'ACE CHARTER HIGH SCHOOL': 'ACE Charter High',
 'ADELANTO HIGH SCHOOL': 'Adelanto High',
 'ADOLFO CAMARILLO HIGH SCHOOL': 'Adolfo Camarillo High',
 'ADRIAN C WILCOX HIGH SCHOOL': 'Adrian Wilcox High',
 'AGOURA HIGH SCHOOL': 'A

In [11]:
# Main-dataset school names that are removed from good_matches after visual
# inspection: they passed the score filter but their match was judged wrong.
wrong_matches = [
    'LE GRAND UNION HIGH SCHOOL',
    'LAGUNA BLANCA SCHOOL',
    'LA SIERRA ACADEMY',
    'HORIZON JUNIOR SENIOR HS',
    'HORIZON CHRISTIAN ACADEMY',
    'HIGH BLUFF ACADEMY',
    'HARKER SCHOOL',
    'GRAUER SCHOOL',
    'GEORGE WASHINGTON PREP HS',
    'FELICITAS GONZALO MENDEZ LC1B',
    'FAIRMONT PREPARATORY ACADEMY',
    'EXCEL COLLEGE PREPARATORY HS',
    'ENGINEERING & TECH ACAD AT THS',
    'EL CAMINO HIGH',
    'EL CAMINO HIGH SCHOOL',
    'EAST BAY WALDORF SCHOOL',
    'DESERT CHRISTIAN ACADEMY',
    'DELPHI ACADEMY',
    'DAV STARR JORDAN HIGH SCHOOL',
    'DAVID STARR JORDAN HS',
    'CROSSROADS CHRISTIAN SCHOOL',
    'COMPTON SENIOR HIGH SCHOOL',
    'COLLEGE PREPARATORY SCHOOL',
    'ANACAPA SCHOOL',
    'ARROWSMITH ACADEMY',
    'ARROYO PACIFIC ACADEMY',
    'BELL GARDENS SENIOR HS',
    'BELLARMINE COLLEGE PREPARATORY',
    'BELMONT COLLEGE PREP SCHOOL',
    'BELMONT HIGH SCHOOL',
    'BISHOP QUINN HIGH SCHOOL',
    'BISHOPS SCHOOL',
    'BONITA VISTA HIGH SCHOOL',
    'BRENTWOOD SCHOOL',
    'BRIDGEMONT HIGH SCHOOL',
    'BRIDGES ACADEMY',
    'CAL COAST ACADEMY',
    'CALIFORNIA VIRTUAL ACADEMY LA',
    'CALIFORNIA VIRTUAL ACADEMY SD',
    'CALIFORNIA VIRTUAL ACADEMY SM',
    'CALIFORNIA VIRTUAL ACD-LA HIGH',
    'CAMBRIAN ACADEMY',
    'CAPISTRANO VALLEY CHRSTN SCH',
    'CENTRAL VALLEY CHRISTIAN HS',
    'CHAMINADE COLLEGE PREPARATORY',
    'CHICO SENIOR HIGH SCHOOL',
    'CHRISTBRIDGE ACADEMY',
    'CLEVELAND HIGH SCHOOL'
]

In [12]:
# Remove the manually rejected names. pop() with a default, unlike `del`, does
# not raise KeyError when a name is already gone, so this cell can be re-run.
for wrong_match in wrong_matches:
    good_matches.pop(wrong_match, None)

We end up with 569 good matches and 291 bad matches.

In [13]:
len(good_matches)

569

In [14]:
# Every candidate match that is not in good_matches: those at or below the
# score threshold plus those removed by hand.
bad_matches = {k: v for k, v in matchings.items() if k not in good_matches}

In [15]:
len(bad_matches)

291

We perform a quick visual check to make sure most of our bad matches are indeed incorrect.

In [16]:
bad_matches

{'A B MILLER HIGH SCHOOL': {'match': 'Fontana A. B. Miller High',
  'score': 0.6666666666666666},
 'ABC CHRISTIAN SCHOOL': {'match': 'ACE Charter High',
  'score': 0.6153846153846154},
 'ACACIAWOOD SCHOOL': {'match': 'Arcadia High', 'score': 0.631578947368421},
 'ACADEMY MED HEALTH SCI AT RHS': {'match': 'Academy of Medical & Health Sciences at',
  'score': 0.7352941176470589},
 'ACADEMY OF SCIENTIFIC EXPLORAT': {'match': 'Academy of Careers and Explora',
  'score': 0.7},
 'ACADEMY OUR LADY OF PEACE': {'match': 'Academy For Academic Exce',
  'score': 0.68},
 'ACADEMY-SAN FRAN @ MCATEER': {'match': 'Academy (The)- SF @McAteer',
  'score': 0.7307692307692307},
 'AGBU MANOOGIAN-DEMIRDJIAN SCH': {'match': 'Village Academy High School at Indian Hi',
  'score': 0.5084745762711864},
 'AGBU VATCHE AND TAMAR MANOUKIAN HS': {'match': 'ARTLAB at Sonia Sotomayor Learning',
  'score': 0.5},
 'AL ARQAM COLL PREPARATORY SCH': {'match': 'NAVA College Preparatory Academy',
  'score': 0.6885245901639344

Next, we append the new school number to our main dataset. Then, we save it as a `.csv` file.

In [17]:
# First school_num listed for each AP school name, in ap_scores row order.
# This is the same value the per-name lookup `list(...)[0]` selected.
ap_name_to_num = (
    ap_scores
    .dropna(subset=['school_name'])
    .drop_duplicates(subset='school_name')
    .set_index('school_name')['school_num']
)
# Main-dataset school name -> school_num of its accepted AP match.
school_to_cds = {k: ap_name_to_num[v['match']] for k, v in good_matches.items()}

# Matches were computed from California rows only, so only California rows may
# receive a number; a same-named school in another state must stay NaN.
# NOTE(review): matching is by name alone, so distinct California schools that
# share a name still receive the same number.
is_california = packed['state'] == 'California'
# The existing school_num column is replaced, as before.
packed['school_num'] = np.nan
# Names without an accepted match are absent from the dict and map to NaN.
packed.loc[is_california, 'school_num'] = (
    packed.loc[is_california, 'school'].map(school_to_cds)
)
packed

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
          ..
341754   NaN
341755   NaN
341756   NaN
341757   NaN
341758   NaN
341759   NaN
341760   NaN
341761   NaN
341762   NaN
341763   NaN
341764   NaN
341765   NaN
341766   NaN
341767   NaN
341768   NaN
341769   NaN
341770   NaN
341771   NaN
341772   NaN
341773   NaN
341774   NaN
341775   NaN
341776   NaN
341777   NaN
341778   NaN
341779   NaN
341780   NaN
341781   NaN
341782   NaN
341783   NaN
Name: school_num, Length: 341784, dtype: float64

Unnamed: 0,campus,year,school,school_num,city,county,state,country,region,ethnicity,app_num,adm_num,enr_num,app_gpa,adm_gpa,enr_gpa,distance
0,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,3833241.0,Los Angeles,Los Angeles,California,USA,Los Angeles,All,14.0,,,3.620000,,,601648.0
1,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,3833241.0,Los Angeles,Los Angeles,California,USA,Los Angeles,Asian,8.0,,,3.620000,,,601648.0
2,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,3833241.0,Los Angeles,Los Angeles,California,USA,Los Angeles,Hispanic/ Latino,5.0,,,3.620000,,,601648.0
3,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,3833241.0,San Francisco,San Francisco,California,USA,San Francisco,All,58.0,8.0,7.0,3.682931,4.121250,4.088571,33037.0
4,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,3833241.0,San Francisco,San Francisco,California,USA,San Francisco,Asian,50.0,8.0,7.0,3.682931,4.121250,4.088571,33037.0
5,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,3833241.0,San Jose,Santa Clara,California,USA,Santa Clara,All,14.0,,,3.640714,,,76043.0
6,Berkeley,1994.0,ABRAHAM LINCOLN HIGH SCHOOL,3833241.0,San Jose,Santa Clara,California,USA,Santa Clara,Hispanic/ Latino,6.0,,,3.640714,,,76043.0
7,Berkeley,1994.0,ACADEMY OUR LADY OF PEACE,,San Diego,San Diego,California,USA,San Diego,All,5.0,,,3.786000,,,790444.0
8,Berkeley,1994.0,ACALANES HIGH SCHOOL,730283.0,Lafayette,Contra Costa,California,USA,Contra Costa,All,61.0,30.0,13.0,3.557869,3.828333,3.563846,21980.0
9,Berkeley,1994.0,ACALANES HIGH SCHOOL,730283.0,Lafayette,Contra Costa,California,USA,Contra Costa,Asian,16.0,4.0,,3.557869,3.828333,,21980.0


In [19]:
# Save the extended dataset as comma-separated values, without the row index.
packed.to_csv('data/distances_and_cds.csv', sep=',', index=False)