In [1]:
import numpy as np
import pandas as pd
import geopandas

random_state = 42

In [2]:
path = ''

In [3]:
# Load the LEAIDs which were train/test split in the coi_data_cleaning notebook
leaids_train_2010 = pd.read_csv(path+'data_cleaned/leaids_train_2010.csv', index_col=0).squeeze('columns')
leaids_train_2015 = pd.read_csv(path+'data_cleaned/leaids_train_2015.csv', index_col=0).squeeze('columns')
leaids_test_2010 = pd.read_csv(path+'data_cleaned/leaids_test_2010.csv', index_col=0).squeeze('columns')
leaids_test_2015 = pd.read_csv(path+'data_cleaned/leaids_test_2015.csv', index_col=0).squeeze('columns')

In [4]:
leaids_train_2010.head()

0    3904737
1    4823310
2    4807890
3     804860
4    4838900
Name: 0, dtype: int64

In [5]:
# Load SEDA long file
seda = pd.read_csv(path+'data_raw/seda_geodist_long_cs_4.1.csv')

In [6]:
seda.columns

Index(['fips', 'stateabb', 'sedalea', 'sedaleaname', 'subject', 'grade',
       'year', 'cs_mn_all', 'cs_mnse_all', 'totgyb_all', 'cs_mn_asn',
       'cs_mnse_asn', 'totgyb_asn', 'cs_mn_blk', 'cs_mnse_blk', 'totgyb_blk',
       'cs_mn_ecd', 'cs_mnse_ecd', 'totgyb_ecd', 'cs_mn_fem', 'cs_mnse_fem',
       'totgyb_fem', 'cs_mn_hsp', 'cs_mnse_hsp', 'totgyb_hsp', 'cs_mn_mal',
       'cs_mnse_mal', 'totgyb_mal', 'cs_mn_mfg', 'cs_mnse_mfg', 'totgyb_mfg',
       'cs_mn_mtr', 'cs_mnse_mtr', 'totgyb_mtr', 'cs_mn_nam', 'cs_mnse_nam',
       'totgyb_nam', 'cs_mn_nec', 'cs_mnse_nec', 'totgyb_nec', 'cs_mn_neg',
       'cs_mnse_neg', 'totgyb_neg', 'cs_mn_wag', 'cs_mnse_wag', 'totgyb_wag',
       'cs_mn_wbg', 'cs_mnse_wbg', 'totgyb_wbg', 'cs_mn_whg', 'cs_mnse_whg',
       'totgyb_whg', 'cs_mn_wht', 'cs_mnse_wht', 'totgyb_wht', 'cs_mn_wmg',
       'cs_mnse_wmg', 'totgyb_wmg', 'cs_mn_wng', 'cs_mnse_wng', 'totgyb_wng'],
      dtype='object')

In [7]:
# Split SEDA data by filtering with the split LEAIDs
seda_train_2010 = seda[seda['sedalea'].isin(leaids_train_2010)]

# Remove the 3rd grade, because those are included in the COI data
seda_train_2010 = seda_train_2010[seda_train_2010['grade'] > 3]

# Only years 2011-2015
seda_train_2010 = seda_train_2010[(seda_train_2010['year'] >= 2011) & (seda_train_2010['year'] <= 2015)]

seda_train_2010.shape

(418786, 61)

In [8]:
# Split SEDA data by filtering with the split LEAIDs
seda_train_2015 = seda[seda['sedalea'].isin(leaids_train_2015)]

# Remove the 3rd grade, because those are included in the COI data
seda_train_2015 = seda_train_2015[seda_train_2015['grade'] > 3]

# Only years 2016-2018 (last year of set)
seda_train_2015 = seda_train_2015[seda_train_2015['year'] >= 2016]

seda_train_2015.shape

(239837, 61)

In [9]:
# Split SEDA data by filtering with the split LEAIDs
seda_test_2010 = seda[seda['sedalea'].isin(leaids_test_2010)]

# Remove the 3rd grade, because those are included in the COI data
seda_test_2010 = seda_test_2010[seda_test_2010['grade'] > 3]

# Only years 2011-2015
seda_test_2010 = seda_test_2010[(seda_test_2010['year'] >= 2011) & (seda_test_2010['year'] <= 2015)]

seda_test_2010.shape

(104359, 61)

In [10]:
# Split SEDA data by filtering with the split LEAIDs
seda_test_2015 = seda[seda['sedalea'].isin(leaids_test_2010)]

# Remove the 3rd grade, because those are included in the COI data
seda_test_2015 = seda_test_2015[seda_test_2015['grade'] > 3]

# Only years 2016-2018 (last year of set)
seda_test_2015 = seda_test_2015[seda_test_2015['year'] >= 2016]

seda_test_2015.shape

(59992, 61)

In [11]:
# Write SEDA train/test to csv
seda_train_2010.to_csv(path+'data_cleaned/seda_train_2010.csv')
seda_train_2015.to_csv(path+'data_cleaned/seda_train_2015.csv')
seda_test_2010.to_csv(path+'data_cleaned/seda_test_2010.csv')
seda_test_2015.to_csv(path+'data_cleaned/seda_test_2015.csv')

In [12]:
#load the school district data file from the shp file
school_data = geopandas.read_file(path+'data_raw/SCHOOLDISTRICT_SY1314_TL15.zip')

#rename a few columns for reporting needs
school_data = school_data.rename(columns={'GEOID': 'sedalea', 'INTPTLAT':'latitude', 'INTPTLON': 'longitude'})

#subset the dataframe columns
school_data_map = school_data[['sedalea', 'latitude', 'longitude']]

#convert sedalea column from object to int data type to clean up leading zeros in some rows
school_data_map.loc[:,'sedalea'] = school_data_map.loc[:,'sedalea'].astype(str).astype('int64')

#there are 13,590 school districts and their associated latitude and longitude values
#for uploading to github
school_data_map.to_pickle(path+'data_cleaned/school_data_for_map.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value
