# Title - Milestone I

## 1) Import dependencies

In [1]:
import sys
!{sys.executable} -m pip install -r requirements.txt



In [1]:
import os

import pandas as pd
from pyspark.sql import SparkSession
import numpy as np

import geopandas as gpd
from shapely.geometry import Point, Polygon

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [12]:
from src.zip_streaming import zip_streaming
from src.parcel_partition_write import parcel_partition_write
from src.parcel_partition_read import parcel_partition_read

from src.district_location import get_district_location
from src.school_location import get_school_location

from src.district_score import get_district_score
from src.school_score import get_school_score

## 2) Run data pipeline

### 2a) Run CoreLogic data pipeline

In [3]:
# --- Run configuration --------------------------------------------------------
# Cached output of this stage. When it exists and no fresh run is requested,
# the pipeline below is skipped and the cached frame is loaded instead.
required_files = ['resources/data/df_parcel.pkl']
fresh_run = False  # set to True to force the full pipeline even if the cache exists
# Column the dataset is partitioned on, and the single partition value read back.
partition_column = "PROPERTY INDICATOR CODE"
partition_value = "10"

cache_is_complete = all(os.path.exists(path) for path in required_files)
must_rebuild = fresh_run or not cache_is_complete

if must_rebuild:
    # Note: this message is also shown when the cache exists but fresh_run is True.
    print("🚀 Required files not found. Running initial run...")

    # Step 1: unzip and slice the original coreLogic dataset.
    zip_streaming(
        zip_file_path='resources/data/coreLogic.zip',
        file_name_in_zip='coreLogic.txt',
        output_path='resources/data/filtered_core_logic.txt',
        chunk_size=10000,
        state='MI',
    )

    # Step 2: partition the coreLogic slice using Spark.
    parcel_partition_write(
        input_file='resources/data/filtered_core_logic.txt',
        partition_column=partition_column,
    )

    # Step 3: load the selected partition into a Pandas DataFrame, format it as a
    # GeoDataFrame, and save it as the pickle that the cache check looks for.
    # NOTE(review): the folder prefix is spelled "parition_" — presumably this
    # matches what parcel_partition_write creates; confirm before renaming.
    df_parcel = parcel_partition_read(
        partition_folder=f"resources/data/parition_{partition_column}",
        partition_value=partition_value,
        pkl_path='resources/data/df_parcel.pkl',
        pkl_save=True,
    )

    print("✅ Initial run complete. Required files should now be saved.")

else:
    print("✅ Required files found. Running secondary run...")
    # Unpickling executes arbitrary code: only load a pickle this project wrote itself.
    df_parcel = pd.read_pickle("resources/data/df_parcel.pkl")


✅ Required files found. Running secondary run...


### 2b) Run education score data pipeline

In [13]:
# Cached outputs of this stage, one CSV per score table. Each path is defined
# once and reused for the cache check, the load, and the save call, so the
# three can never point at different files.
school_score_csv = 'resources/data/df_school_score.csv'
district_score_csv = 'resources/data/df_district_score.csv'
required_files = [school_score_csv, district_score_csv]
fresh_run = False  # set to True to recompute the scores even if the CSVs exist

# Raw statewide input files, keyed by the role they play in the score functions.
input_files = {'atrisk':'resources/data/At Risk Student file for Statewide, school year 2020-21.csv',
               'enrollment':'resources/data/College Enrollment by High School & 24 College Credits file for Statewide, school year 2020-21.csv',
               'assessment':'resources/data/High School Assessments file for Statewide, school year 2020-21.csv', 
               'effectiveness':'resources/data/Educator Effectiveness file for Statewide, school year 2020-21.csv'}

if all(os.path.exists(file) for file in required_files) and not fresh_run:
    # Both cached CSVs exist: load them instead of recomputing.
    print("✅ Required files found. Running secondary run...")
    df_school_score = pd.read_csv(school_score_csv)
    df_district_score = pd.read_csv(district_score_csv)

elif all(os.path.exists(file) for file in input_files.values()):
    # Note: this message is also shown when the cache exists but fresh_run is True.
    print("🚀 Required files not found. Running initial run...")

    # The school scores must be saved to the *school* CSV. Previously this call
    # was given the district path, so df_school_score.csv was never created,
    # the cache check above never passed, and the file was overwritten by the
    # district scores written just below.
    df_school_score = get_school_score(enrollment = input_files['enrollment'], 
                                       assessment  = input_files['assessment'],
                                       effectiveness  = input_files['effectiveness'],
                                       csv_path = school_score_csv,
                                       csv_save = True)
    df_district_score = get_district_score(atrisk = input_files['atrisk'],
                                           enrollment = input_files['enrollment'],
                                           assessment = input_files['assessment'],
                                           csv_path = district_score_csv,
                                           csv_save = True)

    print("✅ Initial run complete. Required files should now be saved.")
else:
    # Neither the cache nor the raw inputs are available. Only a message is
    # printed, so df_school_score / df_district_score are not defined by this cell.
    print("❌ Input files not found. Donwload input files or update input_files dictionary.")



🚀 Required files not found. Running initial run...
✅ Initial run complete. Required files should now be saved.


### 2c) Run location data pipeline

In [None]:
# Build the district and school location tables with the project helpers imported
# from src.district_location and src.school_location (their internals are not
# visible in this notebook).
# NOTE(review): the saved output of this cell is a long run of
# "N Schools in the list" lines — presumably progress output printed by one of
# these helpers; TODO confirm which one and consider silencing or summarising it.
df_district_location = get_district_location()
df_school_location = get_school_location()

0 Schools in the list
0 Schools in the list
0 Schools in the list
47 Schools in the list
191 Schools in the list
152 Schools in the list
597 Schools in the list
263 Schools in the list
0 Schools in the list
0 Schools in the list
0 Schools in the list
8 Schools in the list
401 Schools in the list
177 Schools in the list
380 Schools in the list
258 Schools in the list
0 Schools in the list
0 Schools in the list
0 Schools in the list
25 Schools in the list
51 Schools in the list
88 Schools in the list
103 Schools in the list
21 Schools in the list
0 Schools in the list
0 Schools in the list
0 Schools in the list
6 Schools in the list
67 Schools in the list
36 Schools in the list
24 Schools in the list
0 Schools in the list
0 Schools in the list
0 Schools in the list
25 Schools in the list
10 Schools in the list
5 Schools in the list
40 Schools in the list
6 Schools in the list
0 Schools in the list
4 Schools in the list
4 Schools in the list
24 Schools in the list
6 Schools in the list
7 

In [14]:
# Display the school location table.
# NOTE(review): in the saved output the geometry is built as
# POINT (latitude longitude), e.g. latitude 41.945548 / longitude -86.342830
# is shown as POINT (41.94555 -86.34283). Shapely points are (x, y) and the
# usual GIS convention is x = longitude, y = latitude, so the coordinates look
# swapped — confirm in get_school_location before any spatial join.
df_school_location

Unnamed: 0,school_code,school_name,latitude,longitude,geometry
0,00298,Mars Elementary School,41.945548,-86.342830,POINT (41.94555 -86.34283)
1,00296,Berrien Springs Middle School,41.945252,-86.349935,POINT (41.94525 -86.34994)
2,03217,Riverside School,42.173867,-86.388401,POINT (42.17387 -86.3884)
3,02183,W-A-Y Eau Claire,41.985570,-86.307310,POINT (41.98557 -86.30731)
4,00401,Bridgman High School,41.938988,-86.544837,POINT (41.93899 -86.54484)
...,...,...,...,...,...
3061,04357,Washington Middle School,47.243475,-88.446993,POINT (47.24348 -88.44699)
3062,05135,Grant Township School,47.467769,-87.890108,POINT (47.46777 -87.89011)
3063,05362,Powell Twp. Elementary School,46.817643,-87.729177,POINT (46.81764 -87.72918)
3064,09566,CHS-Horizons School,47.305195,-88.360071,POINT (47.3052 -88.36007)


## 3) Data wrangling & exploratory analysis

### 3a) Missing values & Imputation

### 3b) Variable distributions

In [15]:
# Display the district score table.
# In the saved output, log_correct_education_score is the natural log of
# amalgam_education_score (e.g. ln(2.456812e-07) ≈ -15.2192).
# NOTE(review): district_code is shown as a float (3010.0), while school_code in
# the school location table keeps leading zeros ("00298") — confirm the key
# dtype before joining these tables.
df_district_score

Unnamed: 0,district_code,college_not_ready,percent_met_sci,percent_met_soc,total_enrolled,amalgam_education_score,log_correct_education_score
0,3010.0,79.2,19.5,48.7,54.1177,2.456812e-07,-15.219231
1,3020.0,61.8,20.4,53.3,50.0000,2.976351e-07,-15.027398
2,3030.0,77.1,18.6,44.8,47.2973,3.290927e-07,-14.926926
3,3040.0,70.8,18.3,51.9,39.2157,3.792175e-07,-14.785156
4,3050.0,82.9,5.0,22.7,48.1482,2.207342e-06,-13.023722
...,...,...,...,...,...,...,...
625,67055.0,77.9,9.3,28.0,37.8049,1.303988e-06,-13.550084
626,83010.0,70.5,14.1,50.5,46.6667,4.268677e-07,-14.666792
627,83060.0,75.4,10.0,28.8,38.1818,1.206090e-06,-13.628127
628,83070.0,77.9,10.0,23.3,31.3726,1.756128e-06,-13.252399
