In [19]:
import pandas as pd
import numpy as np
import glob
import os

## Impute zip codes from congressional districts

In [5]:
import json
from scipy import spatial
import math
from tqdm import tqdm

In [6]:
def load_precincts_geojson():
    return json.load(open('./data/registration/precincts-with-results.geojson'))

def load_zip_coordinates():
    df = pd.read_csv('./data/registration/uszips.csv')

    return df[['zip', 'lat', 'lng']]

def get_geo_id_coordinates_from_geojson(geojson):
    recs = []
    for item in geojson['features']:
        all_coordinates = flatten_coordinates(item['geometry']['coordinates'])
        coordinates = np.array(flatten_coordinates(item['geometry']['coordinates']))
        rec = {
            'geo_id': item['properties']['GEOID'],
            'lat': np.mean(coordinates[:, 1]),
            'lng': np.mean(coordinates[:, 0])
        }
        recs.append(rec)

    return pd.DataFrame(recs)

def flatten_coordinates(arr):
    if type(arr[0]) != list:
        return [arr]

    result = []
    for sub_arr in arr:
        for coord in flatten_coordinates(sub_arr):
            result.append(coord)
            
    return result

def get_closest_location_indexes(left_df, right_df):
    result = []
    left_sphere_coordinates = [
        location_sphere_coordinates(row['lat'], row['lng'])
        for _index, row in left_df.iterrows()
    ]
    right_sphere_coordinates = [
        location_sphere_coordinates(row['lat'], row['lng'])
        for _index, row in right_df.iterrows()
    ]
    # kd-tree much more faster than pairwise comparison
    tree = spatial.KDTree(right_sphere_coordinates)
    for left_coordinate in left_sphere_coordinates:
        _dist, index = tree.query(left_coordinate)
        result.append(index)

    return result

def location_sphere_coordinates(lat, lng):
    lat = math.radians(lat)
    lng = math.radians(lng)
    x = np.cos(lng) * np.sin(lat)
    y = np.sin(lng) * np.sin(lat)
    z = np.cos(lat)

    return (x, y, z)

precincts_geojson = load_precincts_geojson()
precincts_coordinates = get_geo_id_coordinates_from_geojson(precincts_geojson)

zip_coordinates = load_zip_coordinates()

closest_zip_indexes = get_closest_location_indexes(precincts_coordinates, zip_coordinates)
geo_id_to_zip = pd.DataFrame({
    'geo_id': precincts_coordinates['geo_id'],
    'zip': np.array(zip_coordinates['zip'])[closest_zip_indexes]
})

In [7]:
geo_id_to_zip['geoid_code'] = geo_id_to_zip['geo_id'].apply(lambda x: x[:5])
geo_id_to_zip

Unnamed: 0,geo_id,zip,geoid_code
0,05047-1-A (Oz Wd 1),72949,05047
1,05149-11 - Dutch Creek,72833,05149
2,05081-Franklin Township,71859,05081
3,05027-McNeil City,71752,05027
4,05027-Taylor Township,71861,05027
...,...,...,...
146591,56007-01-01,82334,56007
146592,56045-05-01,82723,56045
146593,56045-05-02,82723,56045
146594,56045-05-03,82723,56045


In [6]:
geo_id_to_zip.to_csv('./data/registration/geoid_to_zip.csv')

In [8]:
ca_df = pd.read_csv("./data/registration/ca_ev_registrations_public.csv", dtype={'County GEOID': str})
ca_df = ca_df.drop(columns=['Vehicle ID', 'DMV ID', 'DMV Snapshot', 'Registration Expiration Date', 'Geography'])
ca_df['Drivetrain Type'] = ca_df['Vehicle Name'].apply(lambda x: 'PHEV' if (('hybrid' in x.lower()) or ('plug' in x.lower())) else 'BEV')
ca_df['Vehicle Make'] = ca_df['Vehicle Name'].apply(lambda x: x.split()[0])
ca_df['Vehicle Model'] = ca_df['Vehicle Name'].apply(lambda x: ' '.join(x.split()[1:]))
ca_df = ca_df.rename(columns={'Registration Valid Date': 'Registration Date', 'State Abbreviation': 'State'})
ca_df = ca_df.drop(columns=['Vehicle Name'])

In [9]:
ca_df

Unnamed: 0,County GEOID,Registration Date,State,Drivetrain Type,Vehicle Make,Vehicle Model
0,06099,2011-01-01,CA,BEV,Chevrolet,Volt
1,06105,2011-01-01,CA,BEV,Nissan,Leaf
2,06103,2011-01-01,CA,BEV,Chevrolet,Volt
3,06099,2011-01-01,CA,BEV,Tesla,Roadster
4,06099,2011-01-01,CA,BEV,Tesla,Roadster
...,...,...,...,...,...,...
2542438,06075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in
2542439,06075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in
2542440,06075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in
2542441,06075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in


In [None]:
geo_id_to_zip[geo_id_to_zip['geoid_code'] == ca_df.iloc[0,0]]['zip']

In [18]:
for i in range(0,10,5):
    print(i)

0
5


In [17]:
def find_zips_from_geoid(df, chunk_size, start=0):
    with tqdm(total=int((len(df)-start)/chunk_size)) as pbar:
        for i in range(start, len(df), chunk_size):
            if len(df) - i < chunk_size:
                chunk = ca_df.iloc[i:len(df)+i,:].copy()
            else:
                chunk = ca_df.iloc[i:i+chunk_size,:].copy()
            chunk['ZIP Code'] = chunk['County GEOID'].apply(lambda x: set(geo_id_to_zip[geo_id_to_zip['geoid_code'] == x]['zip']))
            chunk.to_csv(f'./data/registration/ca_chunks/CA_reg_chunk{int(i/chunk_size):05d}.csv')
            pbar.update(len(df)-start)

In [29]:
chunk = ca_df.iloc[0:10000,:].copy()
chunk['ZIP Code'] = chunk['County GEOID'].apply(lambda x: set(geo_id_to_zip[geo_id_to_zip['geoid_code'] == x]['zip']))
chunk.to_csv(f'./data/registration/ca_chunks/oneCA_reg_chunk00000.csv')

NameError: name 'i' is not defined

In [30]:
chunk.to_csv(f'./data/registration/ca_chunks/oneCA_reg_chunk00000.csv')

In [18]:
find_zips_from_geoid(ca_df, 10000)

2443it [00:11, 217.59it/s]


In [32]:
pd.read_csv('./data/registration/ca_chunks/CA_reg_chunk00000.csv', index_col=0)

Unnamed: 0,County GEOID,Registration Date,State,Drivetrain Type,Vehicle Make,Vehicle Model,ZIP Code
0,6099,2011-01-01,CA,BEV,Chevrolet,Volt,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
1,6105,2011-01-01,CA,BEV,Nissan,Leaf,"{95552, 95527, 96041, 96010, 95563, 96076, 955..."
2,6103,2011-01-01,CA,BEV,Chevrolet,Volt,"{96035, 96007, 96029, 96075, 96076, 96074, 960..."
3,6099,2011-01-01,CA,BEV,Tesla,Roadster,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
4,6099,2011-01-01,CA,BEV,Tesla,Roadster,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
...,...,...,...,...,...,...,...
9995,6037,2012-01-01,CA,BEV,Chevrolet,Volt,"{90631, 90638, 90639, 90640, 90065, 90650, 906..."
9996,6037,2012-01-01,CA,BEV,Chevrolet,Volt,"{90631, 90638, 90639, 90640, 90065, 90650, 906..."
9997,6037,2012-01-01,CA,BEV,Chevrolet,Volt,"{90631, 90638, 90639, 90640, 90065, 90650, 906..."
9998,6037,2012-01-01,CA,BEV,Chevrolet,Volt,"{90631, 90638, 90639, 90640, 90065, 90650, 906..."


In [40]:
path = './data/registration/ca_chunks/'
files = glob.glob(os.path.join(path, "*.csv"))
ca_list = []

for file in files:
    ca_list.append(pd.read_csv(file, index_col=0))

In [42]:
frame = pd.concat(ca_list, axis=0)

Unnamed: 0,County GEOID,Registration Date,State,Drivetrain Type,Vehicle Make,Vehicle Model,ZIP Code
1560000,Unknown,2019-01-01,CA,BEV,Tesla,Model S,set()
1560001,Unknown,2019-01-01,CA,BEV,Tesla,Model S,set()
1560002,Unknown,2019-01-01,CA,BEV,Tesla,Model S,set()
1560003,Unknown,2019-01-01,CA,BEV,Tesla,Model S,set()
1560004,Unknown,2019-01-01,CA,BEV,Tesla,Model S,set()
...,...,...,...,...,...,...,...
1739995,6055,2019-01-01,CA,BEV,Chevrolet,Bolt EV,"{94562, 94599, 95431, 94567, 95688, 94503, 945..."
1739996,6055,2019-01-01,CA,BEV,Chevrolet,Bolt EV,"{94562, 94599, 95431, 94567, 95688, 94503, 945..."
1739997,6055,2019-01-01,CA,BEV,Chevrolet,Bolt EV,"{94562, 94599, 95431, 94567, 95688, 94503, 945..."
1739998,6055,2019-01-01,CA,BEV,Chevrolet,Bolt EV,"{94562, 94599, 95431, 94567, 95688, 94503, 945..."


In [44]:
frame = frame[frame['County GEOID'] != 'Unknown']

In [51]:
frame = frame.sort_index()

In [52]:
frame

Unnamed: 0,County GEOID,Registration Date,State,Drivetrain Type,Vehicle Make,Vehicle Model,ZIP Code
0,6099,2011-01-01,CA,BEV,Chevrolet,Volt,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
1,6105,2011-01-01,CA,BEV,Nissan,Leaf,"{95552, 95527, 96041, 96010, 95563, 96076, 955..."
2,6103,2011-01-01,CA,BEV,Chevrolet,Volt,"{96035, 96007, 96029, 96075, 96076, 96074, 960..."
3,6099,2011-01-01,CA,BEV,Tesla,Roadster,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
4,6099,2011-01-01,CA,BEV,Tesla,Roadster,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
...,...,...,...,...,...,...,...
2542438,6075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."
2542439,6075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."
2542440,6075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."
2542441,6075,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."


In [53]:
frame = frame.drop(columns=['County GEOID'])

In [55]:
frame.to_csv('./data/registration/ca_chunks/frame.csv')

In [56]:
frame = pd.read_csv('./data/registration/ca_chunks/frame.csv', index_col=0)

In [57]:
frame

Unnamed: 0,Registration Date,State,Drivetrain Type,Vehicle Make,Vehicle Model,ZIP Code
0,2011-01-01,CA,BEV,Chevrolet,Volt,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
1,2011-01-01,CA,BEV,Nissan,Leaf,"{95552, 95527, 96041, 96010, 95563, 96076, 955..."
2,2011-01-01,CA,BEV,Chevrolet,Volt,"{96035, 96007, 96029, 96075, 96076, 96074, 960..."
3,2011-01-01,CA,BEV,Tesla,Roadster,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
4,2011-01-01,CA,BEV,Tesla,Roadster,"{95360, 95361, 95363, 95367, 95368, 95380, 953..."
...,...,...,...,...,...,...
2542438,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."
2542439,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."
2542440,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."
2542441,2020-01-01,CA,PHEV,Toyota,Prius Plug-in,"{94607, 94102, 94103, 94104, 94105, 94107, 941..."


In [58]:
frame.loc[0,'ZIP Code']

'{95360, 95361, 95363, 95367, 95368, 95380, 95382, 95386, 95387, 95307, 95313, 95316, 95319, 95323, 95326, 95328, 95350, 95351, 95354, 95355, 95356, 95357, 95358}'

In [66]:
import re
def parse_set_string(s):
    if s == 'set()':
        return None  # or return set() if you prefer
    else:
        string_nums_only = re.sub('[^0-9,]', '', s)  
        split_nums = string_nums_only.split(',')
        return set(split_nums)
        # return set(map(str, split_nums))

In [68]:
type(parse_set_string(frame.loc[0,'ZIP Code']))

set