# Load Conf and Credentials

## Load Directory Locations

In [2]:
import json
import os

# Check if the file exists and load the JSON file into a dictionary
file_path = r'C:\Users\mike\Develop\Projects\Code Notebook\Credentials\locations_conf.json'
if os.path.exists(file_path):
    with open(file_path, 'r') as f:
        locations_data = json.load(f)
    print(locations_data)
else:
    print(f"File not found: {file_path}")

{'Common_Funcs_Dir': '/Users/mike/Develop/Projects/Code Notebook/Common/Functions', 'Credentials_Dir': '/Users/mike/Develop/Projects/Code Notebook/Credentials', 'Rel_Pickes_Dir': '../.pickles', 'Pub_Data_Dir': "'/Users/mike/Data/Public", 'BQ_Service_Key': '/Users/mike/Develop/Conf/GCP Service Keys/mikecancell-development-0bcca41f8486.json'}


### Get the Common Funcs Dir into the Sys Path
This appears to be required bc the Funcs are .py files vs .ipynb files

In [3]:
import sys
sys.path.append(locations_data['Common_Funcs_Dir'])
from func_Load_Data_to_Frame import *

# Load The Source Data to a DF

In [4]:
import pandas as pd
import glob

# Get all files matching the pattern
source_pattern = os.path.join(locations_data['Pub_Data_Dir'].strip("'"), 'LAPD_Crime_Data.json_*')
source_files = glob.glob(source_pattern)

# Load and concatenate all matching files into a single DataFrame
df_list = [pd.read_json(file, compression='zip') for file in source_files]
df = pd.concat(df_list, ignore_index=True)

# Display the resulting DataFrame
print(df.head())

       dr_no                date_rptd                 date_occ  time_occ  \
0    1307355  2010-02-20T00:00:00.000  2010-02-20T00:00:00.000      1350   
1   11401303  2010-09-13T00:00:00.000  2010-09-12T00:00:00.000        45   
2   70309629  2010-08-09T00:00:00.000  2010-08-09T00:00:00.000      1515   
3   90631215  2010-01-05T00:00:00.000  2010-01-05T00:00:00.000       150   
4  100100501  2010-01-03T00:00:00.000  2010-01-02T00:00:00.000      2100   

   area  area_name  rpt_dist_no  part_1_2  crm_cd  \
0    13     Newton         1385         2     900   
1    14    Pacific         1485         2     740   
2    13     Newton         1324         2     946   
3     6  Hollywood          646         2     900   
4     1    Central          176         1     122   

                                         crm_cd_desc  ... crm_cd_1  \
0                           VIOLATION OF COURT ORDER  ...    900.0   
1  VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...  ...    740.0   
2             

# Evaluate the Schmema Properties

## Published Schema:
https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8/about_data

## Generated Schema in the df

In [5]:
# Show schema
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3157000 entries, 0 to 3156999
Data columns (total 28 columns):
 #   Column          Dtype  
---  ------          -----  
 0   dr_no           int64  
 1   date_rptd       object 
 2   date_occ        object 
 3   time_occ        int64  
 4   area            int64  
 5   area_name       object 
 6   rpt_dist_no     int64  
 7   part_1_2        int64  
 8   crm_cd          int64  
 9   crm_cd_desc     object 
 10  mocodes         object 
 11  vict_age        int64  
 12  vict_sex        object 
 13  vict_descent    object 
 14  premis_cd       float64
 15  premis_desc     object 
 16  status          object 
 17  status_desc     object 
 18  crm_cd_1        float64
 19  location        object 
 20  lat             float64
 21  lon             float64
 22  cross_street    object 
 23  weapon_used_cd  float64
 24  weapon_desc     object 
 25  crm_cd_2        float64
 26  crm_cd_3        float64
 27  crm_cd_4        float64
dtypes: float64(8

## Now Align the Data Types using the Published Schema @
https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8/about_data

### Convert the DR # (Div of Records Num)

In [6]:
# Convert dr_no to a formatted string and update in place
df['dr_no'] = df['dr_no'].apply(lambda x: f"{str(x)[:2]}-{str(x)[2:4]}-{str(x)[4:]}")

# Display the type and head of the dr_no column
print(df['dr_no'].dtype)
print(df['dr_no'].head())

object
0      13-07-355
1     11-40-1303
2     70-30-9629
3     90-63-1215
4    10-01-00501
Name: dr_no, dtype: object


### Convert the Date Reported to a Date in format YYYY-MM-DD

In [7]:
# Convert date columns to datetime
df['date_rptd'] = pd.to_datetime(df['date_rptd'])
df['date_occ'] = pd.to_datetime(df['date_occ'])

# Display the updated DataFrame with the new formatted date_rptd and date_occ columns
print(df[['date_rptd', 'date_occ']].head())
print(df[['date_rptd', 'date_occ']].dtypes)

   date_rptd   date_occ
0 2010-02-20 2010-02-20
1 2010-09-13 2010-09-12
2 2010-08-09 2010-08-09
3 2010-01-05 2010-01-05
4 2010-01-03 2010-01-02
date_rptd    datetime64[ns]
date_occ     datetime64[ns]
dtype: object


### Combine the Date Occurred and Time Occ into a single Timestamp and remove the orinal Time Occ col

In [8]:
# Combine date_occ and time_occ into a single datetime column
df['datetime_occ'] = pd.to_datetime(df['date_occ'].dt.strftime('%Y-%m-%d') + ' ' + df['time_occ'].astype(str).str.zfill(4).str[:2] + ':' + df['time_occ'].astype(str).str.zfill(4).str[2:] + ':00', format='%Y-%m-%d %H:%M:%S')

# Drop the original time_occ and date_occ columns
df.drop(columns=['time_occ', 'date_occ'], inplace=True)

# Reorder columns to place datetime_occ in the original position of time_occ
cols = df.columns.tolist()
time_occ_index = cols.index('datetime_occ')
cols.insert(3, cols.pop(time_occ_index))
df = df[cols]

# Display the updated DataFrame with the modified datetime_occ column
print(df[['datetime_occ']].head())
# Display the data type of the datetime_occ column
print(df['datetime_occ'].dtypes)

         datetime_occ
0 2010-02-20 13:50:00
1 2010-09-12 00:45:00
2 2010-08-09 15:15:00
3 2010-01-05 01:50:00
4 2010-01-02 21:00:00
datetime64[ns]


### Convert the Category Type Cols to Category types for efficiency

In [9]:
# Convert categorical columns to category dtype
categorical_columns = ['area_name', 'crm_cd_desc', 'mocodes', 'vict_sex', 'vict_descent', 'premis_desc', 'weapon_desc', 'status', 'status_desc', 'location', 'cross_street']
for col in categorical_columns:
    df[col] = df[col].astype('category')


### Convert the Various Codes to Short Ints for space

In [10]:
# Recheck the integer columns as some may have changed and convert to a short int
integer_columns = ['area', 'rpt_dist_no', 'part_1_2', 'crm_cd', 'vict_age', 'premis_cd', 'weapon_used_cd', 'crm_cd_1', 'crm_cd_2', 'crm_cd_3', 'crm_cd_4']
for col in integer_columns:
    if col in df.columns:
        df[col] = df[col].fillna(0).astype('int16')

# Display the columns and their data types
print(df[integer_columns].dtypes)

area              int16
rpt_dist_no       int16
part_1_2          int16
crm_cd            int16
vict_age          int16
premis_cd         int16
weapon_used_cd    int16
crm_cd_1          int16
crm_cd_2          int16
crm_cd_3          int16
crm_cd_4          int16
dtype: object


### Convert the Lat and Lon Cols to Floats

In [11]:
# Convert latitude and longitude to float64
df['lat'] = df['lat'].astype('float64')
df['lon'] = df['lon'].astype('float64')

# Verify changes
print(df[['lat', 'lon']].dtypes)

lat    float64
lon    float64
dtype: object


# Normalize the df into Dims and Facts
See Schema here:
https://lucid.app/lucidspark/a6f7a7bf-63d8-4aa7-ac61-6caa235ac916/edit?invitationId=inv_43e7079e-9d82-4f94-8907-0b93e812a852

## Dimension Areas

In [12]:
# Create dimension tables
dim_area = (
    df[['area', 'area_name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'area': 'fk_area'})
)
# Show datatypes of the dimension table
print(dim_area.dtypes)

# Show some sample data from the dimension table
print(dim_area.head())

fk_area         int16
area_name    category
dtype: object
   fk_area  area_name
0       13     Newton
1       14    Pacific
2        6  Hollywood
3        1    Central
4       11  Northeast


## Dimension Crimes

In [13]:
dim_crime = (
    df[['crm_cd', 'crm_cd_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'crm_cd': 'fk_crm_cd'})
)
# Display a few rows of the dimension table
print(dim_crime.head())

   fk_crm_cd                                        crm_cd_desc
0        900                           VIOLATION OF COURT ORDER
1        740  VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...
2        946                          OTHER MISCELLANEOUS CRIME
3        122                                    RAPE, ATTEMPTED
4        442           SHOPLIFTING - PETTY THEFT ($950 & UNDER)


## Dimension Victim Sex

In [14]:
dim_victim_sex = (
    df[['vict_sex']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Set any missing data to 'X'
dim_victim_sex['vict_sex'] = dim_victim_sex['vict_sex'].fillna('X')

# Add sex_desc column
dim_victim_sex['sex_desc'] = dim_victim_sex['vict_sex'].map({
    'M': 'Male',
    'F': 'Female',
    'X': 'UNKNOWN',
    'H': 'UNKNOWN'
})

# Drop duplicates
dim_victim_sex = dim_victim_sex.drop_duplicates()

# Drop any rows with NaN values
dim_victim_sex = dim_victim_sex.dropna()

# Show datatypes of the dimension table
print(dim_victim_sex.dtypes)
print(dim_victim_sex.head())

vict_sex    category
sex_desc      object
dtype: object
  vict_sex sex_desc
0        M     Male
1        F   Female
2        X  UNKNOWN
4        H  UNKNOWN


## Dimension Victim Descent

In [15]:
dim_victim = (
    df[['vict_descent']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Set any missing data to 'X'
dim_victim['vict_descent'] = dim_victim['vict_descent'].fillna('X')

# Add descent_desc column
dim_victim['descent_desc'] = dim_victim['vict_descent'].map({
    'A': 'Other Asian',
    'B': 'Black',
    'C': 'Chinese',
    'D': 'Cambodian',
    'F': 'Filipino',
    'G': 'Guamanian',
    'H': 'Hispanic/Latin/Mexican',
    'I': 'American Indian/Alaskan Native',
    'J': 'Japanese',
    'K': 'Korean',
    'L': 'Laotian',
    'O': 'Other',
    'P': 'Pacific Islander',
    'S': 'Samoan',
    'U': 'Hawaiian',
    'V': 'Vietnamese',
    'W': 'White',
    'X': 'Unknown',
    'Z': 'Asian Indian'
})

# Drop duplicates
dim_victim = dim_victim.drop_duplicates()

# Drop any rows with NaN values
dim_victim = dim_victim.dropna()

# Show datatypes of the dimension table
print(dim_victim.dtypes)
print(dim_victim.head(20))

vict_descent    category
descent_desc      object
dtype: object
   vict_descent                    descent_desc
0             H          Hispanic/Latin/Mexican
1             W                           White
2             B                           Black
3             A                     Other Asian
4             O                           Other
5             X                         Unknown
6             K                          Korean
8             I  American Indian/Alaskan Native
9             J                        Japanese
10            F                        Filipino
11            C                         Chinese
12            P                Pacific Islander
13            V                      Vietnamese
14            U                        Hawaiian
15            G                       Guamanian
16            D                       Cambodian
17            S                          Samoan
18            Z                    Asian Indian
19            L         

## Dim Premises

In [16]:
dim_premise = (
    df[['premis_cd', 'premis_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'premis_cd': 'fk_premis_cd'})
)
# Show datatypes of the dimension table
print(dim_premise.dtypes)

# Show some sample data from the dimension table
print(dim_premise.head())

fk_premis_cd       int16
premis_desc     category
dtype: object
   fk_premis_cd             premis_desc
0           501  SINGLE FAMILY DWELLING
1           101                  STREET
2           103                   ALLEY
3           404        DEPARTMENT STORE
4           710           OTHER PREMISE


## Dim Weapons

In [17]:
dim_weapon = (
    df[['weapon_used_cd', 'weapon_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'weapon_used_cd': 'fk_weapon_used_cd'})
)
# Show datatypes of the dimension table
print(dim_weapon.dtypes)

# Show some sample data from the dimension table
print(dim_weapon.head())

fk_weapon_used_cd       int16
weapon_desc          category
dtype: object
   fk_weapon_used_cd                                     weapon_desc
0                  0                                             NaN
1                102                                        HAND GUN
2                400  STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)
3                500                     UNKNOWN WEAPON/OTHER WEAPON
4                511                                   VERBAL THREAT


## Dim Status

In [18]:
dim_status = (
    df[['status', 'status_desc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'status': 'fk_status'})
)
# Show datatypes of the dimension table
print(dim_status.dtypes)
print(dim_status.head())

fk_status      category
status_desc    category
dtype: object
  fk_status   status_desc
0        AA  Adult Arrest
1        IC   Invest Cont
2        AO   Adult Other
3        JA    Juv Arrest
4        JO     Juv Other


## Dim Locations

In [19]:
dim_location = (
    df[['lat', 'lon']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Add additional columns based on Schema at https://geocode.maps.co/reverse
# This data will be added downstream in the ETL process
dim_location['geo_place_id'] = None
dim_location['geo_osm_type'] = None
dim_location['geo_osm_id'] = None
dim_location['geo_display_name'] = None
dim_location['geo_road'] = None
dim_location['geo_neighbourhood'] = None
dim_location['geo_suburb'] = None
dim_location['geo_city'] = None
dim_location['geo_state'] = None
dim_location['geo_ISO3166-2-lvl4'] = None
dim_location['geo_postcode'] = None
dim_location['geo_country'] = None
dim_location['geo_country_code'] = None
dim_location['geo_boundingbox'] = None
# Show datatypes of the dimension table
print(dim_location.dtypes)

# Show some sample data from the dimension table
print(dim_location.head())

lat                   float64
lon                   float64
geo_place_id           object
geo_osm_type           object
geo_osm_id             object
geo_display_name       object
geo_road               object
geo_neighbourhood      object
geo_suburb             object
geo_city               object
geo_state              object
geo_ISO3166-2-lvl4     object
geo_postcode           object
geo_country            object
geo_country_code       object
geo_boundingbox        object
dtype: object
       lat       lon geo_place_id geo_osm_type geo_osm_id geo_display_name  \
0  33.9825 -118.2695         None         None       None             None   
1  33.9599 -118.3962         None         None       None             None   
2  34.0224 -118.2524         None         None       None             None   
3  34.1016 -118.3295         None         None       None             None   
4  34.0387 -118.2488         None         None       None             None   

  geo_road geo_neighbourhood geo_subu

## Dim Census Data

In [20]:
dim_census = (
    df[['lat', 'lon']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Add additional columns based on the provided JSON schema
dim_census['block_fips'] = None
dim_census['county_fips'] = None
dim_census['county_name'] = None
dim_census['state_fips'] = None
dim_census['state_code'] = None
dim_census['state_name'] = None
dim_census['block_pop_2020'] = None
dim_census['amt'] = None
dim_census['bea'] = None
dim_census['bta'] = None
dim_census['cma'] = None
dim_census['eag'] = None
dim_census['ivm'] = None
dim_census['mea'] = None
dim_census['mta'] = None
dim_census['pea'] = None
dim_census['rea'] = None
dim_census['rpc'] = None
dim_census['vpc'] = None

# Display the updated DataFrame with the new columns
print(dim_census.head())

       lat       lon block_fips county_fips county_name state_fips state_code  \
0  33.9825 -118.2695       None        None        None       None       None   
1  33.9599 -118.3962       None        None        None       None       None   
2  34.0224 -118.2524       None        None        None       None       None   
3  34.1016 -118.3295       None        None        None       None       None   
4  34.0387 -118.2488       None        None        None       None       None   

  state_name block_pop_2020   amt  ...   bta   cma   eag   ivm   mea   mta  \
0       None           None  None  ...  None  None  None  None  None  None   
1       None           None  None  ...  None  None  None  None  None  None   
2       None           None  None  ...  None  None  None  None  None  None   
3       None           None  None  ...  None  None  None  None  None  None   
4       None           None  None  ...  None  None  None  None  None  None   

    pea   rea   rpc   vpc  
0  None  None  N

## Create the Fact Table

In [21]:
# Create fact table
crime_facts = df[
    [
        # Unique Facts
        'dr_no', 'date_rptd', 'datetime_occ', 'rpt_dist_no', 'vict_age', 
        # Dim Locations
        'lat', 'lon', 
        # Dim Area
        'area', 
        # Dim Premise
        'premis_cd',
        # Dim Crime
        'crm_cd', 
        # Dim Victim Sex
        'vict_sex', 
        # Dim Victim Descent
        'vict_descent',
        # Dim Weapon
        'weapon_used_cd',
        # Dim Status 
        'status'
    ]
].copy()

# Add new columns for the first day of the month
crime_facts['mon_rptd'] = crime_facts['date_rptd'].dt.to_period('M').dt.to_timestamp()
crime_facts['mon_occ'] = crime_facts['datetime_occ'].dt.to_period('M').dt.to_timestamp()

# Handle missing data
crime_facts['vict_sex'] = crime_facts['vict_sex'].cat.add_categories(['Unknown']).fillna('Unknown')
crime_facts['vict_descent'] = crime_facts['vict_descent'].cat.add_categories(['Unknown']).fillna('Unknown')
crime_facts['status'] = crime_facts['status'].cat.add_categories(['Unknown']).fillna('Unknown')

# Display schema and types
print(crime_facts.info())

# Display fact table
display(crime_facts.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3157000 entries, 0 to 3156999
Data columns (total 16 columns):
 #   Column          Dtype         
---  ------          -----         
 0   dr_no           object        
 1   date_rptd       datetime64[ns]
 2   datetime_occ    datetime64[ns]
 3   rpt_dist_no     int16         
 4   vict_age        int16         
 5   lat             float64       
 6   lon             float64       
 7   area            int16         
 8   premis_cd       int16         
 9   crm_cd          int16         
 10  vict_sex        category      
 11  vict_descent    category      
 12  weapon_used_cd  int16         
 13  status          category      
 14  mon_rptd        datetime64[ns]
 15  mon_occ         datetime64[ns]
dtypes: category(3), datetime64[ns](4), float64(2), int16(6), object(1)
memory usage: 213.8+ MB
None


Unnamed: 0,dr_no,date_rptd,datetime_occ,rpt_dist_no,vict_age,lat,lon,area,premis_cd,crm_cd,vict_sex,vict_descent,weapon_used_cd,status,mon_rptd,mon_occ
0,13-07-355,2010-02-20,2010-02-20 13:50:00,1385,48,33.9825,-118.2695,13,501,900,M,H,0,AA,2010-02-01,2010-02-01
1,11-40-1303,2010-09-13,2010-09-12 00:45:00,1485,0,33.9599,-118.3962,14,101,740,M,W,0,IC,2010-09-01,2010-09-01
2,70-30-9629,2010-08-09,2010-08-09 15:15:00,1324,0,34.0224,-118.2524,13,103,946,M,H,0,IC,2010-08-01,2010-08-01
3,90-63-1215,2010-01-05,2010-01-05 01:50:00,646,47,34.1016,-118.3295,6,101,900,F,W,102,IC,2010-01-01,2010-01-01
4,10-01-00501,2010-01-03,2010-01-02 21:00:00,176,47,34.0387,-118.2488,1,103,122,F,H,400,IC,2010-01-01,2010-01-01


# Downstream Post Processes - Retrieve Geo Code into Location Dim

## Load the Location Dim from the Pickle

In [22]:
import pandas as pd
import os
import zipfile

# Set the pickle file path
pickle_file = 'dim_location.pkl'
pickle_path = os.path.join(locations_data['Rel_Pickes_Dir'], pickle_file)

# Check if the pickle file is zipped
if os.path.exists(pickle_path + '.zip'):
	with zipfile.ZipFile(pickle_path + '.zip', 'r') as z:
		with z.open(pickle_file) as f:
			dim_location = pd.read_pickle(f)
else:
	# Load the dimension table from the pickle file
	try:
		dim_location = pd.read_pickle(pickle_path)
	except FileNotFoundError:
		print("Pickle file not found. Please ensure the file exists.")

# Display the loaded DataFrame
print(dim_location.head())

        lat       lon geo_place_id geo_osm_type geo_osm_id  \
0   33.8201 -118.3015    281393593          way   13356241   
6   34.0326 -118.3941    284925955          way  165791832   
7   33.9875 -118.4668    281444680          way  168954633   
11  34.1707 -118.6565    285003597          way  402526611   
12  33.9638 -118.2629    281472844          way  165899885   

                                     geo_display_name               geo_road  \
0   Halldale Avenue, Los Angeles, California, 9050...        Halldale Avenue   
6   South Canfield Avenue, Castle Heights, Los Ang...  South Canfield Avenue   
7   Venice Way, Venice Canal Historic District, Ve...             Venice Way   
11  Burbank Boulevard, Los Angeles, Los Angeles Co...      Burbank Boulevard   
12  Stanford Avenue, Florence, Los Angeles, Los An...        Stanford Avenue   

                 geo_neighbourhood geo_suburb     geo_city   geo_state  \
0                             None       None  Los Angeles  California  

## Load the Census Dim from the Pickle 

In [23]:
import pandas as pd
import os
import zipfile

# Set the pickle file path
pickle_file = 'dim_census.pkl'
pickle_path = os.path.join(locations_data['Rel_Pickes_Dir'], pickle_file)

# Check if the pickle file is zipped
if os.path.exists(pickle_path + '.zip'):
	with zipfile.ZipFile(pickle_path + '.zip', 'r') as z:
		with z.open(pickle_file) as f:
			dim_census = pd.read_pickle(f)
else:
	# Load the dimension table from the pickle file
	try:
		dim_census = pd.read_pickle(pickle_path)
	except FileNotFoundError:
		print("Pickle file not found. Please ensure the file exists.")

# Display the loaded DataFrame
print(dim_census.head())

       lat       lon       block_fips county_fips         county_name  \
0  34.0210 -118.3002  060372095102002       06037  Los Angeles County   
1  34.1576 -118.4387  060372933041015       06037  Los Angeles County   
2  34.0820 -118.2130  060371835201010       06037  Los Angeles County   
3  34.0642 -118.2771  060372267011001       06037  Los Angeles County   
4  34.0536 -118.2788  060372079021009       06037  Los Angeles County   

  state_fips state_code  state_name block_pop_2020     amt  ...     bta  \
0         06         CA  California            587  AMT006  ...  BTA262   
1         06         CA  California            257  AMT006  ...  BTA262   
2         06         CA  California            302  AMT006  ...  BTA262   
3         06         CA  California            144  AMT006  ...  BTA262   
4         06         CA  California             28  AMT006  ...  BTA262   

      cma     eag     ivm     mea     mta     pea     rea     rpc     vpc  
0  CMA002  EAG706  IVM002  MEA044 

## Find the Locations that need to be Processed

In [24]:
tmp_locs_to_be_processed = dim_location[dim_location['geo_place_id'].isnull()][['lat', 'lon']].drop_duplicates()
print(f"Number of records in the df: {len(tmp_locs_to_be_processed)}")

Number of records in the df: 0


## Find the Census Tracts that need to be Processed

In [25]:
tmp_census_block_fips_to_be_processed = dim_census[dim_census['block_fips'].isnull()][['lat', 'lon']].drop_duplicates()
print(f"Number of records in the df: {len(tmp_census_block_fips_to_be_processed)}")

Number of records in the df: 0


## Get the API Credentials for GeoCode Rev Lookup

In [26]:
import json

# Set the path for the JSON file
credentials_dir = locations_data['Credentials_Dir']
json_file_path = os.path.join(credentials_dir, 'geocode_api_key.json')

# Read the credentials from the JSON file
with open(json_file_path, 'r') as f:
    api_key_data = json.load(f)

geo_api_credentials = {
    'API_Key': api_key_data['API_Key'],
    'Reverse_URL': api_key_data['Reverse_URL'],
    'Max_Lookups': api_key_data['Max_Lookups'],
    'Rate_Limit_per_Sec': api_key_data['Rate_Limit_per_Sec']
}

# Mask the API key for security
masked_key = geo_api_credentials['API_Key'][:5] + 'xxxxx'
geo_api_credentials['API_Key'] = masked_key

print(geo_api_credentials)


{'API_Key': '679e7xxxxx', 'Reverse_URL': 'https://geocode.maps.co/reverse', 'Max_Lookups': 50000, 'Rate_Limit_per_Sec': 1}


## Use the GeoCode API to Fetch Location Details using the Reverse Lookup from the Lat/Lon
See https://geocode.maps.co/

### Write a Func for the Fetch
This will be moved to Common Funcs Later

In [27]:
import requests
from tqdm import tqdm
import time

def Fetch_Geo_Data(api_credentials, locations):
    base_url = api_credentials['Reverse_URL']
    api_key = api_credentials['API_Key']
    max_lookups = api_credentials['Max_Lookups']
    rate_limit_per_sec = api_credentials['Rate_Limit_per_Sec']
    
    geo_data = []
    
    for index, row in tqdm(locations.iterrows(), total=locations.shape[0], desc="Fetching Geo Data"):
        if len(geo_data) >= max_lookups:
            print("Reached the maximum number of lookups for the day.")
            break
        
        params = {
            'lat': row['lat'],
            'lon': row['lon'],
            'api_key': api_key
        }
        
        attempts = 0
        while attempts < 3:
            try:
                response = requests.get(base_url, params=params)
                url = f"{base_url}?lat={row['lat']}&lon={row['lon']}&api_key={api_key}"
                response = requests.get(url)
                if response.status_code == 200:
                    if 'place_id' not in response.json():
                        print(f"Response content for lat: {row['lat']}, lon: {row['lon']} - {response.json()}")
                    geo_data.append(response.json())
                    break
                else:
                    attempts += 1
                    time.sleep(rate_limit_per_sec)
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                attempts += 1
                time.sleep(rate_limit_per_sec)
        
        if attempts == 3:
            print(f"Failed to fetch data for lat: {row['lat']}, lon: {row['lon']} after 3 attempts.")
            geo_data.append(None)
    
    return geo_data

### Now call the Func

In [28]:

# Example usage
geo_data = Fetch_Geo_Data(geo_api_credentials, tmp_locs_to_be_processed)
print(geo_data[:5])
# Update the dim_location DataFrame with the fetched geo data
for i, data in enumerate(geo_data):
    if data:
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_place_id'] = data.get('place_id')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_osm_type'] = data.get('osm_type')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_osm_id'] = data.get('osm_id')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_display_name'] = data.get('display_name')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_road'] = data.get('address', {}).get('road')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_neighbourhood'] = data.get('address', {}).get('neighbourhood')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_suburb'] = data.get('address', {}).get('suburb')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_city'] = data.get('address', {}).get('city')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_state'] = data.get('address', {}).get('state')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_ISO3166-2-lvl4'] = data.get('address', {}).get('ISO3166-2-lvl4')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_postcode'] = data.get('address', {}).get('postcode')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_country'] = data.get('address', {}).get('country')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_country_code'] = data.get('address', {}).get('country_code')
        dim_location.at[tmp_locs_to_be_processed.index[i], 'geo_boundingbox'] = data.get('boundingbox')

# Display updated dim_location DataFrame
print(dim_location.head())

Fetching Geo Data: 0it [00:00, ?it/s]

[]
        lat       lon geo_place_id geo_osm_type geo_osm_id  \
0   33.8201 -118.3015    281393593          way   13356241   
6   34.0326 -118.3941    284925955          way  165791832   
7   33.9875 -118.4668    281444680          way  168954633   
11  34.1707 -118.6565    285003597          way  402526611   
12  33.9638 -118.2629    281472844          way  165899885   

                                     geo_display_name               geo_road  \
0   Halldale Avenue, Los Angeles, California, 9050...        Halldale Avenue   
6   South Canfield Avenue, Castle Heights, Los Ang...  South Canfield Avenue   
7   Venice Way, Venice Canal Historic District, Ve...             Venice Way   
11  Burbank Boulevard, Los Angeles, Los Angeles Co...      Burbank Boulevard   
12  Stanford Avenue, Florence, Los Angeles, Los An...        Stanford Avenue   

                 geo_neighbourhood geo_suburb     geo_city   geo_state  \
0                             None       None  Los Angeles  Californi




## Use the FCC API to Fetch Census Details from the Lat/Lon
See https://geo.fcc.gov/api/census/#!/area/get_area

### Write a Func for the Fetch
This will be moved to Common Funcs Later

In [29]:
import requests
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_census_data_for_location(row):
    base_url = "https://geo.fcc.gov/api/census/area"
    census_year = 2020
    format_type = "json"
    
    params = {
        'lat': row['lat'],
        'lon': row['lon'],
        'censusYear': census_year,
        'format': format_type
    }
    
    attempts = 0
    while attempts < 2:
        try:
            response = requests.get(base_url, params=params)
            if response.status_code == 200:
                response_data = response.json()
                with open('../logs/census_fetch_debug.log', 'a') as log_file:
                    log_file.write(f"Response data for lat: {row['lat']}, lon: {row['lon']} - {response_data}\n")
                return response_data
            else:
                attempts += 1
                time.sleep(1)
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            attempts += 1
            time.sleep(1/2)
    
    with open('../logs/census_fetch_errors.log', 'a') as log_file:
        log_file.write(f"Failed to fetch data for lat: {row['lat']}, lon: {row['lon']} after 2 attempts.\n")
    return None

def Fetch_Census_Data(locations, max_responses, max_workers=10):
    census_data = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_census_data_for_location, row): index for index, row in locations.iterrows()}
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching Census Data"):
            if len(census_data) >= max_responses:
                print("Max number of responses reached. Stopping data fetch.")
                break
            result = future.result()
            if result is not None:
                census_data.append(result)
                if len(census_data) >= max_responses:
                    print("Max number of responses reached. Stopping data fetch.")
                    break

    # Cancel remaining futures if max_responses is reached
    for future in futures:
        if not future.done():
            future.cancel()
    
    return census_data

### Now call the Func Fetch_Census_Data

In [30]:
census_data = Fetch_Census_Data(tmp_census_block_fips_to_be_processed, max_responses=100000, max_workers=20)

# Update the dim_census DataFrame with the fetched census data
for i, data in enumerate(census_data):
    if data and 'results' in data and data['results']:
        result = data['results'][0]
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'block_fips'] = result.get('block_fips')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'county_fips'] = result.get('county_fips')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'county_name'] = result.get('county_name')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'state_fips'] = result.get('state_fips')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'state_code'] = result.get('state_code')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'state_name'] = result.get('state_name')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'block_pop_2020'] = result.get('block_pop_2020')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'amt'] = result.get('amt')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'bea'] = result.get('bea')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'bta'] = result.get('bta')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'cma'] = result.get('cma')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'eag'] = result.get('eag')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'ivm'] = result.get('ivm')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'mea'] = result.get('mea')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'mta'] = result.get('mta')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'pea'] = result.get('pea')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'rea'] = result.get('rea')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'rpc'] = result.get('rpc')
        dim_census.at[tmp_census_block_fips_to_be_processed.index[i], 'vpc'] = result.get('vpc')

# Display updated dim_census DataFrame
print(dim_census.head())


Fetching Census Data: 0it [00:00, ?it/s]

       lat       lon       block_fips county_fips         county_name  \
0  34.0210 -118.3002  060372095102002       06037  Los Angeles County   
1  34.1576 -118.4387  060372933041015       06037  Los Angeles County   
2  34.0820 -118.2130  060371835201010       06037  Los Angeles County   
3  34.0642 -118.2771  060372267011001       06037  Los Angeles County   
4  34.0536 -118.2788  060372079021009       06037  Los Angeles County   

  state_fips state_code  state_name block_pop_2020     amt  ...     bta  \
0         06         CA  California            587  AMT006  ...  BTA262   
1         06         CA  California            257  AMT006  ...  BTA262   
2         06         CA  California            302  AMT006  ...  BTA262   
3         06         CA  California            144  AMT006  ...  BTA262   
4         06         CA  California             28  AMT006  ...  BTA262   

      cma     eag     ivm     mea     mta     pea     rea     rpc     vpc  
0  CMA002  EAG706  IVM002  MEA044 




# Pickle & Zip Dims, Facts for Downstream Processing

In [31]:
import os

# Create the directory if it doesn't exist
pickles_dir = locations_data['Rel_Pickes_Dir']
os.makedirs(pickles_dir, exist_ok=True)

# Define a list of the dimension and fact tables along with their filenames
pickle_files = [
    (dim_area, 'dim_area.pkl'),
    (dim_crime, 'dim_crime.pkl'),
    (dim_victim, 'dim_victim.pkl'),
    (dim_premise, 'dim_premise.pkl'),
    (dim_weapon, 'dim_weapon.pkl'),
    (dim_status, 'dim_status.pkl'),
    (dim_location, 'dim_location.pkl'),
    (dim_census, 'dim_census.pkl'),
    (crime_facts, 'crime_facts.pkl')
]

# Use a while loop to pickle the tables
i = 0
while i < len(pickle_files):
    df, filename = pickle_files[i]
    pickle_path = os.path.join(pickles_dir, filename)
    df.to_pickle(pickle_path)
    
    # Zip the pickle file
    with zipfile.ZipFile(pickle_path + '.zip', 'w', zipfile.ZIP_DEFLATED) as z:
        z.write(pickle_path, filename)
    
    # Remove the uncompressed pickle file
    os.remove(pickle_path)
    
    i += 1