In [1]:
import csv
import numpy as np
import pandas as pd
from scipy.stats import linregress
import matplotlib.pyplot as plt
import requests
import geopandas

# FCC Census Area API query URL (JSON response). The `lat` parameter is left
# empty in this literal; `url` is reassigned with concrete coordinates in a
# later cell.
# NOTE(review): this value looks like a placeholder - confirm nothing uses it as-is.
url = "https://geo.fcc.gov/api/census/area?lat=&lon=132&format=json"


In [2]:
# Locations of the three input tables, relative to the working directory.
crime_path = "Resources/hci_crime_752_pl_co_re_ca_2000-2013_21oct15.csv"
csv_path = "Resources/housing.csv"
county_path = "Resources/county_data.csv"

# Load every table up front. The crime file is decoded as ISO-8859-1, and
# low_memory=False turns off pandas' chunked parsing of the file.
crime_data_df = pd.read_csv(
    crime_path,
    low_memory=False,
    encoding="ISO-8859-1",
)
cali_housing = pd.read_csv(csv_path)
county_data_df = pd.read_csv(county_path)

# Preview the first rows of the crime table.
crime_data_df.head()

Unnamed: 0,ind_id,ind_definition,reportyear,race_eth_code,race_eth_name,geotype,geotypevalue,geoname,county_fips,county_name,...,denominator,rate,ll_95ci,ul_95ci,se,rse,ca_decile,ca_rr,dof_population,version
0,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CA,6.0,California,,,...,33847694.0,,,,,,,,33873086.0,10/21/2015
1,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CA,6.0,California,,,...,33847694.0,,,,,,,,33873086.0,10/21/2015
2,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CA,6.0,California,,,...,33847694.0,,,,,,,,33873086.0,10/21/2015
3,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CA,6.0,California,,,...,33847694.0,,,,,,,,33873086.0,10/21/2015
4,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CA,6.0,California,,,...,33847694.0,6.217499,6.190935,6.244063,0.013553,0.217985,,1.0,33873086.0,10/21/2015


In [3]:
# Pull the columns of interest out as Series for convenient access.
# The first four come from the housing table; county_name comes from the
# crime table.
median_income = cali_housing.loc[:, "median_income"]
median_house_value = cali_housing.loc[:, "median_house_value"]
longitude = cali_housing.loc[:, "longitude"]
latitude = cali_housing.loc[:, "latitude"]
county_name = crime_data_df.loc[:, "county_name"]

# FCC Census Area API query (latitude/longitude -> location info) for a
# single sample coordinate pair.
long = -122.23
lat = 37.88
url = f"https://geo.fcc.gov/api/census/area?lat={lat}&lon={long}&format=json"

# Check the housing table for missing values: info() prints the non-null
# count and dtype of every column. (A bare column lookup that used to sit
# here was removed: its value was discarded, so it displayed nothing.)
cali_housing.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
# Keep only the crime rows that carry a county name ...
dropna_crime_data = crime_data_df.dropna(subset=["county_name"])

# ... and of those, only the rows whose report year is 2000.
is_report_year_2000 = dropna_crime_data["reportyear"].eq(2000)
dropna_crime_data_2000 = dropna_crime_data[is_report_year_2000]
dropna_crime_data_2000

Unnamed: 0,ind_id,ind_definition,reportyear,race_eth_code,race_eth_name,geotype,geotypevalue,geoname,county_fips,county_name,...,denominator,rate,ll_95ci,ul_95ci,se,rse,ca_decile,ca_rr,dof_population,version
5,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6001.0,Alameda,6001.0,Alameda,...,1443741.0,,,,,,,,1443939.0,10/21/2015
6,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6001.0,Alameda,6001.0,Alameda,...,1443741.0,,,,,,,,1443939.0,10/21/2015
7,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6001.0,Alameda,6001.0,Alameda,...,1443741.0,,,,,,,,1443939.0,10/21/2015
8,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6001.0,Alameda,6001.0,Alameda,...,1443741.0,,,,,,,,1443939.0,10/21/2015
9,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6001.0,Alameda,6001.0,Alameda,...,1443741.0,6.582206,6.449864,6.714547,0.067521,1.025816,,1.058658,1443939.0,10/21/2015
10,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6003.0,Alpine,6003.0,Alpine,...,1208.0,,,,,,,,1208.0,10/21/2015
11,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6003.0,Alpine,6003.0,Alpine,...,1208.0,,,,,,,,1208.0,10/21/2015
12,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6003.0,Alpine,6003.0,Alpine,...,1208.0,,,,,,,,1208.0,10/21/2015
13,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6003.0,Alpine,6003.0,Alpine,...,1208.0,,,,,,,,1208.0,10/21/2015
14,752,"Number of Violent Crimes per 1,000 Population",2000.0,9.0,Total,CO,6003.0,Alpine,6003.0,Alpine,...,1208.0,8.278146,3.147298,13.408994,2.617780,31.622777,,1.331427,1208.0,10/21/2015


In [5]:
# Narrow the year-2000 crime table down to the county name plus the
# count and rate columns.
pertinent_columns = ["county_name", "numerator", "denominator", "rate"]
columndrop_crimedata_2000 = dropna_crime_data_2000.loc[:, pertinent_columns]
columndrop_crimedata_2000

Unnamed: 0,county_name,numerator,denominator,rate
5,Alameda,5457.0,1443741.0,
6,Alameda,568.0,1443741.0,
7,Alameda,110.0,1443741.0,
8,Alameda,3368.0,1443741.0,
9,Alameda,9503.0,1443741.0,6.582206
10,Alpine,9.0,1208.0,
11,Alpine,1.0,1208.0,
12,Alpine,,1208.0,
13,Alpine,,1208.0,
14,Alpine,10.0,1208.0,8.278146


In [6]:
# Count the missing values in each column of the county data table.
county_data_df.isna().sum()

Unnamed: 0        0
latitude          0
longitude         0
amt               9
bea               9
block_fips        9
block_pop_2015    9
bta               9
cma               9
county_fips       9
county_name       9
eag               9
ivm               9
latitude.1        0
longitude.1       0
mea               9
mta               9
pea               9
rea               9
rpc               9
state_code        9
state_fips        9
state_name        9
vpc               9
dtype: int64

In [7]:
# Discard every row that has no value in the "rate" column.
county_crime_rate_df = columndrop_crimedata_2000.dropna(subset=["rate"])
county_crime_rate_df


Unnamed: 0,county_name,numerator,denominator,rate
9,Alameda,9503.0,1443741.0,6.582206
14,Alpine,10.0,1208.0,8.278146
19,Amador,179.0,34120.0,5.246190
24,Butte,699.0,203171.0,3.440452
29,Calaveras,118.0,40554.0,2.909701
34,Colusa,57.0,18804.0,3.031270
39,Contra Costa,4532.0,948816.0,4.776479
44,Del Norte,118.0,27507.0,4.289817
49,El Dorado,702.0,156299.0,4.491391
54,Fresno,6041.0,799407.0,7.556852


In [8]:
# Show the rows ordered by county name for reference. The sorted frame is
# only displayed here; county_crime_rate_df itself is left unchanged.
county_crime_rate_df.sort_values(by="county_name")


Unnamed: 0,county_name,numerator,denominator,rate
9,Alameda,9503.0,1443741.0,6.582206
522,Alameda,754.0,102743.0,7.338699
2102,Alameda,130.0,42471.0,3.060912
3083,Alameda,342.0,66869.0,5.114478
2688,Alameda,500.0,79452.0,6.293108
2150,Alameda,5038.0,399484.0,12.611269
1045,Alameda,93.0,29973.0,3.102793
1767,Alameda,132.0,73345.0,1.799714
1257,Alameda,387.0,203413.0,1.902533
310,Alameda,302.0,72259.0,4.179410


In [9]:
# Aggregate per county: total the numerator and denominator columns over
# all rows that share a county_name.
# NOTE(review): the displayed rows suggest a county can have one county-wide
# row plus several smaller rows; if so, summing them all together may
# double-count - confirm this is intended.
group_county_rate_df = county_crime_rate_df.groupby(["county_name"])
count_columns = ["numerator", "denominator"]
sum_county_rate_df = group_county_rate_df[count_columns].agg("sum")
sum_county_rate_df

Unnamed: 0_level_0,numerator,denominator
county_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alameda,18232.0,2751712.0
Alpine,10.0,1208.0
Amador,203.0,47541.0
Butte,1080.0,309712.0
Calaveras,151.0,43558.0
Colusa,94.0,27876.0
Contra Costa,8363.0,1750878.0
Del Norte,133.0,31535.0
El Dorado,1083.0,189518.0
Fresno,10773.0,1411249.0


In [12]:
# Derive the combined per-county rate from the summed counts and save it.
# The ratio is scaled by 1000 so it is expressed per 1,000 population,
# matching the "rate" column of the original crime dataset.
summed_ratio = sum_county_rate_df["numerator"].div(sum_county_rate_df["denominator"])
sum_county_rate_df["sum_rate"] = summed_ratio.mul(1000)

# Write the summary (county_name index included) to the Resources folder.
sum_county_rate_df.to_csv("Resources/sum_county_rate.csv", index=True)