In [None]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats

# Load every column of the 2015 Austin crime/housing CSV. The cells below
# re-read the same file with only the columns they need.
crime_housing = pd.read_csv(filepath_or_buffer='crime-housing-austin-2015.csv')

In [None]:
# Load only the offense description, the crime zip code and the two poverty
# columns. The zip column is renamed to 'Zip Code' so it shares a key name
# with AustinZipCodes.csv when the two are merged.
poverty_columns = [
    'Highest_NIBRS_UCR_Offense_Description',
    'Zip_Code_Crime',
    'Changeinpercentageofpopulationbelowpoverty2000-2012',
    'Populationbelowpovertylevel',
]

df_poverty = (
    pd.read_csv('crime-housing-austin-2015.csv', usecols=poverty_columns)
    .rename(columns={'Zip_Code_Crime': 'Zip Code'})
)







In [None]:
# Build one row per zip code holding population, density, crime-report count,
# crimes per capita and the two poverty measures.

# Read in zip code file for per capita info
df_zip_codes = pd.read_csv('AustinZipCodes.csv', usecols=['Zip Code', 'Population', 'People / Sq. Mile'])

# Attach the zip-code attributes to every crime report (one row per report).
merged_df = pd.merge(df_poverty, df_zip_codes, on='Zip Code', how='left')

# Count crime reports per zip code. value_counts() ignores reports that have
# no zip code, so those never contribute to any total.
crime_reports_count = merged_df['Zip Code'].value_counts().reset_index()
crime_reports_count.columns = ['Zip Code', 'Total_Crimes']

# Add the counts to the zip table, then drop every row with a missing value.
# Zip codes without any crime report are therefore removed rather than being
# given a count of 0. The explicit .copy() makes the result an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
df_zip_codes = (
    pd.merge(df_zip_codes, crime_reports_count, on='Zip Code', how='left')
    .dropna()
    .copy()
)

# The CSV stores these numbers as strings with thousands separators.
for col in ('People / Sq. Mile', 'Population'):
    df_zip_codes[col] = df_zip_codes[col].str.replace(',', '').astype('float64')

# add crimes per capita column.
df_zip_codes['Crimes_Per_Capita'] = df_zip_codes['Total_Crimes'] / df_zip_codes['Population']

# Reduce the per-report poverty columns to distinct (zip, value) pairs and
# join them on.
# NOTE(review): assumes each zip code carries a single value per poverty
# column; a zip with several distinct values would be duplicated by the merge
# -- TODO confirm against the data.
percent_poverty = df_poverty[['Zip Code', 'Populationbelowpovertylevel']].drop_duplicates().reset_index(drop=True)
percent_change_poverty = df_poverty[['Zip Code', 'Changeinpercentageofpopulationbelowpoverty2000-2012']].drop_duplicates().reset_index(drop=True)
df_zip_codes = pd.merge(df_zip_codes, percent_poverty, on='Zip Code', how='left')
df_zip_codes = pd.merge(df_zip_codes, percent_change_poverty, on='Zip Code', how='left')


df_zip_codes.head(50)


In [None]:
# Keep only zip codes that have every field, including both poverty measures.
# dropna() can hand back an object pandas treats as a slice of df_zip_codes,
# so assigning columns to it raises SettingWithCopyWarning. Taking an explicit
# copy lets plain column assignment be used; the converted columns then have a
# real float64 dtype, which regplot needs.
dropped = df_zip_codes.dropna().copy()

# Converting percentages (strings such as '12%') to floats.
for col in ('Populationbelowpovertylevel', 'Changeinpercentageofpopulationbelowpoverty2000-2012'):
    dropped[col] = dropped[col].str.replace('%', '').astype('float64')

# Drop zip code 78701 as an outlier. It is selected by its value, not by a
# positional index label, so the filter keeps working if the row order of the
# input data changes.
OUTLIER_ZIP_CODE = 78701
dropped = dropped[dropped['Zip Code'] != OUTLIER_ZIP_CODE].reset_index(drop=True)

# Correlation tests
print('Correlation between Crimes Per Capita and Population below poverty level')
print(stats.pearsonr(dropped['Crimes_Per_Capita'], dropped['Populationbelowpovertylevel']))
print('Correlation between Crimes Per Capita and Change in percentage below poverty level')
print(stats.pearsonr(dropped['Crimes_Per_Capita'], dropped['Changeinpercentageofpopulationbelowpoverty2000-2012']))
dropped.head(50)

In [None]:
# Scatter plot with a fitted regression line: share of the population below
# the poverty level (x) against crimes per capita (y), one point per zip code.
sb.regplot(data=dropped, x='Populationbelowpovertylevel', y='Crimes_Per_Capita')


In [None]:
# Regression plot of the 2000-2012 change in the share of population below
# poverty (x) against crimes per capita (y). The title now names the quantity
# actually plotted on the y axis (per-capita rate, not the raw crime total).
ax = sb.regplot(data=dropped, x='Changeinpercentageofpopulationbelowpoverty2000-2012', y='Crimes_Per_Capita')
ax.set_xlabel('Change_%Pop_Below_Poverty')
ax.set_title("Crimes Per Capita vs Change in poverty")
# Anchor the y axis at zero; the upper limit stays automatic.
ax.set_ylim(0, None)

# Nate's analyses

In [None]:
# Load the full crime/housing CSV and rename its zip column to 'Zip Code',
# the key name used by the zip-code table.
crimeData = (
    pd.read_csv("crime-housing-austin-2015.csv")
    .rename(columns={'Zip_Code_Crime': 'Zip Code'})
)

# Show which columns are available and which offense categories occur.
display(crimeData.columns)
display(crimeData['Highest_NIBRS_UCR_Offense_Description'].unique())

In [None]:
# Load the zip-code table and turn its formatted string columns into numbers:
# '#' and ',' are stripped from the rank, ',' from density and population.
zip_raw = pd.read_csv("AustinZipCodes.csv")

zipData = zip_raw.assign(**{
    'National Rank': zip_raw['National Rank'].str.replace('#', '').str.replace(',', '').astype('int'),
    'People / Sq. Mile': zip_raw['People / Sq. Mile'].str.replace(',', '').astype('float'),
    'Population': zip_raw['Population'].str.replace(',', '').astype('int'),
})

# Keep only the columns used by the analyses below.
zipData = zipData[['Zip Code', 'Location', 'Population', 'People / Sq. Mile', 'National Rank']]
display(zipData)

In [None]:
# Show the zip codes ordered from least to most densely populated
# (display only; zipData itself is left unchanged).
zipData.sort_values('People / Sq. Mile', ascending=True)


In [None]:
# Left-join the zip-code attributes onto every crime report, so reports whose
# zip code is absent from zipData are kept with missing zip attributes.
combinedData = crimeData.merge(zipData, on="Zip Code", how='left')
display(combinedData)

## Total counts of reported crimes

In [None]:
# Number of reports (non-null 'Key' values) per offense type, most frequent first.
crimeCounts = (
    combinedData
    .groupby('Highest_NIBRS_UCR_Offense_Description')['Key']
    .count()
    .reset_index()
    .sort_values(by='Key', ascending=False)
)
display(crimeCounts)
sb.barplot(data=crimeCounts, x='Highest_NIBRS_UCR_Offense_Description', y='Key')


## Average population density per crime

In [None]:
# Find average pop density per crime type.
# Reports without a density value are removed from combinedData itself, so
# every later cell that reads combinedData works on this filtered frame.
combinedData = combinedData.dropna(subset=['People / Sq. Mile'])

byCrime = (
    combinedData
    .groupby('Highest_NIBRS_UCR_Offense_Description')['People / Sq. Mile']
    .mean()
    .reset_index()
    .sort_values(by='People / Sq. Mile', ascending=False)
)
display(byCrime)

# The y axis is clipped to 3800-4200, so the bars do not start at zero.
density_ax = sb.barplot(data=byCrime, x='Highest_NIBRS_UCR_Offense_Description', y='People / Sq. Mile')
density_ax.set_ylim(3800, 4200)



## Comparing average population densities of crimes

In [None]:
# Offense label and population density of every report, used to build masks.
offense = combinedData['Highest_NIBRS_UCR_Offense_Description']
density = combinedData['People / Sq. Mile']

# Density distributions of robberies and burglaries, each normalised on its own.
sb.displot(data=combinedData[offense.isin(['Robbery','Burglary'])], x='People / Sq. Mile',  hue='Highest_NIBRS_UCR_Offense_Description', kind='kde', common_norm=False)

# Independent two-sample t-test on density: murders vs. burglaries.
# Other pairings that were tried here and are currently disabled:
# Theft/Burglary, Rape/Burglary, Burglary/Robbery, Theft/Robbery,
# Auto Theft/Theft, Auto Theft/Burglary, Auto Theft/Robbery.
display(stats.ttest_ind(density[offense == 'Murder'], density[offense == 'Burglary']))

# Crime types: ['Robbery', 'Burglary', 'Auto Theft', 'Agg Assault', 'Theft', 'Rape', 'Murder']

print()


### Burglaries

In [None]:

# Independent two-sample t-tests on population density: burglaries against
# robberies, auto thefts and thefts, shown in that order.
# (Agg Assault, Rape and Murder are not compared in this cell.)
offense = combinedData['Highest_NIBRS_UCR_Offense_Description']
density = combinedData['People / Sq. Mile']

burglary_density = density[offense == 'Burglary']
for other_crime in ('Robbery', 'Auto Theft', 'Theft'):
    display(stats.ttest_ind(burglary_density, density[offense == other_crime]))

# between two zip codes, compare crime types out of total crimes


### Thefts

In [None]:

# Independent two-sample t-tests on population density: thefts against
# robberies and burglaries, shown in that order.
# (Auto Theft, Agg Assault, Rape and Murder are not compared in this cell.)
offense = combinedData['Highest_NIBRS_UCR_Offense_Description']
density = combinedData['People / Sq. Mile']

theft_density = density[offense == 'Theft']
for other_crime in ('Robbery', 'Burglary'):
    display(stats.ttest_ind(theft_density, density[offense == other_crime]))

# between two zip codes, compare crime types out of total crimes


In [None]:
# Per zip code: number of crime reports ('Key' count) and the first recorded
# value of the artist-affordable rental column. Zip codes without that value
# are dropped. subset is passed as a list, which every pandas version accepts,
# and .copy() makes the frame independent so the column assignment below does
# not raise SettingWithCopyWarning.
crimesByArtist = (
    combinedData
    .groupby('Zip Code')
    .agg({'Key': 'count', 'Rentalunitsaffordabletoaverageartist': 'first'})
    .reset_index()
    .dropna(subset=['Rentalunitsaffordabletoaverageartist'])
    .copy()
)

# The column holds strings with a trailing '%'; strip it and store integers.
crimesByArtist['Rentalunitsaffordabletoaverageartist'] = (
    crimesByArtist['Rentalunitsaffordabletoaverageartist'].str.replace('%', '').astype('int')
)
display(crimesByArtist)

# Only the first five rows (in groupby order) are plotted.
first_five = crimesByArtist.head(5)

# Create a bar chart with two y-axes
fig, ax1 = plt.subplots()

# Left axis: crime-report counts.
sb.barplot(x='Zip Code', y='Key', data=first_five, ax=ax1, color='blue', alpha=0.7)
ax1.set_ylabel('Crime reports')

# Right axis: artist-affordable rental units.
# NOTE(review): both bar sets share the same x positions, so the orange bars
# are drawn over the blue ones; the alpha value is what keeps both visible.
ax2 = ax1.twinx()
sb.barplot(x='Zip Code', y='Rentalunitsaffordabletoaverageartist', data=first_five, ax=ax2, color='orange', alpha=0.7)
ax2.set_ylabel('Rental units affordable to average artist (%)')

ax1.set_title('Crime reports and artist-affordable rentals by zip code')

# Show the plot
plt.show()


In [None]:
# Per zip code: number of crime reports ('Key' count) plus the population and
# density carried on its rows. Zip codes without a population are dropped.
# subset is passed as a list, which every pandas version accepts, and .copy()
# makes the frame independent so adding a column below does not raise
# SettingWithCopyWarning.
crimesPerCapita = (
    combinedData
    .groupby('Zip Code')
    .agg({'Key': 'count', 'Population': 'first', 'People / Sq. Mile': 'first'})
    .reset_index()
    .dropna(subset=['Population'])
    .copy()
)

# Reports per resident.
crimesPerCapita['Crimes_Per_Capita'] = crimesPerCapita['Key'] / crimesPerCapita['Population']
display(crimesPerCapita)

# Population density against crimes per capita, with a fitted regression line.
sb.regplot(data=crimesPerCapita, x="People / Sq. Mile", y="Crimes_Per_Capita", scatter_kws={'s': 10})

# Pearson correlation for the same pair; shown as the cell's output.
stats.pearsonr(crimesPerCapita["People / Sq. Mile"], crimesPerCapita["Crimes_Per_Capita"])