## Import All Necessary Modules And Setup Project
If you get any errors when importing these, ensure you run the command:

  $ python -m pip install -r requirements.txt

to install all necessary modules for this project. This command must be run from inside of this project directory.

It is recommended to use a virtual environment for this project to ensure there are no conflicting package versions on your system.

Activate the virtual environment (if needed), run the pip install command, and then launch Jupyter Lab inside this project to get this project running.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
from scipy.stats import t
from numpy.polynomial.polynomial import polyfit

In [None]:
# Lift the column cap so rendered DataFrames show every column
# (the option accepts an int or None; None means "all columns").
pd.set_option('display.max_columns', None)

# Read the two CSV inputs used throughout the notebook.
masterData = pd.read_csv("data/crime-housing-austin-2015.csv")
ZipcodeData = pd.read_csv("data/AustinZipCodes.csv")

# Preview the first rows; as the cell's final expression this is what gets rendered.
masterData.head()

In [None]:
# List all column names of the dataset.
# Printed explicitly: a notebook cell only echoes its final expression, and this
# cell ends with assignments, so bare expressions here would show nothing.
print(list(masterData))
# List the unique values found in the "Council_District" column.
print(masterData.Council_District.unique())
# Drop every row that contains a NaN in any column, in both tables.
masterData = masterData.dropna()
ZipcodeData = ZipcodeData.dropna()

In [None]:
# Show the column types (printed, because a bare expression in the middle of a
# cell is never rendered).
print(masterData.dtypes)
# Cast "Council_District" and "Zip_Code_Crime" to int (the original note says
# they are loaded as float).
masterData = masterData.astype({"Council_District": 'int', "Zip_Code_Crime": 'int'})

# Currency columns contain a "$" sign and percentage columns a "%" sign.
# regex=False makes the match literal: "$" is a regex anchor, and how pandas
# treats a single-character pattern under its default regex setting has changed
# between versions, so the literal intent is stated explicitly.
for _col in ('Medianhouseholdincome', 'Medianrent', 'Medianhomevalue'):
    masterData[_col] = masterData[_col].str.replace('$', '', regex=False).astype('float')

for _col in ('Unemployment', 'Populationbelowpovertylevel',
             'Non-WhiteNon-HispanicorLatino', 'HispanicorLatinoofanyrace'):
    masterData[_col] = masterData[_col].str.replace('%', '', regex=False).astype('int')

In [None]:
# Columns kept for the analysis, mapped to the short names used from here on.
# 'Key' and 'District' map to themselves, i.e. they keep their original name.
_short_names = {
    'Key': 'Key',
    'Zip_Code_Crime': 'Zip_Code',
    'District': 'District',
    'Clearance_Status': 'CS',
    'Highest_NIBRS_UCR_Offense_Description': 'crime_type',
    'Medianhouseholdincome': 'MhhI',
    'Unemployment': 'UE(%)',
    'Populationbelowpovertylevel': 'Pov_lvl',
    'Medianrent': 'rent',
    'Medianhomevalue': 'home_price',
    'Non-WhiteNon-HispanicorLatino': 'non_wh_non_lat',
    'HispanicorLatinoofanyrace': 'hispanic',
}
sub_masterData = masterData[list(_short_names)].rename(columns=_short_names)

# Combined percentage: non-white/non-Hispanic share plus Hispanic share.
sub_masterData['non_native_pop'] = sub_masterData['non_wh_non_lat'] + sub_masterData['hispanic']
display(sub_masterData.head())

# Give the zip-code table the same key column name so the two tables can be merged.
ZipcodeData = ZipcodeData.rename(columns={'Zip Code': 'Zip_Code'})
display(ZipcodeData.head())

In [None]:
# Join the zip-code table onto every record through the shared Zip_Code column.
unified_dataset = sub_masterData.merge(ZipcodeData, on='Zip_Code')
display(unified_dataset)

# Population is stored as text with comma separators: strip the commas, then cast to float.
population_text = unified_dataset['Population'].str.replace(',', '')
unified_dataset['Population'] = population_text.astype(float)

In [None]:
# Collapse the merged records to one row per zip code: count the records
# (via crime_type) and average every numeric descriptor.
_mean_columns = ['MhhI', 'UE(%)', 'Pov_lvl', 'rent', 'home_price',
                 'non_wh_non_lat', 'hispanic', 'Population', 'non_native_pop']
_per_zip_spec = {'crime_type': 'count', **{name: 'mean' for name in _mean_columns}}

grp_sub_masterData = (
    unified_dataset.groupby('Zip_Code')
    .agg(_per_zip_spec)
    .rename(columns={'crime_type': 'crime_count'})
    .reset_index()
)

# Crimes per resident, and crimes relative to the non_native_pop share of the
# population (non_native_pop is a percentage, hence the division by 100).
_crimes = grp_sub_masterData['crime_count']
_residents = grp_sub_masterData['Population']
grp_sub_masterData['crimepercapita'] = _crimes / _residents
grp_sub_masterData['cr_rate_nonative'] = _crimes / (_residents * grp_sub_masterData['non_native_pop'] / 100)
display(grp_sub_masterData.head())

In [None]:
# Number of records for each crime type, as a two-column frame (crime_type, count).
crime_types = (
    sub_masterData.groupby('crime_type')['crime_type']
    .count()
    .reset_index(name='count')
)

# Pie chart of the counts; the legend carries the crime-type names in the same order.
plt.figure(figsize=(8, 8))
plt.pie(crime_types['count'], shadow=True)
plt.title('Texas Crime Results in 2015')
plt.legend(labels=crime_types['crime_type'])
plt.savefig('pi_plot_categorical_crime.jpeg')

## main instructions
Your analysis must include a **number of statistical methods**. You must include `Pearson correlations` (be sure to report the **`p-value`**), _scatterplots_, _averages_, _standard deviations_, and a **t-test** (or **Mann-Whitney-U test**). The specific number of analyses is left up to you, but the contribution must be significant and your project report must give detailed justification and results for each analysis.

## Look at summary statistics: mean, median, standard deviation, and mode

In [None]:
# Print mean / median / standard deviation / mode for rent and for home price,
# then render pandas' own summary table as the cell output.
rent = sub_masterData['rent']
homeprice = sub_masterData['home_price']

_summary_template = 'mean: {}\nmedian: {}\nstddev: {}\nmode: {}'
for _series in (rent, homeprice):
    print(_summary_template.format(_series.mean(), _series.median(), _series.std(), _series.mode()))

sub_masterData.describe()

In [None]:
# Scatter of crime per capita against zip code, one point per zip code.
plt.figure(figsize=(8, 8))
plt.scatter(grp_sub_masterData['Zip_Code'], grp_sub_masterData['crimepercapita'])

In [None]:
# Keep only zip codes whose crime per capita is below 0.10. Per the author's
# note, zip code 78701 has a population of only 3855 and is treated as an outlier.
_below_cutoff = grp_sub_masterData['crimepercapita'] < 0.10
grp_sub_masterData_scatter = grp_sub_masterData.copy()[_below_cutoff]

plt.figure(figsize=(8, 8))
plt.scatter(grp_sub_masterData_scatter['Zip_Code'], grp_sub_masterData_scatter['crimepercapita'])

In [None]:
# Two stacked scatter plots for comparison: every zip code (top) versus the
# set with the high crime-per-capita outlier removed (bottom).
fig, ax = plt.subplots(2, figsize=(6, 10))

ax[0].scatter(grp_sub_masterData.Zip_Code, grp_sub_masterData.crimepercapita)
ax[0].set_xlabel('zip codes')
ax[0].set_ylabel('all types of crime per capita')

# The title describes the filtered data, so attach it to the bottom axes
# explicitly instead of relying on plt.title() targeting the "current" axes.
ax[1].set_title('scatter plot for different zip codes after removing outlier')
ax[1].scatter(grp_sub_masterData_scatter.Zip_Code, grp_sub_masterData_scatter.crimepercapita)
ax[1].set_xlabel('zip codes')
ax[1].set_ylabel('all types of crime per capita')

plt.savefig('scatter plot for crime per capita in different zip codes.jpeg')

### split dataset for each crime type

In [None]:
def _records_for(offense):
    """Return the rows of unified_dataset whose crime_type equals *offense*."""
    return unified_dataset[unified_dataset['crime_type'] == offense]


# One frame per crime type, used for the per-crime breakdown that follows.
Data_robbery = _records_for('Robbery')
Data_Burglary = _records_for('Burglary')
Data_Auto_Theft = _records_for('Auto Theft')
Data_Agg_Assault = _records_for('Agg Assault')
Data_Theft = _records_for('Theft')
Data_Murder = _records_for('Murder')

In [None]:
def _count_by_zip(records, count_col):
    """Count the rows of *records* per Zip_Code, naming the count column *count_col*."""
    per_zip = records.groupby('Zip_Code').agg({'crime_type': 'count'})
    per_zip = per_zip.reset_index()
    return per_zip.rename(columns={'crime_type': count_col})


def _with_rate(table, per_zip, count_col, rate_col):
    """Merge *per_zip* into *table* on Zip_Code and add count / Population as *rate_col*."""
    combined = pd.merge(table, per_zip, on='Zip_Code')
    combined[rate_col] = combined[count_col] / combined['Population']
    return combined


# The robbery table is the base: besides the count it carries the per-zip
# means of the demographic columns and of Population.
Data_robbery_grp_zip = (
    Data_robbery.groupby('Zip_Code')
    .agg({'crime_type': 'count', 'Pov_lvl': 'mean',
          'non_wh_non_lat': 'mean', 'hispanic': 'mean',
          'Population': 'mean', 'non_native_pop': 'mean'})
    .reset_index()
    .rename(columns={'crime_type': 'cr_robbery'})
)
Data_robbery_grp_zip['robpercap'] = Data_robbery_grp_zip['cr_robbery'] / Data_robbery_grp_zip['Population']

# Per-zip counts for each of the remaining crime types.
Data_Burglary_grp_zip = _count_by_zip(Data_Burglary, 'cr_Burglary')
Data_Auto_Theft_grp_zip = _count_by_zip(Data_Auto_Theft, 'cr_Auto_Theft')
Data_Agg_Assault_grp_zip = _count_by_zip(Data_Agg_Assault, 'cr_Agg_Assault')
Data_Theft_grp_zip = _count_by_zip(Data_Theft, 'cr_Theft')
Data_Murder_grp_zip = _count_by_zip(Data_Murder, 'cr_Murder')

# Fold each count table into the combined table, adding its per-capita rate.
# pd.merge defaults to an inner join, so only zip codes present in every
# per-crime table remain in data_cr_type.
data_cr_type = Data_robbery_grp_zip
for _per_zip, _count_col, _rate_col in (
    (Data_Burglary_grp_zip, 'cr_Burglary', 'burgpercap'),
    (Data_Auto_Theft_grp_zip, 'cr_Auto_Theft', 'autheftpercap'),
    (Data_Agg_Assault_grp_zip, 'cr_Agg_Assault', 'agasspercap'),
    (Data_Theft_grp_zip, 'cr_Theft', 'theftpercap'),
    (Data_Murder_grp_zip, 'cr_Murder', 'murderpercap'),
):
    data_cr_type = _with_rate(data_cr_type, _per_zip, _count_col, _rate_col)
display(data_cr_type)

In [None]:
# Select the zip codes above 78730 to compare specific crime rates.
sel_zip = data_cr_type[data_cr_type['Zip_Code'] > 78730]
# Cast the zip code to str so the bar chart treats it as a category, not a number.
sel_zip = sel_zip.astype({"Zip_Code": 'str'})
display(sel_zip)

# Bar plot of theft per capita for the selected zip codes.
plt.figure(figsize=(6, 6))
plt.bar(x=sel_zip.Zip_Code, height=sel_zip.theftpercap, width=0.5, label='theft')
plt.title('Bar plot for theft rate per capita in 5 places ')
plt.xlabel('Zip Codes')
plt.ylabel('theft crime per capita')
plt.legend()
# Save BEFORE showing: once plt.show() has displayed the figure, a later
# savefig() works on a new, empty figure and writes a blank image.
plt.savefig('theft per capita in 5 Zips.jpeg')
plt.show()

In [None]:
# Grouped bar plot: per-capita rate of every crime type except theft, for the
# selected zip codes. Each crime type gets its own slot inside a zip code's
# group; drawing all series at the same x position would let later bars hide
# earlier ones.
_crime_series = [
    ('robpercap', 'robbery'),
    ('burgpercap', 'burglary'),
    ('autheftpercap', 'auto theft'),
    ('agasspercap', 'agg assault'),
    ('murderpercap', 'murder'),
]
_group_positions = np.arange(len(sel_zip))
_bar_width = 0.8 / len(_crime_series)

plt.figure(figsize=(6, 6))
for _slot, (_column, _label) in enumerate(_crime_series):
    # Centre the group of bars on the zip code's tick position.
    _offset = (_slot - (len(_crime_series) - 1) / 2) * _bar_width
    plt.bar(x=_group_positions + _offset, height=sel_zip[_column],
            width=_bar_width, label=_label)
plt.xticks(_group_positions, sel_zip.Zip_Code)

# Theft is the series left out here (robbery IS plotted), so label accordingly.
plt.title('all crimes except Theft per capita in 5 places ')
plt.xlabel('Zip Codes')
plt.ylabel('crime per capita')
plt.legend()

# Save BEFORE showing; saving after plt.show() writes a blank image.
# The output file name is kept unchanged so existing references to it still work.
plt.savefig('all crimes except Robbery per capita in 5 Zips.jpeg')
plt.show()

## Show a scatterplot and regression line & Pearson R
Correlation between the percentage of the population below the poverty line and the crime committed per capita.

In [None]:
# Mean crime per capita for each poverty-level value, restricted to poverty
# levels below 30 and excluding the poverty level of exactly 20.
_by_poverty = grp_sub_masterData.groupby('Pov_lvl').agg({'crimepercapita': 'mean'}).reset_index()
_keep = (_by_poverty['Pov_lvl'] < 30) & (_by_poverty['Pov_lvl'] != 20)
crime_POV_lvl = _by_poverty[_keep]
crime_POV_lvl.head()

In [None]:
# Scatter plot with fitted regression line: poverty level vs. crime per capita.
plt.figure(figsize=(8, 8))
# x and y are passed by keyword: positional x/y is deprecated in seaborn and
# rejected by newer releases.
sns.regplot(x='Pov_lvl', y='crimepercapita', data=crime_POV_lvl)

plt.title('correlation for population % below poverty line(<30%) to crimepercapita by zip codes')
# Author's note: one single outlier (crime rate per capita of 0.5 at poverty
# level 20) was removed from the data plotted here.
plt.savefig('correlation for population % below poverty line to crimepercapita.jpeg')

# Pearson correlation coefficient together with its p-value.
display(stats.pearsonr(crime_POV_lvl.Pov_lvl, crime_POV_lvl.crimepercapita))

In [None]:
# Scatter plot with fitted regression line: rent vs. home price.
plt.figure(figsize=(8, 8))
# x and y are passed by keyword: positional x/y is deprecated in seaborn and
# rejected by newer releases.
sns.regplot(x='rent', y='home_price', data=sub_masterData, marker='o',
            scatter_kws={'s': 15}, color='blue', line_kws={'color': 'red'})
plt.title('correlation for rent paid by people to price of house')
plt.xlabel('rent paid($)')
plt.ylabel('median price of house in texas ($)')
plt.savefig('correlation for rent paid by people to price of house.jpeg')

# Pearson correlation coefficient together with its p-value.
display(stats.pearsonr(sub_masterData.rent, sub_masterData.home_price))

## distribution plots

In [None]:
# Density (KDE) curve of rent for each clearance status, drawn on one figure.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False); the old
# bins=None argument only applied to the histogram that was switched off.
_clearance_labels = [
    ('N', 'not cleared'),
    ('C', 'cleared by arrest'),
    ('O', 'cleared by exception'),
]

plt.figure()
for _code, _label in _clearance_labels:
    sns.kdeplot(x=sub_masterData[sub_masterData.CS == _code].rent, label=_label)
plt.xlabel('median house rent($)')
plt.legend()

## T distribution
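The t-test computes a t-statistic, which measures the difference between two sample means relative to the variability within the samples. If the two underlying populations actually had the same mean, what are the chances that re-sampling would give a value of t at least this extreme? That probability is the p-value, which is why it is important to report it.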

In [None]:
# Independent two-sample t-test between the rent and home_price columns
# (labelled by the author as rent vs. house price in Texas in 2015).
# The result is rendered as the cell output.
stats.ttest_ind(sub_masterData['rent'], sub_masterData['home_price'])

In [None]:
# Independent two-sample t-test between the poverty-level percentage (Pov_lvl)
# and rent columns; the result is rendered as the cell output.
stats.ttest_ind(sub_masterData['Pov_lvl'], sub_masterData['rent'])

In [None]:
# Independent two-sample t-test between Population and MhhI (the renamed
# Medianhouseholdincome column); the result is rendered as the cell output.
stats.ttest_ind(unified_dataset['Population'], unified_dataset['MhhI'])