In [1]:
# Importing dependencies required for our analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import scipy.stats as st
import seaborn as sns

import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): this also hides pandas warnings (for example chained-assignment
# or mixed-dtype warnings) that later cells may raise; consider narrowing the
# filter to specific categories once the notebook runs cleanly.
warnings.filterwarnings("ignore")

In [13]:
# Load the crime statistics CSV (path is relative to the notebook's working directory)
crime_csv_path = "Crime Stats Cleaned v2.csv"
crime_df = pd.read_csv(crime_csv_path)

In [14]:
# Preview the first rows of the loaded data to check that the columns were parsed as expected
crime_df.head()

Unnamed: 0,Year,County,Month,County Pop,County Pop ('000s),Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LTtotal_sum
0,2019.0,Alameda County,1.0,1668412,1668,50,0,1,19,30,195,23,48,124
1,2019.0,Alameda County,2.0,1668412,1668,49,1,2,19,27,187,34,47,106
2,2019.0,Alameda County,3.0,1668412,1668,52,1,1,20,30,153,20,23,110
3,2019.0,Alameda County,4.0,1668412,1668,60,0,3,15,42,174,41,34,99
4,2019.0,Alameda County,5.0,1668412,1668,62,0,1,23,38,180,31,38,111


In [30]:
# Check the data type pandas inferred for each column.
# NOTE(review): the recorded output lists the renamed columns (e.g. 'Country_Pop'),
# so this cell appears to have been executed after the rename cell further down
# (execution count 30 vs. 16). On a fresh top-to-bottom run it would list the
# original column names instead - re-run the notebook in order before sharing.
crime_df.dtypes

Year                  float64
County                 object
Month                 float64
Country_Pop            object
County_Pop_('000s)     object
Violent_sum            object
Homicide_sum           object
ForRape_sum            object
Robbery_sum            object
AggAssault_sum         object
Property_sum           object
Burglary_sum           object
VehicleTheft_sum       object
LarcenyTheft_sum       object
dtype: object

In [16]:
# Give the population and larceny columns more readable, space-free names.
# NOTE(review): 'Country_Pop' looks like a typo for 'County_Pop'; the name is
# kept as-is because other cells may reference it - confirm before changing.
readable_names = {
    'County Pop': 'Country_Pop',
    "County Pop ('000s)": "County_Pop_('000s)",
    'LTtotal_sum': 'LarcenyTheft_sum',
}
crime_df.rename(columns=readable_names, inplace=True)
crime_df

Unnamed: 0,Year,County,Month,Country_Pop,County_Pop_('000s),Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LarcenyTheft_sum
0,2019.0,Alameda County,1.0,1668412,1668,50,0,1,19,30,195,23,48,124
1,2019.0,Alameda County,2.0,1668412,1668,49,1,2,19,27,187,34,47,106
2,2019.0,Alameda County,3.0,1668412,1668,52,1,1,20,30,153,20,23,110
3,2019.0,Alameda County,4.0,1668412,1668,60,0,3,15,42,174,41,34,99
4,2019.0,Alameda County,5.0,1668412,1668,62,0,1,23,38,180,31,38,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34582,2022.0,Yuba County,9.0,84310,84,0,0,0,0,0,0,0,0,0
34583,2022.0,Yuba County,10.0,84310,84,0,0,0,0,0,2,0,0,2
34584,2022.0,Yuba County,11.0,84310,84,0,0,0,0,0,0,0,0,0
34585,2022.0,Yuba County,12.0,84310,84,0,0,0,0,0,1,0,0,1


In [33]:
# Strip leading/trailing whitespace from the text-typed columns, treat empty
# strings as missing, drop every row that has a missing value, then convert
# all columns except "County" to int64 and "County" to the pandas string dtype.

# .copy() makes crime_df_cleaned an independent frame, so the column
# assignments below modify it directly rather than a possible view of crime_df.
crime_df_cleaned = crime_df.dropna().copy()

# Every column except the county name is expected to hold whole numbers
numeric_cols = [c for c in crime_df_cleaned.columns if c != "County"]

for c in numeric_cols:
    if crime_df_cleaned[c].dtype == "object":
        # Assign the result back to the column: an inplace replace on a
        # selected column is not guaranteed to update the parent frame.
        crime_df_cleaned[c] = crime_df_cleaned[c].str.strip().replace('', np.nan)

# Remove every row that has a missing value in any column
crime_df_cleaned = crime_df_cleaned.dropna()

# Convert to int (raises if a remaining value is not a whole number)
crime_df_cleaned = crime_df_cleaned.astype({c: "int64" for c in numeric_cols})

crime_df_cleaned["County"] = crime_df_cleaned["County"].astype('string')
crime_df_cleaned

Unnamed: 0,Year,County,Month,Country_Pop,County_Pop_('000s),Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LarcenyTheft_sum
0,2019,Alameda County,1,1668412,1668,50,0,1,19,30,195,23,48,124
1,2019,Alameda County,2,1668412,1668,49,1,2,19,27,187,34,47,106
2,2019,Alameda County,3,1668412,1668,52,1,1,20,30,153,20,23,110
3,2019,Alameda County,4,1668412,1668,60,0,3,15,42,174,41,34,99
4,2019,Alameda County,5,1668412,1668,62,0,1,23,38,180,31,38,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34581,2022,Yuba County,8,84310,84,0,0,0,0,0,0,0,0,0
34582,2022,Yuba County,9,84310,84,0,0,0,0,0,0,0,0,0
34583,2022,Yuba County,10,84310,84,0,0,0,0,0,2,0,0,2
34584,2022,Yuba County,11,84310,84,0,0,0,0,0,0,0,0,0


In [34]:
# Check the data types after cleaning: every column except "County" should now be int64
crime_df_cleaned.dtypes

Year                   int64
County                string
Month                  int64
Country_Pop            int64
County_Pop_('000s)     int64
Violent_sum            int64
Homicide_sum           int64
ForRape_sum            int64
Robbery_sum            int64
AggAssault_sum         int64
Property_sum           int64
Burglary_sum           int64
VehicleTheft_sum       int64
LarcenyTheft_sum       int64
dtype: object

In [11]:
# Identify the unique values of selected crime columns.
# inspect_cols maps the label that is printed to the column it describes.
# Only 'Violent_sum' is inspected here to keep the output short; add further
# label/column pairs (for example 'Homicide': 'Homicide_sum') to print the
# unique values of other crime columns as well.
inspect_cols = {'Violent': 'Violent_sum'}
for label, col in inspect_cols.items():
    print(label, crime_df_cleaned[col].unique())

# Kept as a named variable in case it is referenced elsewhere in the notebook
unique_v = crime_df_cleaned["Violent_sum"].unique()

Violent [  50   49   52   60   62   53   51   54   43   46   70    8   13   12
   16   15   17   11   14    5    3    6    4    2    1   37   56   44
   63   48   55   24   10    9   29   21   34   38   47   35   33   26
   41   39   58   45   57   40   23   19   22    7  472  368  438  465
  457  500  453  477  516  491  428  455    0   36   32   42   28   27
   20   25   18   30   59   31   64   73   66   89   71  100  110   93
   80  107   92   84   79   95   87  120   99  111   76   81  227  178
  212  221  274  282  324  283  218  238  209  170  188  233  222  265
  335  263  242  239  245  253  125  138  132  147  156  167  163  165
  148  157  131  137  408  374  427  476  418  570  538  499  482  436
  483  117   91   94  103   82   75   65   69   68  193  183  198  197
  173  201  205 2352 2055 2503 2440 2485 2607 2784 2559 2478 2474 2258
 2405   61  104   98   67   83  109  126  154  136  135  129  101  130
  105  121  123  102   74  112   86  108  106  146  139  119   72   7

In [35]:
# Keep only the rows in which none of the crime counts is negative
crime_cols = [
    'Violent_sum', 'Homicide_sum', 'ForRape_sum', 'Robbery_sum', 'AggAssault_sum',
    'Property_sum', 'Burglary_sum', 'VehicleTheft_sum', 'LarcenyTheft_sum',
]
# One boolean per row: True when every crime column is >= 0
non_negative_rows = (crime_df_cleaned[crime_cols] >= 0).all(axis=1)
crime_df_cleaned = crime_df_cleaned[non_negative_rows]

crime_df_cleaned

Unnamed: 0,Year,County,Month,Country_Pop,County_Pop_('000s),Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LarcenyTheft_sum
0,2019,Alameda County,1,1668412,1668,50,0,1,19,30,195,23,48,124
1,2019,Alameda County,2,1668412,1668,49,1,2,19,27,187,34,47,106
2,2019,Alameda County,3,1668412,1668,52,1,1,20,30,153,20,23,110
3,2019,Alameda County,4,1668412,1668,60,0,3,15,42,174,41,34,99
4,2019,Alameda County,5,1668412,1668,62,0,1,23,38,180,31,38,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34581,2022,Yuba County,8,84310,84,0,0,0,0,0,0,0,0,0
34582,2022,Yuba County,9,84310,84,0,0,0,0,0,0,0,0,0
34583,2022,Yuba County,10,84310,84,0,0,0,0,0,2,0,0,2
34584,2022,Yuba County,11,84310,84,0,0,0,0,0,0,0,0,0


In [39]:
# Persist the cleaned data to disk, then load it back from the same file.
# NOTE(review): to_csv is called with its default index handling, so the row
# index is written as the first column and will come back as an extra unnamed
# column in newdf; pass index=False if that column is not wanted - confirm
# that nothing later in the notebook relies on it.
cleaned_csv_path = "crime_df_cleaned_lastV.csv"
crime_df_cleaned.to_csv(cleaned_csv_path)
newdf = pd.read_csv(cleaned_csv_path)