In [1]:
# import libraries
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

In [2]:
# Load the data: it ships as a zip archive, so unpack it first.
# extractall(path=None) writes the archive members into the current working directory.
import zipfile
with zipfile.ZipFile(file='los-angeles-traffic-collision-data.zip', mode='r') as archive:
    archive.extractall(path=None)

In [2]:
# Read the collision CSV into a DataFrame; every later cell works on `la_df`.
la_df = pd.read_csv(
    filepath_or_buffer='traffic-collision-data-from-2010-to-present.csv',
)

### Just a note: you can access the docstring of any function, method, or variable (pretty much anything) by adding '?' after a command. Example: 
- la_df.head?
- la_df?
- pd.read_csv?

In [3]:
# Preview the first five rows of the freshly loaded frame.
la_df.head(n=5)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Premise Description,Address,Cross Street,Location,Zip Codes,Census Tracts,Precinct Boundaries,LA Specific Plans,Council Districts,Neighborhood Councils (Certified)
0,190316163,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1215,3,Southwest,351,997,TRAFFIC COLLISION,,...,STREET,EXPOSITION BL,LA BREA AV,"{'latitude': '34.025', 'human_address': '{""add...",23077.0,669.0,986.0,,12.0,17.0
1,192112389,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,830,21,Topanga,2157,997,TRAFFIC COLLISION,,...,STREET,DE SOTO,KITTRIDGE,"{'latitude': '34.1938', 'human_address': '{""ad...",4282.0,294.0,1547.0,2.0,4.0,66.0
2,190614945,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1240,6,Hollywood,643,997,TRAFFIC COLLISION,,...,STREET,FRANKLIN AV,FULLER AV,"{'latitude': '34.1035', 'human_address': '{""ad...",23677.0,456.0,904.0,,7.0,82.0
3,191415527,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1350,14,Pacific,1457,997,TRAFFIC COLLISION,1402.0,...,STREET,12300 CULVER BL,,"{'latitude': '33.992', 'human_address': '{""add...",24031.0,918.0,1137.0,10.0,10.0,85.0
4,191011897,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1105,10,West Valley,1025,997,TRAFFIC COLLISION,,...,STREET,SHERMAN WY,LINDLEY AV,"{'latitude': '34.2011', 'human_address': '{""ad...",18909.0,258.0,284.0,,4.0,12.0


In [7]:
# Make the 'DR Number' column the row index so rows can be looked up by it with .loc.
# Reassign rather than use inplace=True, and skip the step when the index is already
# 'DR Number': once the column has been moved into the index, calling set_index again
# would raise KeyError, so this guard keeps the cell safe to re-run.
if la_df.index.name != 'DR Number':
    la_df = la_df.set_index('DR Number')

In [8]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
la_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 479425 entries, 190316163 to 111904006
Data columns (total 23 columns):
Date Reported                        479425 non-null object
Date Occurred                        479425 non-null object
Time Occurred                        479425 non-null int64
Area ID                              479425 non-null int64
Area Name                            479425 non-null object
Reporting District                   479425 non-null int64
Crime Code                           479425 non-null int64
Crime Code Description               479425 non-null object
MO Codes                             394280 non-null object
Victim Age                           401954 non-null float64
Victim Sex                           472433 non-null object
Victim Descent                       471712 non-null object
Premise Code                         479400 non-null float64
Premise Description                  479400 non-null object
Address                              479

In [10]:
# Show only the first row; the frame is indexed by 'DR Number' at this point.
la_df.head(n=1)

Unnamed: 0_level_0,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,Victim Age,...,Premise Description,Address,Cross Street,Location,Zip Codes,Census Tracts,Precinct Boundaries,LA Specific Plans,Council Districts,Neighborhood Councils (Certified)
DR Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
190316163,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1215,3,Southwest,351,997,TRAFFIC COLLISION,,46.0,...,STREET,EXPOSITION BL,LA BREA AV,"{'latitude': '34.025', 'human_address': '{""add...",23077.0,669.0,986.0,,12.0,17.0


In [13]:
# To view only specific columns, pass a list of column names: df[['name', 'name2']].head()
la_df[
    ['Area Name', 'Crime Code', 'Victim Age']
].head(n=5)

Unnamed: 0_level_0,Area Name,Crime Code,Victim Age
DR Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
190316163,Southwest,997,46.0
192112389,Topanga,997,32.0
190614945,Hollywood,997,29.0
191415527,Pacific,997,
191011897,West Valley,997,45.0


You can index the rows by using the loc and iloc accessors.

- loc does label-based indexing
- iloc performs integer-based indexing

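You can pass a list of labels (for loc) or integer positions (for iloc) to access multiple rows or columns at the same time.
Both support slicing with [start:end:step]. Note that, unlike standard Python slicing, a loc slice includes the end label, while an iloc slice excludes the end position.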

In [18]:
# Label-based lookup of one row by its index value; ':' keeps every column.
la_df.loc[190316163, :]

Date Reported                                                  2019-07-06T00:00:00.000
Date Occurred                                                  2019-07-06T00:00:00.000
Time Occurred                                                                     1215
Area ID                                                                              3
Area Name                                                                    Southwest
Reporting District                                                                 351
Crime Code                                                                         997
Crime Code Description                                               TRAFFIC COLLISION
MO Codes                                                                           NaN
Victim Age                                                                          46
Victim Sex                                                                           F
Victim Descent                             

In [17]:
# For two or more rows, pass a list of index labels (hence the double brackets): df.loc[[index, index2]]
la_df.loc[[190316163, 191011897], :]

Unnamed: 0_level_0,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,Victim Age,...,Premise Description,Address,Cross Street,Location,Zip Codes,Census Tracts,Precinct Boundaries,LA Specific Plans,Council Districts,Neighborhood Councils (Certified)
DR Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
190316163,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1215,3,Southwest,351,997,TRAFFIC COLLISION,,46.0,...,STREET,EXPOSITION BL,LA BREA AV,"{'latitude': '34.025', 'human_address': '{""add...",23077.0,669.0,986.0,,12.0,17.0
191011897,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1105,10,West Valley,1025,997,TRAFFIC COLLISION,,45.0,...,STREET,SHERMAN WY,LINDLEY AV,"{'latitude': '34.2011', 'human_address': '{""ad...",18909.0,258.0,284.0,,4.0,12.0


In [20]:
# iloc selects by integer position: 0, 1, 2, ... address the 1st row, 2nd row, etc.
# A notebook cell only renders its last expression, so display() (provided by the
# IPython kernel) is used here; otherwise this lookup's result would be discarded.
display(la_df.iloc[1])  # the second row

# Slice [start:end:step]: every 5th row starting at position 10 and stopping
# before position 25, i.e. positions 10, 15 and 20.
la_df.iloc[10:25:5]

Unnamed: 0_level_0,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,Victim Age,...,Premise Description,Address,Cross Street,Location,Zip Codes,Census Tracts,Precinct Boundaries,LA Specific Plans,Council Districts,Neighborhood Councils (Certified)
DR Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191011898,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1130,10,West Valley,1028,997,TRAFFIC COLLISION,,25.0,...,STREET,SHERMAN WY,FORBES AV,"{'latitude': '34.2012', 'human_address': '{""ad...",19734.0,263.0,436.0,,3.0,61.0
190411883,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,950,4,Hollenbeck,423,997,TRAFFIC COLLISION,,60.0,...,STREET,MAIN,JOHNSTON,"{'latitude': '34.066', 'human_address': '{""add...",23448.0,494.0,619.0,,11.0,1.0
192112342,2019-07-05T00:00:00.000,2019-07-05T00:00:00.000,1035,21,Topanga,2156,997,TRAFFIC COLLISION,,45.0,...,STREET,OWENSMOUTH AV,VICTORY BL,"{'latitude': '34.1878', 'human_address': '{""ad...",4278.0,299.0,434.0,2.0,4.0,49.0


In [21]:
# Positional selection on both axes: the first 5 rows and the first 5 columns.
la_df.iloc[0:5, 0:5]

Unnamed: 0_level_0,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name
DR Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
190316163,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1215,3,Southwest
192112389,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,830,21,Topanga
190614945,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1240,6,Hollywood
191415527,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1350,14,Pacific
191011897,2019-07-06T00:00:00.000,2019-07-06T00:00:00.000,1105,10,West Valley


In [26]:
# Show all rows from the start up to and including index label 190614945,
# restricted to the columns named 'Crime Code' and 'Victim Age'.
la_df.loc[slice(None, 190614945), ['Crime Code', 'Victim Age']]

Unnamed: 0_level_0,Crime Code,Victim Age
DR Number,Unnamed: 1_level_1,Unnamed: 2_level_1
190316163,997,46.0
192112389,997,32.0
190614945,997,29.0
