# Analyzing Police Activity with pandas

### Libraries and datasets

In [2]:
import pandas as pd

# Read 'datasets/police.csv' (path relative to the notebook's working
# directory) into the DataFrame ri, which every later cell operates on
ri = pd.read_csv('datasets/police.csv')

## 1. Preparing the data for analysis

### Examining the dataset
Instructions:
<ul>
<li>Import pandas using the alias pd.</li>
<li>Read the file police.csv into a DataFrame named ri.</li>
<li>Examine the first 5 rows of the DataFrame (known as the "head").</li>
<li>Count the number of missing values in each column: Use .isnull() to check which DataFrame elements are missing, and then take the .sum() to count the number of True values in each column.</li>
</ul>

In [3]:
# Import the pandas library as pd
import pandas as pd

# Load the contents of 'police.csv' into the DataFrame ri
ri = pd.read_csv('datasets/police.csv')

# Preview the first five rows using the notebook's rich rendering
display(ri.head(5))

# Tally missing values per column: .isnull() marks each missing cell as True,
# and .sum() then counts those True values column by column
missing_per_column = ri.isnull().sum()
print(missing_per_column)

Unnamed: 0,state,stop_date,stop_time,county_name,driver_gender,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop,district
0,RI,2005-01-04,12:55,,M,White,Equipment/Inspection Violation,Equipment,False,,Citation,False,0-15 Min,False,Zone X4
1,RI,2005-01-23,23:15,,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone K3
2,RI,2005-02-17,04:15,,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X4
3,RI,2005-02-20,17:15,,M,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False,Zone X1
4,RI,2005-02-24,01:20,,F,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X3


state                     0
stop_date                 0
stop_time                 0
county_name           91741
driver_gender          5205
driver_race            5202
violation_raw          5202
violation              5202
search_conducted          0
search_type           88434
stop_outcome           5202
is_arrested            5202
stop_duration          5202
drugs_related_stop        0
district                  0
dtype: int64


### Dropping columns
Instructions:
<ul>
<li>Examine the DataFrame's .shape to find out the number of rows and columns.</li>
<li>Drop both the county_name and state columns by passing the column names to the .drop() method as a list of strings.</li>
<li>Examine the .shape again to verify that there are now two fewer columns.</li>
</ul>

In [4]:
# Examine the shape of the DataFrame before dropping anything
print(ri.shape)

# Drop the 'county_name' and 'state' columns.
# 'county_name' is missing in every row (91741 of 91741 per the counts above);
# 'state' is 'RI' in the rows displayed above -- presumably constant, verify
# before relying on that.
# The result is rebound to ri instead of using inplace=True, so the cell
# builds a new frame rather than mutating the one earlier cells displayed.
ri = ri.drop(columns=['county_name', 'state'])

# Examine the shape again: the column count should be two lower
print(ri.shape)

(91741, 15)
(91741, 13)


### Dropping rows
Instructions:
<ul>
<li>Count the number of missing values in each column.</li>
<li>Drop all rows that are missing driver_gender by passing the column name to the subset parameter of .dropna().</li>
<li>Count the number of missing values in each column again, to verify that none of the remaining rows are missing driver_gender.</li>
<li>Examine the DataFrame's .shape to see how many rows and columns remain.</li>
</ul>

In [5]:
# Count the number of missing values in each column
print(ri.isnull().sum())

# Drop all rows that are missing 'driver_gender'.
# Only 'driver_gender' is used as the criterion: 'search_type' is missing in
# most rows, so an unrestricted dropna() would remove most of the data.
# The result is rebound to ri instead of using inplace=True, so the cell
# builds a new frame rather than mutating the existing one in place.
ri = ri.dropna(subset=['driver_gender'])

# Count the missing values again to confirm 'driver_gender' has none left
print(ri.isnull().sum())

# Examine the shape of the DataFrame to see how many rows remain
print(ri.shape)

stop_date                 0
stop_time                 0
driver_gender          5205
driver_race            5202
violation_raw          5202
violation              5202
search_conducted          0
search_type           88434
stop_outcome           5202
is_arrested            5202
stop_duration          5202
drugs_related_stop        0
district                  0
dtype: int64
stop_date                 0
stop_time                 0
driver_gender             0
driver_race               0
violation_raw             0
violation                 0
search_conducted          0
search_type           83229
stop_outcome              0
is_arrested               0
stop_duration             0
drugs_related_stop        0
district                  0
dtype: int64
(86536, 13)


### Fixing a data type
Instructions:
<ul>
<li>Examine the head of the is_arrested column to verify that it contains True and False values and to check the column's data type.</li>
<li>Use the .astype() method to convert is_arrested to a bool column.</li>
<li>Check the new data type of is_arrested to confirm that it is now a bool column.</li>
</ul>

In [6]:
# Peek at 'is_arrested': the values read True/False but the dtype is object
print(ri['is_arrested'].head())

# Cast 'is_arrested' to a genuine boolean column.
# This is only safe because no missing values remain in the column at this
# point (see the counts printed after the dropna step): astype(bool) would
# silently turn a NaN into True.
ri['is_arrested'] = ri['is_arrested'].astype(bool)

# Confirm the new data type of 'is_arrested'
print(ri['is_arrested'].dtype)

0    False
1    False
2    False
3     True
4    False
Name: is_arrested, dtype: object
bool


### Combining object columns
Instructions:
<ul>
<li>Use a string method to concatenate stop_date and stop_time (separated by a space), and store the result in combined.</li>
<li>Convert combined to datetime format, and store the result in a new column named stop_datetime.</li>
<li>Examine the DataFrame .dtypes to confirm that stop_datetime is a datetime column.</li>
</ul>

In [7]:
# Join each row's 'stop_date' and 'stop_time' strings with a single space,
# producing one "date time" string per row
combined = ri['stop_date'].str.cat(ri['stop_time'], sep=' ')

# Parse the combined strings into datetimes and store them as a new column
ri['stop_datetime'] = pd.to_datetime(combined)

# List every column's dtype to confirm 'stop_datetime' is a datetime column
print(ri.dtypes)

stop_date                     object
stop_time                     object
driver_gender                 object
driver_race                   object
violation_raw                 object
violation                     object
search_conducted                bool
search_type                   object
stop_outcome                  object
is_arrested                     bool
stop_duration                 object
drugs_related_stop              bool
district                      object
stop_datetime         datetime64[ns]
dtype: object


### Setting the index
Instructions:
<ul>
<li>Set stop_datetime as the DataFrame index.</li>
<li>Examine the index to verify that it is a DatetimeIndex.</li>
<li>Examine the DataFrame columns to confirm that stop_datetime is no longer one of the columns.</li>
</ul>

In [8]:
# Set 'stop_datetime' as the index.
# set_index moves the column into the index, so it no longer appears among
# the columns afterwards. The result is rebound to ri instead of using
# inplace=True, so the cell builds a new frame rather than mutating in place.
ri = ri.set_index('stop_datetime')

# Examine the index to verify that it is a DatetimeIndex
print(ri.index)

# Examine the columns to confirm 'stop_datetime' is no longer listed
print(ri.columns)

DatetimeIndex(['2005-01-04 12:55:00', '2005-01-23 23:15:00',
               '2005-02-17 04:15:00', '2005-02-20 17:15:00',
               '2005-02-24 01:20:00', '2005-03-14 10:00:00',
               '2005-03-29 21:55:00', '2005-04-04 21:25:00',
               '2005-07-14 11:20:00', '2005-07-14 19:55:00',
               ...
               '2015-12-31 13:23:00', '2015-12-31 18:59:00',
               '2015-12-31 19:13:00', '2015-12-31 20:20:00',
               '2015-12-31 20:50:00', '2015-12-31 21:21:00',
               '2015-12-31 21:59:00', '2015-12-31 22:04:00',
               '2015-12-31 22:09:00', '2015-12-31 22:47:00'],
              dtype='datetime64[ns]', name='stop_datetime', length=86536, freq=None)
Index(['stop_date', 'stop_time', 'driver_gender', 'driver_race',
       'violation_raw', 'violation', 'search_conducted', 'search_type',
       'stop_outcome', 'is_arrested', 'stop_duration', 'drugs_related_stop',
       'district'],
      dtype='object')


## 2. Exploring the relationship between gender and policing

## 3. Visual exploratory data analysis

## 4. Analyzing the effect of weather on policing