# Download the dataset from this website

## https://www.kaggle.com/faressayah/stanford-open-policing-project?select=police_project.csv

## Description::

On a typical day in the United States, police officers make more than 50,000 traffic stops. Our team is gathering, analyzing, and releasing records from millions of traffic stops by law enforcement agencies across the country. Our goal is to help researchers, journalists, and policymakers investigate and improve interactions between police and the public.


## Importing libraries::

In [1]:
import pandas as pd
import numpy as np

# Use pandas' read_csv function to open the dataset as a DataFrame

In [4]:
# Load the Kaggle CSV into a DataFrame. The path is relative, so the file
# must sit in the notebook's working directory.
df = pd.read_csv('police_project.csv')

# What does each row represent?

#### Hint:
`head`: Return the first n rows (the first 5 rows by default).

In [5]:
# Preview the first five rows; each row is one traffic stop record.
df.head(n=5)

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,2005-01-02,01:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,2005-01-18,08:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,2005-01-23,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2005-02-20,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,2005-03-14,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


# How to get the basic statistics of all the columns?

In [7]:
# include='all' summarises every column, not just the numeric ones:
# non-numeric columns get count/unique/top/freq, numeric columns get
# mean/std/quantiles, and the cells that do not apply are left as NaN.
df.describe(include='all')

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
count,91741,91741,0.0,86406,86414.0,86120.0,86408,86408,86408,91741,3196,86408,86408,86408,91741
unique,3768,1436,,2,,,5,12,6,2,24,6,2,5,2
top,2012-01-11,11:00,,M,,,White,Speeding,Speeding,False,Incident to Arrest,Citation,False,0-15 Min,False
freq,69,323,,62895,,,62158,48463,48463,88545,1219,77006,83479,69543,90926
mean,,,,,1970.491228,34.011333,,,,,,,,,
std,,,,,110.914909,12.738564,,,,,,,,,
min,,,,,0.0,15.0,,,,,,,,,
25%,,,,,1967.0,23.0,,,,,,,,,
50%,,,,,1980.0,31.0,,,,,,,,,
75%,,,,,1987.0,43.0,,,,,,,,,


# How to check the shape of the dataset?

In [8]:
# Tuple of (number of rows, number of columns).
df.shape

(91741, 15)

# How to check the data type of each column?

In [10]:
# Data type of each column, as inferred by read_csv (no dtypes were specified on load).
df.dtypes

stop_date              object
stop_time              object
county_name           float64
driver_gender          object
driver_age_raw        float64
driver_age            float64
driver_race            object
violation_raw          object
violation              object
search_conducted         bool
search_type            object
stop_outcome           object
is_arrested            object
stop_duration          object
drugs_related_stop       bool
dtype: object

# Locating missing values
#### Detect the missing values
#### Then sum them to count the missing values in each column


In [11]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,stop_date,stop_time,county_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91736,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
91737,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
91738,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
91739,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False


In [12]:
# Summing the boolean mask column-wise counts the missing values per column
# (each True contributes 1).
df.isna().sum()

stop_date                 0
stop_time                 0
county_name           91741
driver_gender          5335
driver_age_raw         5327
driver_age             5621
driver_race            5333
violation_raw          5333
violation              5333
search_conducted          0
search_type           88545
stop_outcome           5333
is_arrested            5333
stop_duration          5333
drugs_related_stop        0
dtype: int64

# Dropping columns that contain only missing values

In [15]:
# Drop every column whose values are all missing (in this dataset that is
# county_name, which is null in all 91741 rows).
# The result is rebound to df rather than using inplace=True: dropna returns
# a new frame, so the cell stays chainable and does not mutate the object
# in place.
df = df.dropna(axis=1, how='all')

In [16]:
# Display the frame to confirm that the all-missing county_name column is gone.
df

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,2005-01-02,01:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,2005-01-18,08:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,2005-01-23,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2005-02-20,17:15,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,2005-03-14,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91736,2015-12-31,20:27,M,1986.0,29.0,White,Speeding,Speeding,False,,Warning,False,0-15 Min,False
91737,2015-12-31,20:35,F,1982.0,33.0,White,Equipment/Inspection Violation,Equipment,False,,Warning,False,0-15 Min,False
91738,2015-12-31,20:45,M,1992.0,23.0,White,Other Traffic Violation,Moving violation,False,,Warning,False,0-15 Min,False
91739,2015-12-31,21:42,M,1993.0,22.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


# Do men or women speed more often?

# Which year had the least number of stops?

# Does gender affect who gets searched during a stop?


# How does drug activity change by time of day?

# Do most stops occur at night?