In [1]:
# Step 1: Load "police.csv" into a pandas DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
import pandas as pd

df = pd.read_csv(filepath_or_buffer='police.csv')

In [2]:
# Step 2: Identify the columns that contain at least one missing value.
# isna() flags every null cell; any(axis=0) collapses that to one boolean per
# column, which is then used as a mask over the column labels.
missing_columns = df.columns[df.isna().any(axis=0)]
print("Columns with missing values:", missing_columns)

Columns with missing values: Index(['county_name', 'driver_gender', 'driver_age_raw', 'driver_age',
       'driver_race', 'violation_raw', 'violation', 'search_type',
       'stop_outcome', 'is_arrested', 'stop_duration'],
      dtype='object')


In [3]:
# Step 3: Count the missing values in every column.
# Summing the boolean null-mask down the rows gives a per-column total.
missing_values_count = df.isna().sum(axis=0)
print("Number of missing values column-wise:\n", missing_values_count)

Number of missing values column-wise:
 stop_date                 0
stop_time                 0
county_name           91741
driver_gender          5335
driver_age_raw         5327
driver_age             5621
driver_race            5333
violation_raw          5333
violation              5333
search_conducted          0
search_type           88545
stop_outcome           5333
is_arrested            5333
stop_duration          5333
drugs_related_stop        0
dtype: int64


In [4]:
# Step 4: Collect rows that repeat an earlier row exactly.
# duplicated(keep='first') marks every occurrence after the first one, so the
# first copy of each repeated row is not included in the result.
duplicate_records = df.loc[df.duplicated(keep='first')]
print("Duplicate records:\n", duplicate_records)

Duplicate records:
         stop_date stop_time  county_name driver_gender  driver_age_raw  \
16     2005-07-19     00:30          NaN             M          1982.0   
104    2005-10-03     13:26          NaN             M          1975.0   
117    2005-10-04     00:36          NaN             M          1961.0   
149    2005-10-05     10:00          NaN             M          1981.0   
239    2005-10-08     12:30          NaN             M          1978.0   
...           ...       ...          ...           ...             ...   
89720  2015-10-03     22:59          NaN             F          1968.0   
89873  2015-10-10     08:06          NaN           NaN             NaN   
90047  2015-10-16     20:37          NaN             F          1986.0   
90136  2015-10-20     11:47          NaN             M          1971.0   
90583  2015-11-05     03:14          NaN             M          1947.0   

       driver_age driver_race                   violation_raw  \
16           23.0       Wh

In [5]:
# Step 5: Discard repeated rows, keeping the first occurrence of each.
# `df` is rebound to the de-duplicated frame; the original row labels are kept
# (the index is not reset).
df = df.drop_duplicates(keep='first')

In [6]:
# Show the first 20 rows of the current DataFrame as plain text.
print(df.head(n=20))

     stop_date stop_time  county_name driver_gender  driver_age_raw  \
0   2005-01-02     01:55          NaN             M          1985.0   
1   2005-01-18     08:15          NaN             M          1965.0   
2   2005-01-23     23:15          NaN             M          1972.0   
3   2005-02-20     17:15          NaN             M          1986.0   
4   2005-03-14     10:00          NaN             F          1984.0   
5   2005-03-23     09:45          NaN             M          1982.0   
6   2005-04-01     17:30          NaN             M          1969.0   
7   2005-06-06     13:20          NaN             F          1986.0   
8   2005-07-13     10:15          NaN             M          1970.0   
9   2005-07-13     15:45          NaN             M          1970.0   
10  2005-07-13     16:20          NaN             M          1979.0   
11  2005-07-13     19:00          NaN             F          1966.0   
12  2005-07-14     19:55          NaN             M          1979.0   
13  20

In [7]:
# Step 6: Clean the "stop_duration" column by removing the "Min" suffix.
# Removing only the literal "Min" leaves the whitespace that preceded it
# (e.g. "0-15 "), so the result is also stripped of surrounding whitespace.
# regex=False makes the match an explicit literal instead of relying on the
# default of `regex`, which differs between pandas versions.
# Missing values (NaN) pass through the .str accessors unchanged.
df['stop_duration'] = (
    df['stop_duration']
    .str.replace('Min', '', regex=False)
    .str.strip()
)

In [8]:
# Show the first five cleaned "stop_duration" values.
print(df['stop_duration'].head(n=5))

0     0-15 
1     0-15 
2     0-15 
3    16-30 
4     0-15 
Name: stop_duration, dtype: object


In [9]:
# Step 7: Count the distinct values in each column.
# dropna=True (the default) means missing values are not counted as a value,
# so a column that is entirely null reports 0.
unique_values = df.nunique(axis=0, dropna=True)
print("Unique values in each column:\n", unique_values)

Unique values in each column:
 stop_date             3768
stop_time             1436
county_name              0
driver_gender            2
driver_age_raw          97
driver_age              78
driver_race              5
violation_raw           12
violation                6
search_conducted         2
search_type             24
stop_outcome             6
is_arrested              2
stop_duration            5
drugs_related_stop       2
dtype: int64


In [10]:
# Step 8: Produce a copy of the DataFrame ordered by "stop_time", ascending.
# `df` itself is left untouched; the sorted result gets its own name.
df_sorted_by_stop_time = df.sort_values('stop_time', ascending=True)

In [11]:
# Show the first five rows of the frame sorted by "stop_time".
print(df_sorted_by_stop_time.head(n=5))

       stop_date stop_time  county_name driver_gender  driver_age_raw  \
377   2005-10-16     00:00          NaN             M          1981.0   
6912  2006-05-28     00:00          NaN           NaN             NaN   
6911  2006-05-28     00:00          NaN             M          1988.0   
2350  2005-12-22     00:00          NaN             F          1979.0   
4553  2006-03-09     00:00          NaN             M          1969.0   

      driver_age driver_race                   violation_raw  violation  \
377         24.0       Black                        Speeding   Speeding   
6912         NaN         NaN                             NaN        NaN   
6911        18.0       Black                        Speeding   Speeding   
2350        26.0       Asian                        Speeding   Speeding   
4553        37.0       White  Equipment/Inspection Violation  Equipment   

      search_conducted           search_type stop_outcome is_arrested  \
377               True  Reasonable Su

In [12]:
# Step 9: Produce a copy ordered by "stop_time", breaking ties by "driver_age".
# Both keys sort ascending; rows with a missing key value are placed last
# (pandas' default na_position).
df_sorted_by_stop_time_and_age = df.sort_values(
    by=['stop_time', 'driver_age'],
    ascending=True,
)

In [13]:
# Show the first five rows of the frame sorted by "stop_time" then "driver_age".
print(df_sorted_by_stop_time_and_age.head(n=5))

        stop_date stop_time  county_name driver_gender  driver_age_raw  \
40     2005-10-01     00:00          NaN             M          1988.0   
41     2005-10-01     00:00          NaN             M          1988.0   
1865   2005-12-03     00:00          NaN             M          1988.0   
7576   2006-06-18     00:00          NaN             M          1989.0   
73853  2013-12-30     00:00          NaN             F          1996.0   

       driver_age driver_race                   violation_raw  violation  \
40           17.0       White  Equipment/Inspection Violation  Equipment   
41           17.0       White  Equipment/Inspection Violation  Equipment   
1865         17.0       White                        Speeding   Speeding   
7576         17.0       White                        Speeding   Speeding   
73853        17.0       White                        Speeding   Speeding   

       search_conducted     search_type   stop_outcome is_arrested  \
40                 True  Pro

In [14]:
# Step 10: Report the largest and smallest "driver_age".
# Series.max()/min() skip missing values by default, so NaN ages are ignored.
max_driver_age, min_driver_age = df['driver_age'].max(), df['driver_age'].min()
print("Max driver_age:", max_driver_age)
print("Min driver_age:", min_driver_age)

Max driver_age: 99.0
Min driver_age: 15.0
