# Finding datasets that have stopped being updated
This notebook shows how to find datasets that have stopped being updated.
It looks across all datasets, identifies those that are no longer being updated, and then
analyzes them by table type to show which table types have the highest percentage of stopped datasets.
This can be extended to find, for departments that have stopped releasing certain types of data, which other types of data they are still releasing.

In [1]:
import openpolicedata as opd
from openpolicedata.exceptions import OPD_DataUnavailableError
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Get all datasets but exclude the ones served from openpolicing.stanford.edu to prevent duplicates
all_datasets = opd.datasets.query()

# Cast to str so missing URLs become the text 'nan' and the substring test below
# returns a clean True/False mask instead of NaN.
all_datasets['source_url'] = all_datasets['source_url'].astype(str)

# regex=False matches the host name literally; with the default regex matching,
# each '.' in the pattern would match any character.
is_stanford = all_datasets['source_url'].str.contains("openpolicing.stanford.edu", regex=False)
all_datasets = all_datasets[~is_stanford]

# One row per distinct (SourceName, State) pair that remains after filtering
unique_source_state_combinations = all_datasets[['SourceName', 'State']].drop_duplicates()
print(unique_source_state_combinations)

In [None]:
# Keep only the columns needed for the coverage analysis
selected_columns = ['State', 'SourceName', 'TableType', 'coverage_start', 'coverage_end']


def _years_covered(row):
    """Return every calendar year from the row's coverage_start year through its coverage_end year, inclusive."""
    first_year = int(row['coverage_start'].year)
    last_year = int(row['coverage_end'].year)
    return list(range(first_year, last_year + 1))


# Rows missing either coverage date are dropped before the years are computed
filtered_df = pd.DataFrame(all_datasets, columns=selected_columns).dropna(
    subset=['coverage_start', 'coverage_end'])

# Each row gets ONE list holding all of its covered years (the rows are not
# exploded into one row per year); the original date columns are then removed.
filtered_df['ListOfYears'] = filtered_df.apply(_years_covered, axis=1)
filtered_df = filtered_df.drop(columns=['coverage_start', 'coverage_end'])

print(filtered_df.head())


In [None]:

# Start from a copy of the per-dataset frame built above, so that re-running this
# cell never mutates `filtered_df` in place.
df = filtered_df.copy()
# Keep only the text before " - " in TableType; values without " - " are unchanged
df['TableType'] = df['TableType'].str.split(' - ').str[0]
# Remove all rows that have no list of covered years
df = df.dropna(subset=['ListOfYears']).copy()

# First and last covered year of each dataset row
grouped = df
grouped['MinYear'] = grouped['ListOfYears'].apply(min)
grouped['MaxYear'] = grouped['ListOfYears'].apply(max)

# Create a label for plotting
grouped['Label'] = grouped['SourceName'] + ', ' + grouped['State'] + ', ' + grouped['TableType']

# Horizontal bar chart: one row on the vertical axis per unique
# 'SourceName, State, TableType' label, with the bar spanning MinYear..MaxYear.
# The rows are positioned explicitly (0..n-1 in order of first appearance) so that
# the bars AND the year annotations share the same y coordinates. Using the
# DataFrame index for the text would misplace it, because the index has gaps
# after the earlier filtering/dropna steps.
bar_positions, bar_labels = pd.factorize(grouped['Label'])

fig, ax = plt.subplots(figsize=(7, 60))
ax.barh(bar_positions, grouped['MaxYear'] - grouped['MinYear'], left=grouped['MinYear'], color='blue', height=0.8)
ax.set_yticks(range(len(bar_labels)))
ax.set_yticklabels(bar_labels)
ax.set_xlabel('Year')
ax.set_ylabel('Source, State, Table Type')
ax.set_title('Years of Data Available by Source, State, and Table Type')

# Write the covered range at the left edge of each bar
for y_pos, min_year, max_year in zip(bar_positions, grouped['MinYear'], grouped['MaxYear']):
    ax.text(min_year, y_pos, f"{min_year} - {max_year}", va='center', color='white')

plt.show()


In [None]:
# Compare, per table type, how many datasets exist and how many have stopped.
# A dataset counts as stopped when its last covered year is earlier than current_year - 1.
current_year = 2024
# Table types with fewer datasets than this are left out of the comparison
minimum_tabletype_counts = 10

grouped['MaxYear'] = grouped['MaxYear'].fillna(0).astype(int)

# Number of datasets per table type, restricted to the sufficiently common types
tabletype_counts = grouped['TableType'].value_counts().loc[
    lambda counts: counts >= minimum_tabletype_counts]
grouped = grouped[grouped['TableType'].isin(tabletype_counts.index)]

stopped_datasets = grouped.loc[grouped['MaxYear'].lt(current_year - 1)]
stopped_tabletype_counts = stopped_datasets['TableType'].value_counts()

# Side-by-side bar charts: all datasets (left) and stopped datasets (right) by table type
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
panels = [
    (tabletype_counts, 'Number of All Datasets by Table Type'),
    (stopped_tabletype_counts, 'Number of Stopped Datasets by Table Type'),
]
for panel_ax, (panel_counts, panel_title) in zip(axes, panels):
    panel_counts.plot(kind='bar', ax=panel_ax)
    panel_ax.set_xlabel('Table Type')
    panel_ax.set_ylabel('Number of Datasets')
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Share of stopped datasets per table type, highest first; table types without
# any stopped dataset get a ratio of 0.
ratio = (stopped_tabletype_counts / tabletype_counts).fillna(0).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
ratio_ax = ratio.plot(kind='bar')
ratio_ax.set_xlabel('Table Type')
ratio_ax.set_ylabel('Ratio of Stopped Datasets to All Datasets')
ratio_ax.set_title('Ratio of Stopped Datasets to All Datasets by Table Type')
plt.show()

In [None]:
# For each unique combination of SourceName, State, and AgencyFull, count the distinct
# TableTypes whose coverage ends in the current year or the year before it.
recent_years = [current_year, current_year - 1]
recent_mask = all_datasets['coverage_end'].dt.year.isin(recent_years)
current_year_datasets = all_datasets[recent_mask]

# Largest counts first
current_year_datasets_sum = (
    current_year_datasets
    .groupby(['SourceName', 'State', 'AgencyFull'])['TableType']
    .nunique()
    .sort_values(ascending=False)
)

print(current_year_datasets_sum)


In [None]:
# For each unique combination of SourceName, State, and AgencyFull, count the distinct
# TableTypes of stopped datasets. A dataset is stopped when its coverage ends before the
# previous year, i.e. it was released neither this year nor the previous year.
#
# The earlier form `(year != current_year) | (year != current_year - 1)` was true for
# every row (no year equals both values), so nothing was filtered out. The `<` test
# below matches the `MaxYear < (current_year-1)` definition used for the table-type
# analysis above. Rows with a missing coverage_end compare as False and are therefore
# not counted as stopped.
# NOTE(review): if a source publishes one dataset row per year, an older row still
# counts its TableType as stopped even when a newer row exists - confirm this is intended.
coverage_end_year = all_datasets['coverage_end'].dt.year
stopped_current_year_datasets = all_datasets[coverage_end_year < (current_year - 1)]

# Group the stopped datasets by SourceName, State, and AgencyFull, and count the unique TableTypes
stopped_current_year_datasets_sum = stopped_current_year_datasets.groupby(['SourceName', 'State', 'AgencyFull'])['TableType'].nunique()

# Sort the stopped-dataset counts in descending order
stopped_current_year_datasets_sum = stopped_current_year_datasets_sum.sort_values(ascending=False)

# Print the number of stopped table types for each unique combination of SourceName, State, and AgencyFull
print(stopped_current_year_datasets_sum)


In [None]:
# Scatter plot of current vs. stopped table types per agency.
# First keep only the (SourceName, State, AgencyFull) combinations present in BOTH
# current_year_datasets_sum and stopped_current_year_datasets_sum (inner merge), then
# plot the number of current table types on the x-axis and the number of stopped
# table types on the y-axis, with one legend entry per combination.
merged_df = pd.merge(current_year_datasets_sum, stopped_current_year_datasets_sum,
                     on=['SourceName', 'State', 'AgencyFull'],
                     suffixes=('_current', '_stopped'))

markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'lime', 'navy', 'teal']

fig, ax = plt.subplots(figsize=(10, 6))
labels = []
handles = []

for i, (index_key, current, stopped) in enumerate(
        zip(merged_df.index, merged_df['TableType_current'], merged_df['TableType_stopped'])):
    index_label = ', '.join(map(str, index_key))
    marker = markers[i % len(markers)]
    # Shift the colour by one after every full pass through the markers so that
    # marker/colour pairs do not repeat after len(markers) points. The first
    # len(markers) points keep the same pairing as a plain `i % len(colors)`.
    color = colors[(i + i // len(markers)) % len(colors)]
    # Use the scatter artist itself as the legend handle. A hand-built Line2D proxy
    # with color='w' draws unfilled markers ('+', 'x', '|', '_') in white, which
    # makes them invisible in the legend.
    point = ax.scatter(current, stopped, color=color, s=100, marker=marker)
    labels.append(index_label)
    handles.append(point)

ax.legend(handles, labels, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
ax.set_xlabel('Number of New Datasets')
ax.set_ylabel('Number of Stopped Datasets')
ax.set_title('New vs. Stopped Datasets by Source, State, and AgencyFull')
plt.show()