# CrimePy - National Crime Victimization Survey (NCVS)
----

# Dataset: Bureau of Justice Statistics (BJS)

In [1]:
# Dependencies and Setup

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress
import scipy.stats as st
from datetime import date
import json
import xmltodict

In [2]:
# User functions to be used

def append_dict_to_df(current_dict, current_index, current_df):
    single_df = pd.DataFrame(current_dict, index=[current_index])
    if current_df.empty:
        current_df = single_df.copy()
    else:
        current_df = current_df.append(single_df)        
    current_index += 1
    return current_df,current_index

def get_incidents(endpoint, field, format_type):
    # Build query URL
    query_url = f"{base_url}{endpoint}?format={format_type}"
    # Call API and get response
    response = requests.get(query_url)
    current_json = response.json()
    return pd.DataFrame(current_json[field])

def get_fields(endpoint):
    # Build query URL
    query_url = f"{base_url}{endpoint}" # XML only
    # Call API and get response
    response = requests.get(query_url)
    fields_ord_dict = xmltodict.parse(response.content) # Ordered dictionary
    fields_idx = 0
    values_idx = 0
    fields_df = pd.DataFrame()
    values_df = pd.DataFrame()
    for key_level1 in fields_ord_dict: # 'bjsDataSetFields'
        for key_level2 in fields_ord_dict[key_level1]: # 'fields'
            for field in fields_ord_dict[key_level1][key_level2]:
                field_id = field['id']
                fields_dict = {
                    "field_id": field_id,
                    "field_name": field['name'],
                    "field_description": field['description']
                }
                fields_df,fields_idx = append_dict_to_df(fields_dict, fields_idx, fields_df)
                values = field['values']
                if type(values) is list:
                    for idx in range(len(values)):
                        value_ord_dict = values[idx]
                        values_dict = {
                            "field_id": field_id,
                            "value_name": value_ord_dict['name'],
                            "value_description": value_ord_dict['description']
                        }
                        values_df,values_idx = append_dict_to_df(values_dict, values_idx, values_df)
                else:
                    value_ord_dict = values
                    values_dict = {
                        "field_id": field_id,
                        "value_name": value_ord_dict['name'],
                        "value_description": value_ord_dict['description']
                    }
                    values_df,values_idx = append_dict_to_df(values_dict, values_idx, values_df)               
    return pd.merge(fields_df,values_df, on="field_id")

def get_desc(df, input_field1, field1_value, input_field2, field2_value, output_field):
    value = df.loc[(df[input_field1] == field1_value) & (df[input_field2] == field2_value), output_field]
    try:
        return value.values[0]
    except IndexError:
        return value.values

def translate_df(df, field_df, offset):
    new_df = df.copy()
    input_field1 = 'field_id' # does not change
    input_field2 = 'value_name' # does not change
    output_field = 'value_description'  # does not change
    for index, row in new_df.iterrows():
        for col in range(len(new_df.columns)-offset):
            field1_value = new_df.columns[col+offset]
            field2_value = row[col+offset]
            value_description = get_desc(field_df, input_field1, field1_value, input_field2, field2_value, output_field)
            row[col+offset] = value_description
    return new_df

def Mbox(title, text, style):
    return ctypes.windll.user32.MessageBoxW(0, text, title, style)

def build_scatter_plot(df, colx, coly, title, xlabel, ylabel, file):
    # Build a scatter plot
    plt.scatter(df[colx], df[coly], marker="o")
    # Incorporate the other graph properties
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.grid(True)
    # Save the figure
    plt.savefig(file)
    # Show plot
    plt.show()
    
def build_linear_regression(df, colx, coly, title, xlabel, ylabel, file, le_x, le_y, r_x, r_y):
    # Perform a linear regression on coly vs. colx
    x_values = df[colx]
    y_values = df[coly]
    # Calculate the line equation using linear regression function
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
    # Plot original data using Scatter type
    plt.scatter(x_values,y_values, label='original data')
    # Plot fitted line using Line type
    plt.plot(x_values,regress_values,"r-", label='fitted line')
    # Add line equation to plot
    plt.annotate(line_eq,(le_x,le_y),fontsize=15,color="red")
    # Set title, xlabel, ylabel and legend
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    # Show r-squared value
    r2_latex = r'$r^2 = $'
    r2_value = f'{r2_latex}{round(rvalue**2,4)}'
    plt.annotate(r2_value,(r_x,r_y),fontsize=15,color="green")
    print(f"The r-value is: {rvalue}")
    # Save the figure
    plt.savefig(file)
    # Show plot
    plt.show()

In [3]:
# General Config information

base_url = "https://api.bjs.ojp.gov"
format_type = "json"
personal_field = 'personalData'
household_field = 'householdData'
year = '2019'

## List of available NCVS DATASETS and their data types.

In [4]:
# ENDPOINT
endpoint = "/bjs/ncvs/v2"

# Build query URL
query_url = f"{base_url}{endpoint}" # XML only

# Call API and get response
response = requests.get(query_url)

# General DataFrame
datasets_ord_dict = xmltodict.parse(response.content)
general_dict = {
    "title": datasets_ord_dict['bjsDataSet']['title'],
    "basePath": datasets_ord_dict['bjsDataSet']['basePath'],
    "description": datasets_ord_dict['bjsDataSet']['description']
}
general_df = pd.DataFrame(general_dict, index=[0])

# Resources DataFrame
datasets_idx = 0
datasets_df = pd.DataFrame()
dataset_formats_idx = 0
dataset_formats_df = pd.DataFrame()
for idx in range(len(datasets_ord_dict['bjsDataSet']['resources'])):
    current_dict = datasets_ord_dict['bjsDataSet']['resources'][idx]
    datasets_dict = {
        "title": current_dict["title"],
        "description": current_dict["description"]
    }
    datasets_df,datasets__idx = append_dict_to_df(datasets_dict, datasets_idx, datasets_df)
    for key in current_dict:
        formats_dict = current_dict["formats"]
        for cur_link in range(len(formats_dict['links'])):
            dataset_formats_dict = {
                "format": formats_dict['format'],
                "link": formats_dict['links'][cur_link]['link']
            }
            dataset_formats_df,dataset_formats_idx = append_dict_to_df(dataset_formats_dict, 
                                                                       dataset_formats_idx, 
                                                                       dataset_formats_df)
general_df

Unnamed: 0,title,basePath,description
0,National Crime Victimization Survey (NCVS) API,/developer/ncvs/index.cfm,The National Crime Victimization Survey (NCVS)...


In [5]:
datasets_df

Unnamed: 0,title,description
0,Personal Victimization,Personal victimization includes all violent vi...
0,Household Victimization,Household victimization includes all property ...


In [6]:
dataset_formats_df.head()

Unnamed: 0,format,link
0,CSV,/developer/ncvs/data/csv/NCVS_PERSONAL_1993-20...
1,CSV,/developer/ncvs/data/csv/NCVS_PERSONAL_2010-20...
2,CSV,/developer/ncvs/data/csv/NCVS_PERSONAL_2015-20...
3,CSV,/developer/ncvs/data/csv/NCVS_PERSONAL_1993-20...
4,CSV,/developer/ncvs/data/csv/NCVS_PERSONAL_2010-20...


# PERSONAL victimization datasets

## Description of the FIELDS or columns used in the PERSONAL victimization datasets

In [7]:
# ENDPOINT
endpoint = "/bjs/ncvs/v2/personal/fields/"

bjsPersonalDataSetFields_df = get_fields(endpoint)
bjsPersonalDataSetFields_df

Unnamed: 0,field_id,field_name,field_description,value_name,value_description
0,ager,Age,The respondent's age on the last day of the mo...,1,12 to 14
1,ager,Age,The respondent's age on the last day of the mo...,2,15 to 17
2,ager,Age,The respondent's age on the last day of the mo...,3,18 to 20
3,ager,Age,The respondent's age on the last day of the mo...,4,21 to 24
4,ager,Age,The respondent's age on the last day of the mo...,5,25 to 34
...,...,...,...,...,...
84,weapcat,Weapon category,Types of weapons present during the victimiza...,4,Type weapon unknown
85,weapcat,Weapon category,Types of weapons present during the victimiza...,5,Do not know if offender had weapon
86,weight,Weight,Weight definition,Population,This weight is attached to the person populati...
87,weight,Weight,Weight definition,Victimization,The weight used to calculate an estimate of vi...


##  PERSONAL victimization COUNTS of incidents reported to the NCVS by year 2019

In [8]:
# ENDPOINT
endpoint = f"/bjs/ncvs/v2/personal/{year}"

personal_counts_df = get_incidents(endpoint, personal_field, format_type)
personal_counts_df = translate_df(personal_counts_df, bjsPersonalDataSetFields_df, 2)
personal_counts_df

Unnamed: 0,weight,year,ager,direl,ethnic1R,gender,hincome,hispanic,injury,locationr,...,newoff,notify,popsize,race1R,region,seriousviolent,treatment,vicservices,weapcat,weapon
0,3313.0195,2019,18 to 20,Stranger,Non-Hispanic white,Female,Unknown,Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Simple assault,"No, did not report to the police","Under 100,000",White,Midwest,Simple assault,Not injured,No services received from victim service agencies,Do not know if offender had weapon,Do not know if offender had weapon
1,3313.0195,2019,18 to 20,Stranger,Non-Hispanic white,Female,Unknown,Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Simple assault,"No, did not report to the police","Under 100,000",White,Midwest,Simple assault,Not injured,No services received from victim service agencies,No weapon,"No, offender did not have weapon"
2,3313.0195,2019,18 to 20,Stranger,Non-Hispanic white,Female,Unknown,Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Simple assault,"No, did not report to the police","Under 100,000",White,Midwest,Simple assault,Not injured,No services received from victim service agencies,Do not know if offender had weapon,Do not know if offender had weapon
3,1221.0648,2019,50 to 64,Stranger,Non-Hispanic white,Female,Unknown,Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Simple assault,"Yes, reported to the police",1 million or more,White,West,Simple assault,Not injured,No services received from victim service agencies,No weapon,"No, offender did not have weapon"
4,1221.0648,2019,50 to 64,Stranger,Non-Hispanic white,Female,Unknown,Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Rape/sexual assault,"Yes, reported to the police",1 million or more,White,West,Violent crime excluding simple assault,Not injured,No services received from victim service agencies,No weapon,"No, offender did not have weapon"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2237,1361.5831,2019,50 to 64,Stranger,Non-Hispanic white,Female,"Less than $7,500",Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Rape/sexual assault,"No, did not report to the police","Under 100,000",White,South,Violent crime excluding simple assault,Not injured,No services received from victim service agencies,No weapon,"No, offender did not have weapon"
2238,1361.5831,2019,50 to 64,Stranger,Non-Hispanic white,Female,"Less than $7,500",Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Rape/sexual assault,"No, did not report to the police","Under 100,000",White,South,Violent crime excluding simple assault,Not injured,No services received from victim service agencies,No weapon,"No, offender did not have weapon"
2239,2321.6145,2019,25 to 34,Stranger,Non-Hispanic white,Male,"$50,000 to $74,999",Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Aggravated assault,"Yes, reported to the police","Under 100,000",White,South,Violent crime excluding simple assault,Not injured,No services received from victim service agencies,Knife,"Yes, offender had weapon"
2240,2321.6145,2019,25 to 34,Stranger,Non-Hispanic white,Male,"$50,000 to $74,999",Non-Hispanic,Not injured,"Commercial place, parking lot, or other public...",...,Aggravated assault,"Yes, reported to the police","Under 100,000",White,South,Violent crime excluding simple assault,Not injured,No services received from victim service agencies,Firearm,"Yes, offender had weapon"


## PERSONAL victimization POPULATION of incidents reported to the NCVS by year 2019

In [9]:
# ENDPOINT
endpoint = f"/bjs/ncvs/v2/personal/population/{year}"

personal_population_df = get_incidents(endpoint, personal_field, format_type)
personal_population_df = translate_df(personal_population_df, bjsPersonalDataSetFields_df, 2)
personal_population_df

KeyboardInterrupt: 

# HOUSEHOLD victimization datasets

## Description of the FIELDS or columns used in the HOUSEHOLD victimization datasets

In [None]:
# ENDPOINT
endpoint = "/bjs/ncvs/v2/household/fields/"

bjsHouseholdDataSetFields_df = get_fields(endpoint)
bjsHouseholdDataSetFields_df

## HOUSEHOLD victimization COUNTS of incidents reported to the NCVS by year 2019

In [None]:
# ENDPOINT
endpoint = f"/bjs/ncvs/v2/household/{year}"

household_counts_df = get_incidents(endpoint, household_field, format_type)
household_counts_df = translate_df(household_counts_df, bjsHouseholdDataSetFields_df, 2)
household_counts_df

## HOUSEHOLD victimization POPULATION of incidents reported to the NCVS by year 2019

In [None]:
# ENDPOINT
endpoint = f"/bjs/ncvs/v2/household/population/{year}"

household_population_df = get_incidents(endpoint, household_field, format_type)
household_population_df = translate_df(household_population_df, bjsHouseholdDataSetFields_df, 2)

## Plotting the Data


### x_values vs. y_values Plot

In [None]:
# build_scatter_plot(df, colx, coly, title, xlabel, ylabel, file)

## Linear Regression

In [None]:
# build_linear_regression(df, colx, coly, title, xlabel, ylabel, file, le_x, le_y, r_x, r_y)