In [2]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from pathlib import Path

# File to Load (Remember to Change These)
# Path is relative to the notebook's working directory.
dog_data = Path("../Resources/DOHMH_Dog_Bite_Data.csv")

# Read the dog bite data file and store it in a Pandas DataFrame
dog_df = pd.read_csv(dog_data)

# Preview the first five rows to sanity-check the load
dog_df.head()


Unnamed: 0,UniqueID,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,1,January 01 2018,DOG,UNKNOWN,,U,False,Brooklyn,11220.0
1,2,January 04 2018,DOG,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0
3,4,January 08 2018,DOG,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,5,January 09 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0


In [2]:
# Show the column information to review data types and decide whether any data type conversion is needed; also review null counts for possible data issues.
# Age has nulls, which will need to be dropped for the age analysis only; Age will also need to be converted to a decimal value for analysis.
# Gender has no nulls and will be used as it is in the dataset.
dog_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22663 entries, 0 to 22662
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UniqueID    22663 non-null  int64 
 1   DateOfBite  22663 non-null  object
 2   Species     22663 non-null  object
 3   Breed       20445 non-null  object
 4   Age         11442 non-null  object
 5   Gender      22663 non-null  object
 6   SpayNeuter  22663 non-null  bool  
 7   Borough     22663 non-null  object
 8   ZipCode     16805 non-null  object
dtypes: bool(1), int64(1), object(7)
memory usage: 1.4+ MB


In [3]:
# Review the number of rows and columns to confirm the dataset is large enough for proper analysis.
# This appears to be a good dataset, despite the data inconsistencies found in various columns when reviewing the data.
print(dog_df.shape)

(22663, 9)


In [4]:
# Summary statistics for every column; include="all" also reports count/unique/top/freq for the non-numeric columns.
dog_df.describe(include="all")

Unnamed: 0,UniqueID,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
count,22663.0,22663,22663,20445,11442.0,22663,22663,22663,16805.0
unique,,2555,1,1651,228.0,3,2,6,519.0
top,,September 16 2017,DOG,Pit Bull,2.0,U,False,Queens,10029.0
freq,,25,22663,4004,1624.0,10535,16787,5773,369.0
mean,5715.036668,,,,,,,,
std,3354.278369,,,,,,,,
min,1.0,,,,,,,,
25%,2833.5,,,,,,,,
50%,5666.0,,,,,,,,
75%,8499.0,,,,,,,,


In [5]:
# These columns are used unchanged in the age analysis, so bind each one
# to its own variable with an explicit column lookup.
(
    uniqueid,
    dateofbite,
    species,
    breed,
    gender,
    spayneuter,
    borough,
    zipcode,
) = (
    dog_df[column_name]
    for column_name in (
        "UniqueID",
        "DateOfBite",
        "Species",
        "Breed",
        "Gender",
        "SpayNeuter",
        "Borough",
        "ZipCode",
    )
)

In [6]:
# age, clean and convert
# Used XPERT to get ideas on how to tackle age.  There were several types of data scrubbing needed for age with this dataset.
def convert_age_to_decimal(age_str):
    """Convert a free-text age description to an age in years.

    Recognised forms:

    * ``'<years> & <months>'`` shorthand with bare integers on both sides,
      e.g. ``'3 & 4'`` -> 3 + 4/12.
    * A number followed (optionally after whitespace) by a unit that starts
      with ``Y`` (years), ``M`` (months, divided by 12) or ``W`` (weeks,
      divided by 52), e.g. ``'3YRS'``, ``'6 MTH'``, ``'10 WK'``.  Matching is
      case-insensitive.  When several units appear in one string
      (``'2 YRS 6 MTHS'``) their contributions are added together.

    Parameters
    ----------
    age_str : str
        Raw age text.

    Returns
    -------
    int or float
        Age in years.  A value given only in whole years is returned as an
        ``int``; anything involving months or weeks is a ``float``.

    Raises
    ------
    TypeError
        If ``age_str`` is not a string (for example a NaN from a null cell).
    ValueError
        If no supported pattern is found, or if the string contains a
        ``<digits> &`` shorthand whose two sides are not both bare integers.
    """
    # Import re to assist with age conversion
    import re

    if not isinstance(age_str, str):
        raise TypeError(
            f"age must be a string, got {type(age_str).__name__}"
        )

    # Combined year-and-month data written with an & sign.  When present it
    # takes precedence over any unit-based parsing.
    if re.search(r'(\d+)\s*(&)', age_str):
        parts = re.split(r'\s*&\s*', age_str)
        try:
            return int(parts[0]) + int(parts[1]) / 12
        except ValueError:
            raise ValueError(
                f"cannot parse combined age {age_str!r}: "
                "expected '<years> & <months>' with whole numbers"
            ) from None

    # (pattern, how many of this unit make one year)
    unit_patterns = (
        (r'(\d+)\s*(YRS|Y|YR|YEAR)', 1),
        (r'(\d+)\s*(MTHS|M|MTH|MONTH)', 12),
        (r'(\d+)\s*(WKS|W|WEEK|WEEKS)', 52),
    )

    decimal_age = 0
    matched = False  # tracked separately because a legitimate age can be 0
    for pattern, units_per_year in unit_patterns:
        unit_match = re.search(pattern, age_str, flags=re.IGNORECASE)
        if unit_match is None:
            continue
        matched = True
        count = int(unit_match.group(1))
        # Add every component that is present instead of letting the last
        # match overwrite the others.  Whole years are added undivided so a
        # years-only age stays an int.
        decimal_age += count if units_per_year == 1 else count / units_per_year

    if not matched:
        raise ValueError(f"unrecognised age format: {age_str!r}")

    return decimal_age

In [8]:
# Test the function with sample age values
# Samples the function cannot parse make it raise; those are reported as 0.
age_values = ['3 & 4','5&6','7 &8','8& 9','2018-01-01T00:00:00.000','1Y','2 YR','3YRS','4 YEARS','6 YRS','5M','6 MTH','7MTHS','8 MONTHS','9W','10 WK','11 WKS','12 WEEKS','2019-01-01T00:00:00.000']
for age_value in age_values:
    try:
        decimal_age = convert_age_to_decimal(age_value)
    except Exception:
        # Catch Exception rather than using a bare except, so a kernel
        # interrupt (KeyboardInterrupt) or SystemExit is not swallowed.
        decimal_age = 0
    print(f"{age_value}: {decimal_age}")

3 & 4: 3.3333333333333335
5&6: 5.5
7 &8: 7.666666666666667
8& 9: 8.75
2018-01-01T00:00:00.000: 0
1Y: 1
2 YR: 2
3YRS: 3
4 YEARS: 4
6 YRS: 6
5M: 0.4166666666666667
6 MTH: 0.5
7MTHS: 0.5833333333333334
8 MONTHS: 0.6666666666666666
9W: 0.17307692307692307
10 WK: 0.19230769230769232
11 WKS: 0.21153846153846154
12 WEEKS: 0.23076923076923078
2019-01-01T00:00:00.000: 0
