In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import calendar

In [90]:
# ==================================================
# Function to clean data 
# Objective: Split the text that comes with the '||' format to separate the categories for each column
# Parameters:
#     df = DataFrame to use
#     indx = Main column to merge with the original DataFrame
#     col = Name of the column to clean within the original DataFrame
#     txtRemove = Category we need to remove after cleansing process
# ==================================================

def splitColumnsDF(df, indx, col, txtRemove = ""):
    """Explode a pipe-delimited text column into one row per category.

    Cells of ``col`` look like ``"0::Male||1::Female"`` or ``"0:Male|1:Female"``.
    Each cell is split on its ``||`` / ``|`` separators, the text up to and
    including the last ``:`` of every fragment is discarded, and one output
    row is produced per fragment, paired with the ``indx`` value of the
    source row. A cell without any separator yields a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is only read, never modified.
    indx : str
        Name of the key column copied next to every extracted category
        (used later to merge back with the original frame).
    col : str
        Name of the delimited text column to split.
    txtRemove : str, optional
        Category to drop from the result. The default ``""`` drops nothing.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns ``[indx, col]``, one row per category.
        When ``txtRemove`` filters rows out, the surviving rows keep their
        positional index labels (the index is not reset).

    Raises
    ------
    KeyError
        If ``indx`` or ``col`` is not a column of ``df``.
    """
    records = []

    # Only the key and the text column are needed, so iterate over those two
    # instead of materialising a Series per row with iterrows().
    for key, text in zip(df[indx], df[col]):
        # NaN and any other non-text cell cannot be split. These rows were
        # previously skipped by a blanket ``except Exception``, which also hid
        # real failures (e.g. DataFrame.append no longer existing in
        # pandas >= 2.0 made the function return an empty frame).
        if not isinstance(text, str):
            continue

        # Normalise '||' to '|' so both separator layouts, with any number of
        # entries, are split the same way.
        parts = text.replace('||', '|').split('|')

        # Leading/trailing/doubled separators leave empty fragments; drop them.
        # A cell with no separator at all is kept as-is.
        if len(parts) > 1:
            parts = [part for part in parts if part]

        # Keep only what follows the last ':' (strips the '#::' / '#:' prefix);
        # a fragment without ':' is kept whole.
        for part in parts:
            records.append((key, part.rpartition(':')[2]))

    # Build the frame once from the collected records instead of appending
    # row by row (quadratic, and unsupported since pandas 2.0).
    cleanedDF = pd.DataFrame(records, columns=[indx, col])

    # Exclude the unwanted category, if one was requested
    if txtRemove != '':
        cleanedDF = cleanedDF.loc[cleanedDF[col] != txtRemove]

    return cleanedDF

In [12]:
# Load the gun-violence incidents file
# Replace the path below with this link to get the complete data
#urlGunViolence = "https://drive.google.com/file/d/1akVNMxE4RR0_ySwakTfVx-OCmZQbskSD/view?usp=sharing"
# Reduced sample used for testing purposes
urlGunViolence = "Resources/gun_violence_data_small.csv"

# Read the raw incidents
gunViolenceDf = pd.read_csv(urlGunViolence)

# Parse the incident date, then derive the calendar columns used by the plots
gunViolenceDf = (
    gunViolenceDf
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        monthday=lambda frame: frame['date'].dt.day,
        weekday=lambda frame: frame['date'].dt.weekday,
    )
)




In [103]:
# Start cleaning the data
#display(gunViolenceDf.columns)
#display(gunViolenceDf.head())

# Cleaning categories: one exploded frame per delimited column, with the
# "Unknown" category removed. The cell that displays the unique categories
# reads these names, so they must be defined here for the notebook to
# survive Restart Kernel -> Run All.
gunStolenDF = splitColumnsDF(gunViolenceDf, "incident_id", "gun_stolen", "Unknown")
gunTypeDF = splitColumnsDF(gunViolenceDf, "incident_id", "gun_type", "Unknown")
# NOTE(review): incident_characteristics was disabled in the original cell and
# is not displayed anywhere in view; left disabled -- confirm whether it is needed.
######characteristicsDF = splitColumnsDF(gunViolenceDf,"incident_id","incident_characteristics", "Unknown")
ageDF = splitColumnsDF(gunViolenceDf, "incident_id", "participant_age", "Unknown")
ageGroupDF = splitColumnsDF(gunViolenceDf, "incident_id", "participant_age_group", "Unknown")
genderDF = splitColumnsDF(gunViolenceDf, "incident_id", "participant_gender", "Unknown")
relationshipDF = splitColumnsDF(gunViolenceDf, "incident_id", "participant_relationship", "Unknown")
statusDF = splitColumnsDF(gunViolenceDf, "incident_id", "participant_status", "Unknown")
typeDF = splitColumnsDF(gunViolenceDf, "incident_id", "participant_type", "Unknown")

# Function to clean all the columns within the DataFrame




In [104]:
# Show the distinct categories found in each cleaned column, in the same
# order as before; display() renders every positional argument in turn.
display(*(
    cleanedFrame[columnName].unique()
    for cleanedFrame, columnName in (
        (gunStolenDF, "gun_stolen"),
        (gunTypeDF, "gun_type"),
        (ageDF, "participant_age"),
        (ageGroupDF, "participant_age_group"),
        (genderDF, "participant_gender"),
        (relationshipDF, "participant_relationship"),
        (statusDF, "participant_status"),
        (typeDF, "participant_type"),
    )
))



array(['Not-stolen', 'Stolen'], dtype=object)

array(['Handgun', '22 LR', '223 Rem [AR-15]', 'Shotgun', '9mm', '45 Auto',
       '12 gauge', '7.62 [AK-47]', '40 SW', '44 Mag', 'Other', '38 Spl',
       '380 Auto', '410 gauge', '32 Auto', '308 Win', 'Rifle', '357 Mag',
       '16 gauge', '30-30 Win', '25 Auto'], dtype=object)

array(['20', '25', '31', '33', '34', '29', '56', '18', '46', '14', '47',
       '23', '55', '51', '40', '9', '5', '2', '15', '19', '28', '78',
       '48', '24', '41', '22', '21', '39', '68', '35', '26', '69', '27',
       '37', '17', '50', '42', '16', '13', '57', '66', '67', '62', '64',
       '36', '43', '30', '32', '1', '3', '63', '10', '53', '60', '58',
       '72', '45', '49', '12', '44', '70', '0', '54', '6', '38', '8', '4',
       '7', '52', '79', '59', '11', '80', '73', '61', '85', '88', '71',
       '89', '90', '65', '74', '75', '77', '86', '84', '76', '81', '96',
       '83', '82', '87', '05', '03', '0.048611111', '08', '0.041666667',
       '0.059027778', '0.045833333', '0.046527778', '01', '07',
       '0.045138889'], dtype=object)

array(['Adult 18+', 'Teen 12-17', 'Child 0-11'], dtype=object)

array(['Male', 'Female'], dtype=object)

array(['Family', 'Drive by - Random victims', 'Aquaintance',
       'Gang vs Gang', 'Significant others - current or former',
       'Armed Robbery', 'Mass shooting - Random victims',
       'Mass shooting - Perp Knows Victims', 'Co-worker', 'Neighbor',
       'Friends', 'Home Invasion - Perp Does Not Know Victim',
       'Home Invasion - Perp Knows Victim'], dtype=object)

array(['Arrested', 'Injured', 'Killed', 'Injured, Unharmed, Arrested',
       'Unharmed, Arrested', 'Unharmed', 'Injured, Arrested',
       'Killed, Unharmed, Arrested', 'Injured, Unharmed',
       'Killed, Injured', 'Killed, Unharmed', 'Killed, Arrested'],
      dtype=object)

array(['Victim', 'Subject-Suspect'], dtype=object)