In [2]:
import numpy as np
import pandas as pd
import itertools
from zipfile import ZipFile
from sklearn.model_selection import train_test_split

In [9]:
# Unpack the archive, then load the extracted CSV into a DataFrame.
with ZipFile('Murder_Data.zip') as archive:
    # No target path is given, so every member is written to the current working directory.
    archive.extractall()

df = pd.read_csv("SHR76_20.csv")
df.info()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 827219 entries, 0 to 827218
Data columns (total 31 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ID            827219 non-null  object 
 1   CNTYFIPS      827219 non-null  object 
 2   Ori           827219 non-null  object 
 3   State         827219 non-null  object 
 4   Agency        827219 non-null  object 
 5   Agentype      827219 non-null  object 
 6   Source        827219 non-null  object 
 7   Solved        827219 non-null  object 
 8   Year          827219 non-null  int64  
 9   StateName     22468 non-null   object 
 10  Month         827219 non-null  object 
 11  Incident      827219 non-null  int64  
 12  ActionType    827219 non-null  object 
 13  Homicide      827219 non-null  object 
 14  Situation     827219 non-null  object 
 15  VicAge        827219 non-null  int64  
 16  VicSex        827219 non-null  object 
 17  VicRace       827219 non-null  object 
 18  VicE

## Cleaning Data

In [71]:
def clean_dataframe(df):
    """Run the complete cleaning pipeline and return the cleaned frame.

    All work starts from a deep copy of ``df``, so the caller's frame is not
    modified. Only the rows returned as "solved" by ``split_solved`` are
    processed; the unsolved half of the split is discarded.
    """
    solved_cases, _unsolved_cases = split_solved(df.copy(deep=True))

    dropped_columns = ['Situation', 'Incident', 'Ori', 'StateName']
    demographic_columns = ['OffSex', 'OffRace', 'VicSex', 'VicRace']

    return (
        solved_cases
        .pipe(remove_col, dropped_columns)
        .pipe(split_filedate)
        .pipe(split_county_area)
        .pipe(fill_unknown)
        # Rows whose offender/victim sex or race is 'Unknown' are removed entirely.
        .pipe(delete_val, demographic_columns, ['Unknown'] * len(demographic_columns))
        .pipe(clean_unk)
        .pipe(del_agentype, 'Agentype')
    )

# Split the dataframe into a solved and unsolved dataset
def split_solved(df):
    """Partition ``df`` by its ``Solved`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Solved`` column holding the strings "Yes" / "No".

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(solved, unsolved)``: the rows where ``Solved == "Yes"`` and the rows
        where ``Solved == "No"``, each keeping the original row order, index
        labels and columns. Rows with any other value (including NaN) appear in
        neither frame.
    """
    # Boolean masks instead of groupby(...).get_group(...): get_group raises
    # KeyError when a class is absent, whereas a mask simply yields an empty
    # frame with the same columns.
    solved_mask = df['Solved'] == "Yes"
    unsolved_mask = df['Solved'] == "No"
    return df[solved_mask], df[unsolved_mask]

# Fill in missing data (NaN) with Unknown value
def fill_unknown(df):
    """Return a new frame in which every missing value (NaN) is replaced by "Unknown"."""
    placeholder = "Unknown"
    return df.fillna(value=placeholder)

# Remove column(s)
def remove_col(df, cols):
    """Return ``df`` without the column(s) named in ``cols``; the input frame is not modified."""
    return df.drop(columns=cols)

# Delete rows with specific column value
def delete_val(df, cols, values):
    """Remove every row in which ``cols[i]`` equals ``values[i]`` for some ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    cols : sequence of str
        Column names to inspect.
    values : sequence
        Value to reject for the column at the same position in ``cols``.
        Surplus values beyond ``len(cols)`` are ignored.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.

    Raises
    ------
    IndexError
        If fewer values than columns are supplied.
    """
    if len(values) < len(cols):
        raise IndexError(
            "delete_val needs one value per column: got %d column(s) but only %d value(s)"
            % (len(cols), len(values))
        )

    for col, unwanted in zip(cols, values):
        # Filter by position with a boolean mask. Dropping by index label would
        # also remove non-matching rows that share a duplicated label.
        # Missing entries never count as a match, so they are kept.
        is_match = (df[col] == unwanted).fillna(False).astype(bool)
        df = df[~is_match]
    return df

# Everything that is unknown/undetermined/not specified/not reported/not determined, change to Unknown
def clean_unk(df):
    """Normalise the various "no information" markers in ``df`` to 'Unknown'.

    * ``OffAge`` / ``VicAge`` entries equal to 999 become 'Unknown'.
    * In every text column, a value that contains one of the phrases
      "unknown", "undetermined", "not specified", "not reported",
      "not determined" or "Not enough information to determine"
      (case-sensitive), or that is exactly ``"na"`` or the empty string,
      becomes 'Unknown'.

    The frame is modified in place and also returned.

    Raises
    ------
    KeyError
        If ``OffAge`` or ``VicAge`` is missing from ``df``.
    """
    # "na" is anchored so that it only matches a value that is exactly "na".
    # An unanchored "na" would also hit any text that merely contains those two
    # letters (e.g. "Louisiana", "Personal weapons") and erase valid data.
    unknown_pattern = (
        "unknown|undetermined|not specified|not reported|not determined"
        "|^na$|Not enough information to determine|^$"
    )

    for age_col in ('OffAge', 'VicAge'):
        is_sentinel = df[age_col] == 999
        if is_sentinel.any():
            # Writing a string into a numeric column is deprecated in recent
            # pandas, so widen the column to object explicitly first.
            df[age_col] = df[age_col].astype(object)
            df.loc[is_sentinel, age_col] = 'Unknown'

    for col in df.columns:
        try:
            # na=False: non-string / missing entries count as "no match" rather
            # than producing a NaN mask that cannot be used for indexing.
            matches = df[col].str.contains(unknown_pattern, regex=True, na=False)
        except AttributeError:
            # The column has no .str accessor (it holds no text), so skip it.
            continue
        df.loc[matches, col] = 'Unknown'
    return df

# Delete Agency Type which is equal to 4 (unclear)
def del_agentype(df, col):
    """Return ``df`` without the rows whose ``col`` value is the string '4'.

    The comparison is against the text ``'4'``; a numeric 4 is not matched.
    The input frame is not modified.
    """
    # Filter with a boolean mask instead of dropping by index label: with a
    # non-unique index, label-based drop would also remove unrelated rows that
    # share a label with a matching row.
    is_unclear = (df[col] == '4').fillna(False).astype(bool)
    return df[~is_unclear]

# Split the FileDate column into three columns
def split_filedate(df):
    """Replace ``FileDate`` with string columns ``FileMonth``, ``FileDay``, ``FileYear``.

    The date is read as a run of digits whose last two characters are the year,
    the two before that the day, and whatever precedes them the month (so a
    month may be one or two characters, e.g. 30180 -> '3', '01', '80').
    Values may be floats (30180.0), integers (30180) or digit strings; the
    trailing '.0' that appears when a float is rendered as text is ignored.
    Missing dates yield 'Unknown' in all three columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``FileDate`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``FileDate`` and with ``FileMonth``, ``FileDay``,
        ``FileYear`` (in that order) as its first three columns.
    """
    raw = df['FileDate']
    missing = raw.isna()

    # str() of a float date looks like '30180.0'; strip that suffix so float,
    # integer and string inputs all slice the same way.
    digits = raw.astype(str).str.replace(r'\.0$', '', regex=True)
    parts = {
        'FileYear': digits.str[-2:],
        'FileDay': digits.str[-4:-2],
        'FileMonth': digits.str[:-4],
    }

    # Build the result on a fresh frame so the caller's frame keeps its
    # original columns and dtypes.
    out = df.drop(columns=['FileDate'])
    # Each insert goes to position 0, so the last one inserted ends up first.
    for name in ('FileYear', 'FileDay', 'FileMonth'):
        column = parts[name].mask(missing, 'Unknown').astype(str)
        out.insert(0, name, column, True)
    return out

# Split CNTYFIPS and MSA columns into County and Area
def split_county_area(df):
    """Replace ``CNTYFIPS`` and ``MSA`` with ``County`` and ``Area`` columns.

    ``County`` is the text of ``CNTYFIPS`` before its first comma and ``Area``
    is the text of ``MSA`` before its first comma. Values are converted with
    ``str()`` first, so a missing value becomes the string 'nan'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``CNTYFIPS`` and ``MSA`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``CNTYFIPS`` / ``MSA`` and with ``Area`` and
        ``County`` (in that order) as its first two columns.
    """
    def before_comma(value):
        return str(value).split(',')[0]

    county = df['CNTYFIPS'].map(before_comma)
    area = df['MSA'].map(before_comma)

    # Insert into the frame returned by drop() rather than into ``df`` itself,
    # so the caller's frame does not silently gain the new columns.
    out = df.drop(columns=['CNTYFIPS', 'MSA'])
    out.insert(0, 'County', county, True)
    out.insert(0, 'Area', area, True)
    return out

# Split dataframe into small train and test set, stratified on cols
def split_stratify(df, cols, train_frac, test_frac):
    """Draw disjoint, stratified train and test samples from ``df``.

    The frame is partitioned into bins, one per observed combination of values
    in ``cols`` (any number of columns). Within each bin the rows are shuffled
    with a fixed seed, the first ``round(train_frac * n)`` rows go to the train
    set and the following ``round(test_frac * n)`` rows go to the test set.
    No row is drawn twice and no row appears in both sets. The result is
    deterministic for a given input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from; it is not modified.
    cols : sequence of str
        Columns whose value combinations define the strata.
    train_frac, test_frac : float
        Fraction of each stratum to place in the train / test set. Both must be
        non-negative and sum to at most 1.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a fraction is negative or the fractions sum to more than 1, since a
        disjoint split is then impossible.
    """
    if train_frac < 0 or test_frac < 0 or train_frac + test_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and test_frac must be non-negative and sum to at most 1"
        )

    train_parts, test_parts = [], []
    # groupby visits only the combinations that actually occur, in sorted key
    # order, and works for any number of stratification columns.
    for _, stratum in df.groupby(list(cols), sort=True):
        # One seeded shuffle per stratum. Slicing it into consecutive segments
        # guarantees the two samples cannot overlap, which two independent
        # draws with the same seed would not.
        shuffled = stratum.sample(frac=1, random_state=1)
        n_rows = len(shuffled)
        n_train = round(train_frac * n_rows)
        # Guard against rounding pushing the two segments past the stratum size.
        n_test = min(round(test_frac * n_rows), n_rows - n_train)
        train_parts.append(shuffled.iloc[:n_train])
        test_parts.append(shuffled.iloc[n_train:n_train + n_test])

    if not train_parts:
        # Empty input: pd.concat([]) would raise, so return empty frames instead.
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    # Return training df and test df
    return pd.concat(train_parts, ignore_index=True), pd.concat(test_parts, ignore_index=True)

In [72]:
# Run the cleaning pipeline on the loaded frame. clean_dataframe starts from a
# deep copy, so `df` still holds the raw data afterwards.
cleaned = clean_dataframe(df)