### 01. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

#### Import Data

In [None]:
# Root folder of the project, stored as a raw string so the backslashes are kept literally
path = r'C:\Users\marze\Gun Violence'

In [None]:
# Build the full path to the original gun-violence CSV, then load it into a dataframe
raw_csv = os.path.join(path, '02 Data', 'Original Data', 'gun-violence-data_01-2013_03-2018.csv')
df = pd.read_csv(raw_csv)

In [None]:
# Preview the first rows of the loaded dataframe
df.head()

In [None]:
# Dimensions of the loaded dataframe as (rows, columns)
df.shape

### 02. Consistency Checks

#### Missing Values

In [None]:
# Count the missing (NaN/None) entries in every column
missing_mask = df.isna()
miss_values = missing_mask.sum()

In [None]:
# Show the per-column missing-value counts computed in the previous cell
print(miss_values)

###### There are quite a lot of missing values. Further investigation is needed to understand why so many values are missing.

#### Duplicate rows

In [None]:
# Flag rows that exactly repeat an earlier row, then report how many there are
is_repeat = df.duplicated()
dup_rows = df.loc[is_repeat]
print("Number of duplicate rows:", dup_rows.shape[0])

#### Unique values

In [None]:
# Count the distinct non-null values in each column.
# (This cell previously repeated the duplicate-row check from the section above,
# so the "Unique values" section never looked at unique values.)
unique_counts = df.nunique()

# Exact-duplicate rows, kept under the original name in case later cells use it
dups = df[df.duplicated()]

# Print the per-column unique-value counts
print("Number of unique values per column:")
print(unique_counts)

#### Mixed Data Types

In [None]:
# Print the name of every column whose cells do not all share
# the Python type of that column's first cell

for col in df.columns.tolist():
    cell_types = df[col].map(type)
    differs_from_first = cell_types != cell_types.iloc[0]
    if differs_from_first.any():
        print (col)

#### Checking numeric column statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Draw one histogram per numeric column on a 12x10 figure to inspect the distributions
df.hist(figsize=(12, 10))
plt.show()

#### Non-numeric columns

In [None]:
# Tally how often each value of 'state' occurs (most frequent first) and display it
state_col = df['state']
state_counts = state_col.value_counts()
print(state_counts)

##### Illinois, California, Florida, Texas, and Ohio have the most gun violence incidents

In [None]:
# Tally how often each value of 'address' occurs (most frequent first) and display it
address_col = df['address']
address_counts = address_col.value_counts()
print(address_counts)

##### Some addresses/locations appear to have repeated incidents of gun violence. This should be investigated further.

In [None]:
# Tally how often each value of 'incident_characteristics' occurs and display it
characteristics_col = df['incident_characteristics']
incident_characteristics_counts = characteristics_col.value_counts()
print(incident_characteristics_counts)

In [None]:
# Scatter longitude (x) against latitude (y) to spot outlying or inconsistent coordinates
lon_values = df['longitude']
lat_values = df['latitude']
plt.scatter(x=lon_values, y=lat_values)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

In [None]:
# Anomaly check: keep rows where either victim count is below zero and report how many exist
has_negative_count = df[['n_killed', 'n_injured']].lt(0).any(axis=1)
negative_victims = df[has_negative_count]
print("Rows with negative victim counts:", len(negative_victims))

In [None]:
# Select rows where 'n_killed' exceeds 'n_injured' and report how many there are
killed_exceeds_injured = df['n_killed'].gt(df['n_injured'])
inconsistencies = df.loc[killed_exceeds_injured]
print("Rows with inconsistencies:", inconsistencies.shape[0])

##### This check shows how many rows have more people killed (n_killed) than injured (n_injured). Usually, the number of people injured would be greater than the number killed, so 45,887 rows with n_killed > n_injured is quite a large number. It will definitely need to be investigated.

In [None]:
# Summary statistics restricted to the two victim-count columns
victim_stats = df[['n_killed', 'n_injured']].describe()
print(victim_stats)

In [None]:
# Look at the first few rows where more people were killed than injured
fewer_injured_than_killed = df['n_injured'] < df['n_killed']
inconsistent_rows = df.loc[fewer_injured_than_killed]
print(inconsistent_rows.head())

##### The inconsistent rows do not seem to include n_killed in the n_injured total. It seems that this dataframe keeps n_killed and n_injured as separate numbers and does not count killed persons as injured.

### 03. Cleaning Data

In [None]:
# Preview the dataframe before any cleaning is applied
df.head()

In [None]:
# Remove the columns listed below from df (the dataframe is modified in place)
unneeded_columns = [
    'source_url',
    'incident_url',
    'incident_url_fields_missing',
    'gun_stolen',
    'participant_name',
    'sources',
    'state_house_district',
    'state_senate_district',
]
df.drop(columns=unneeded_columns, inplace=True)

In [None]:
# Preview the dataframe after the column drop
df.head()

In [None]:
# List the remaining column names as an array
df.columns.values

In [None]:
# Convert the 'date' column to the datetime64[ns] dtype
date_col = df['date']
df['date'] = date_col.astype('datetime64[ns]')

In [None]:
# Store 'incident_id' as text rather than its loaded dtype
id_col = df['incident_id']
df['incident_id'] = id_col.astype('str')

In [None]:
# Columns to be stored with pandas' 'category' dtype
columns_to_category = [
    'state',
    'city_or_county',
    'gun_type',
    'incident_characteristics',
    'location_description',
    'notes',
    'participant_age_group',
    'participant_gender',
    'participant_relationship',
    'participant_status',
    'participant_type',
]

# Convert each listed column in turn
for col in columns_to_category:
    df[col] = df[col].astype('category')

In [None]:
# Print the column dtypes and non-null counts to confirm the conversions above
df.info()

### 04. Exporting Dataframe

In [None]:
# Write the cleaned dataframe to the prepared-data folder as CSV.
# `index` is left at its default, so the row index is written as the first column.
export_path = os.path.join(path, '02 Data', 'Prepared Data', '6.1_gun_violence_cleaned.csv')
df.to_csv(export_path)