### Script used to clean and sort data


In [1]:
# Use this cell to set up import statements for all of the packages 
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# 'magic word' so that your visualizations are plotted
%matplotlib inline

### 01- Data Wrangling for NICS Firearm Background Checks

In [2]:
# Read the raw NICS firearm background-check export into a DataFrame
nics_df = pd.read_csv('nics-firearm-background-checks.csv')

# Show the (rows, columns) shape, then preview the first records
print('- nics firearm background checks size : ', nics_df.shape)
nics_df.head()

- nics firearm background checks size :  (14795, 27)


Unnamed: 0,month,state,permit,permit_recheck,handgun,long_gun,other,multiple,admin,prepawn_handgun,...,returned_other,rentals_handgun,rentals_long_gun,private_sale_handgun,private_sale_long_gun,private_sale_other,return_to_seller_handgun,return_to_seller_long_gun,return_to_seller_other,totals
0,2021-03,Alabama,42992.0,690.0,34675.0,20702.0,2048.0,1599,0.0,47.0,...,0.0,0.0,0.0,35.0,27.0,9.0,1.0,1.0,1.0,107474
1,2021-03,Alaska,322.0,1.0,4375.0,4186.0,590.0,316,0.0,0.0,...,0.0,0.0,0.0,8.0,3.0,3.0,0.0,1.0,0.0,10170
2,2021-03,Arizona,10118.0,699.0,32738.0,15268.0,2497.0,1746,0.0,20.0,...,4.0,0.0,0.0,27.0,17.0,4.0,0.0,0.0,0.0,65805
3,2021-03,Arkansas,3712.0,1173.0,15039.0,9815.0,723.0,744,12.0,29.0,...,0.0,0.0,0.0,16.0,21.0,1.0,0.0,0.0,0.0,34134
4,2021-03,California,17904.0,12669.0,50271.0,35334.0,7327.0,0,0.0,3.0,...,121.0,0.0,0.0,10623.0,4159.0,829.0,44.0,28.0,1.0,143742


In [3]:
# Check which columns have missing values: info() lists each column's
# non-null count and dtype
nics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14795 entries, 0 to 14794
Data columns (total 27 columns):
month                        14795 non-null object
state                        14795 non-null object
permit                       14771 non-null float64
permit_recheck               3410 non-null float64
handgun                      14775 non-null float64
long_gun                     14776 non-null float64
other                        7810 non-null float64
multiple                     14795 non-null int64
admin                        14772 non-null float64
prepawn_handgun              12852 non-null float64
prepawn_long_gun             12850 non-null float64
prepawn_other                7425 non-null float64
redemption_handgun           12855 non-null float64
redemption_long_gun          12854 non-null float64
redemption_other             7425 non-null float64
returned_handgun             4510 non-null float64
returned_long_gun            4455 non-null float64
returned_other   

In [4]:
# Report how many rows are exact duplicates of an earlier row
print('- Number of duplicated rows:', nics_df.duplicated().sum(), '\n')

# Tally the missing values in each column
print('- The number of missing data in each columns:')
nics_df.isna().sum()

- Number of duplicated rows: 0 

- The number of missing data in each columns:


month                            0
state                            0
permit                          24
permit_recheck               11385
handgun                         20
long_gun                        19
other                         6985
multiple                         0
admin                           23
prepawn_handgun               1943
prepawn_long_gun              1945
prepawn_other                 7370
redemption_handgun            1940
redemption_long_gun           1941
redemption_other              7370
returned_handgun             10285
returned_long_gun            10340
returned_other               10670
rentals_handgun              11495
rentals_long_gun             11660
private_sale_handgun          9735
private_sale_long_gun         9735
private_sale_other            9735
return_to_seller_handgun     10010
return_to_seller_long_gun     9735
return_to_seller_other       10230
totals                           0
dtype: int64

In [5]:
# Parse the 'month' strings into pandas datetime values
nics_df = nics_df.assign(month=lambda frame: pd.to_datetime(frame['month']))

#### Drop the years before 2010 and after 2019: not enough population data exists for them


In [6]:
# List the distinct calendar years present in the data
nics_df['month'].dt.year.unique()

array([2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011,
       2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000,
       1999, 1998])

In [7]:
# Keep only 2010-2019; the years outside that span are dropped because
# there is not enough population data for them.
FIRST_YEAR, LAST_YEAR = 2010, 2019
record_years = nics_df['month'].dt.year

# Build the out-of-scope list from the data instead of hard-coding it, so a
# refreshed download containing years not listed by hand (e.g. 2022) is
# still filtered correctly.
yr_out_scope = [yr for yr in record_years.unique()
                if not FIRST_YEAR <= yr <= LAST_YEAR]

# .copy() gives the later cleaning cells an independent frame to modify
# rather than a slice of the raw data.
nics_df = nics_df[~record_years.isin(yr_out_scope)].copy()

### Drop the District of Columbia and U.S. territories: not enough data
Non-state entries dropped: Guam, Puerto Rico, District of Columbia, Virgin Islands, and Mariana Islands

In [8]:
# Non-state entries (D.C. and the U.S. territories) to exclude
colonies = [
    'Puerto Rico',
    'Mariana Islands',
    'Virgin Islands',
    'Guam',
    'District of Columbia',
]
nics_df = nics_df.loc[~nics_df['state'].isin(colonies)]

# Number of distinct states left after the filter
nics_df['state'].nunique()

50

### Drop columns out of scope

In [9]:
# Columns that are not used in this analysis
cols_out_scope = [
    'admin',
    'prepawn_handgun', 'prepawn_long_gun', 'prepawn_other',
    'redemption_handgun', 'redemption_long_gun', 'redemption_other',
    'returned_handgun', 'returned_long_gun', 'returned_other',
    'rentals_handgun', 'rentals_long_gun',
    'private_sale_handgun', 'private_sale_long_gun', 'private_sale_other',
    'return_to_seller_handgun', 'return_to_seller_long_gun',
    'return_to_seller_other',
    'permit_recheck',
]

# Rebind to the trimmed frame instead of dropping with inplace=True:
# nics_df comes from boolean filtering, and mutating such a frame in place
# is what pandas flags with SettingWithCopyWarning.
nics_df = nics_df.drop(columns=cols_out_scope)

In [10]:
# Tally the missing values remaining in each column
nics_df.isna().sum()

month       0
state       0
permit      0
handgun     0
long_gun    0
other       0
multiple    0
totals      0
dtype: int64

In [11]:
# Fill any missing values with zero, because a missing count apparently
# means zero checks were recorded.
# Rebinding (rather than inplace=True) avoids mutating a frame that was
# produced by filtering.
nics_df = nics_df.fillna(0)

In [12]:
# Convert the count columns (everything after month and state) from float
# to int.
# DataFrame.astype returns a new frame with the requested dtypes. Assigning
# through .iloc[:, i] instead writes into the existing float column on
# pandas >= 2.0, so the dtype would stay float and the cast would be lost.
count_cols = nics_df.columns[2:]
nics_df = nics_df.astype({col: int for col in count_cols})

In [13]:
# Convert state names to str and strip surrounding whitespace, so that
# otherwise-identical names compare equal
nics_df['state'] = nics_df['state'].astype(str).str.strip()

# Number of distinct state names after normalisation
nics_df['state'].nunique()

50

In [14]:
# Confirm the corrections with info(): it lists each remaining column's
# non-null count and dtype
nics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6000 entries, 825 to 7424
Data columns (total 8 columns):
month       6000 non-null datetime64[ns]
state       6000 non-null object
permit      6000 non-null int64
handgun     6000 non-null int64
long_gun    6000 non-null int64
other       6000 non-null int64
multiple    6000 non-null int64
totals      6000 non-null int64
dtypes: datetime64[ns](1), int64(6), object(1)
memory usage: 421.9+ KB


In [15]:
# Preview the first rows of the cleaned frame
nics_df.head()

Unnamed: 0,month,state,permit,handgun,long_gun,other,multiple,totals
825,2019-12-01,Alabama,33683,33020,25882,1560,1382,100100
826,2019-12-01,Alaska,386,3455,3080,397,215,7840
827,2019-12-01,Arizona,4536,18178,10423,1560,1109,38171
828,2019-12-01,Arkansas,2339,8671,10957,531,458,25699
829,2019-12-01,California,29544,40637,30780,4883,0,106600


In [16]:
# Write the cleaned frame to nics.csv; index=False leaves the row index out of the file
nics_df.to_csv('nics.csv', index = False)