In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

print(f'numpy version: {np.__version__}')
print(f'pandas version: {pd.__version__}')

numpy version: 1.17.0
pandas version: 1.1.3


# Initial data knowledge

The dataset is already provided by the business stakeholder as a zip archive. There are:
* 4 CSV files
* 1 Excel file
* 1 txt file

The list of data files follows:

* area.csv
* crime.csv
* education.csv
* income.xlsx
* life_expectancy.csv
* region.txt

The files are described in a document, "LE - File Description.docx". It contains some metadata on each of the above files.

## Values for sanity checking

In [2]:
# Reference name sets used by the sanity-check assertions on each dataset's index.
states = {
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
}
federal_district = {'District of Columbia'}
inhabited_territories = {
    'Puerto Rico',
    'American Samoa',
    'Guam',
    'Northern Mariana Islands',
    'U.S. Virgin Islands',
}

# Load and clean data

In [3]:
# Directory holding the raw input files (relative to the notebook location).
data_path = '../data'

# One path per raw input file, all resolved against data_path.
(
    area_path,
    crime_path,
    education_path,
    life_expectancy_path,
    income_path,
    region_path,
) = (
    os.path.join(data_path, file_name)
    for file_name in (
        'area.csv',
        'crime.csv',
        'education.csv',
        'life_expectancy.csv',
        'income.xlsx',
        'region.txt',
    )
)

## Process Education file

In [4]:
# Skip the first row of the file and leave out every column whose header contains 'rank'.
education = pd.read_csv(
    education_path,
    sep=';',
    skiprows=1,
    usecols=lambda column: 'rank' not in column,
)
print(f'columns: {education.columns}')
# remove \n from column names
education = education.rename(columns=lambda name: name.replace('\n', ' '))
print(f'columns: {education.columns}')

columns: Index(['State,\nfederal district,\nor territory',
       '% High school graduate\nor higher', '% Bachelor's degree\nor higher',
       '% Advanced degree'],
      dtype='object')
columns: Index(['State, federal district, or territory',
       '% High school graduate or higher', '% Bachelor's degree or higher',
       '% Advanced degree'],
      dtype='object')


In [5]:
# inspect data types
education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 4 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   State, federal district, or territory  52 non-null     object
 1   % High school graduate or higher       52 non-null     object
 2   % Bachelor's degree or higher          52 non-null     object
 3   % Advanced degree                      52 non-null     object
dtypes: object(4)
memory usage: 1.8+ KB


In [6]:
# Convert the percentage columns from strings (e.g. '30.7%') into fractions (0.307).
# The dtype guard keeps the cell re-runnable: columns that are already float are skipped.
# pandas' dtype helper is used instead of `np.float`, an alias that was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (where it raises AttributeError).
for column in education.columns:
    if '%' in column and not pd.api.types.is_float_dtype(education[column]):
        # Strip surrounding spaces and the percent sign, parse, then scale to [0, 1].
        education[column] = education[column].str.strip(' %').astype(float) * (1.0 / 100.0)

In [7]:
education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 4 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   State, federal district, or territory  52 non-null     object 
 1   % High school graduate or higher       52 non-null     float64
 2   % Bachelor's degree or higher          52 non-null     float64
 3   % Advanced degree                      52 non-null     float64
dtypes: float64(3), object(1)
memory usage: 1.8+ KB


In [8]:
# Shorten the first column name from 'State, federal district, or territory' to 'State'.
long_state_header = 'State, federal district, or territory'
if education.columns[0] == long_state_header:
    education = education.rename(columns={long_state_header: 'State'})

# Once 'State' has become the index this branch is skipped, so the cell can be re-run.
if 'State' in education.columns:
    # Names carry surrounding spaces in the raw file; remove them before indexing.
    education['State'] = education['State'].str.strip()
    education = (
        education
        .set_index('State', verify_integrity=True)  # fail on duplicate state names
        .drop(labels='United States')                # remove the 'United States' row
    )
education = education.sort_index()


In [9]:
# Every index label must be one of the 50 states, the federal district,
# or an inhabited territory.
allowed_names = states | federal_district | inhabited_territories
assert set(education.index) <= allowed_names

# Show the index labels that are not among the 50 states.
print(set(education.index) - states)

{'District of Columbia'}


In [10]:
education.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   % High school graduate or higher  51 non-null     float64
 1   % Bachelor's degree or higher     51 non-null     float64
 2   % Advanced degree                 51 non-null     float64
dtypes: float64(3)
memory usage: 1.6+ KB


In [11]:
# get count of missing values per column
education.isna().sum(axis=0)

% High school graduate or higher    0
% Bachelor's degree or higher       0
% Advanced degree                   0
dtype: int64

In [12]:
education.shape

(51, 3)

In [13]:
education.to_csv(os.path.join(data_path, 'education_clean.csv'))

## Process Life Expectancy file

In [14]:
life_expectancy = pd.read_csv(life_expectancy_path, sep=';')
life_expectancy.shape

(57, 5)

In [15]:
life_expectancy.columns

Index(['State', 'LifeExp2018', 'LifeExp2010', 'MaleLifeExp', 'FemLifeExp'], dtype='object')

In [16]:
life_expectancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   State        57 non-null     object 
 1   LifeExp2018  57 non-null     object 
 2   LifeExp2010  57 non-null     float64
 3   MaleLifeExp  57 non-null     float64
 4   FemLifeExp   57 non-null     float64
dtypes: float64(3), object(2)
memory usage: 2.4+ KB


In [17]:
# Remove leading/trailing whitespace from the state names.
life_expectancy['State'] = life_expectancy['State'].str.strip()

In [18]:
# Drop, in place, every row whose State is 'United States'.
life_expectancy.drop(life_expectancy[life_expectancy.State == 'United States'].index, inplace=True)

In [19]:
# convert column LifeExp2018 from string to float
# The dtype guard makes the cell safe to re-run: once the column is float, calling
# string methods on it would raise, so the conversion is skipped.
if not pd.api.types.is_float_dtype(life_expectancy.LifeExp2018):
    life_expectancy.LifeExp2018 = life_expectancy.LifeExp2018.str.strip().astype(float)
life_expectancy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 0 to 56
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   State        56 non-null     object 
 1   LifeExp2018  56 non-null     float64
 2   LifeExp2010  56 non-null     float64
 3   MaleLifeExp  56 non-null     float64
 4   FemLifeExp   56 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.6+ KB


In [20]:
life_expectancy

Unnamed: 0,State,LifeExp2018,LifeExp2010,MaleLifeExp,FemLifeExp
0,Hawaii,82.3,81.4,79.3,85.3
1,California,81.6,80.6,79.4,83.8
2,Puerto Rico,81.3,78.7,77.6,84.7
3,New York,81.3,80.3,79.0,83.4
4,U.S. Virgin Islands,81.2,79.2,76.3,85.6
5,Minnesota,81.0,80.8,79.0,83.0
6,Connecticut,80.9,80.7,78.7,83.0
7,Guam,80.7,78.2,77.6,83.8
8,Colorado,80.5,80.1,78.5,82.5
9,Massachusetts,80.5,80.5,78.2,82.6


In [21]:
# Index by state name (skipped when 'State' is already the index).
if 'State' in life_expectancy.columns:
    life_expectancy = life_expectancy.set_index('State')

# The index must be exactly: 50 states + inhabited territories + federal district.
expected_names = states | inhabited_territories | federal_district
assert set(life_expectancy.index) == expected_names

In [22]:
life_expectancy.sort_index(inplace=True)

In [23]:
life_expectancy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56 entries, Alabama to Wyoming
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   LifeExp2018  56 non-null     float64
 1   LifeExp2010  56 non-null     float64
 2   MaleLifeExp  56 non-null     float64
 3   FemLifeExp   56 non-null     float64
dtypes: float64(4)
memory usage: 2.2+ KB


In [24]:
life_expectancy.to_csv(os.path.join(data_path, 'life_expectancy_clean.csv'))

## Process Crime file

In [25]:
crime = pd.read_csv(crime_path, sep=';')

In [26]:
crime.columns

Index(['State', 'Population\n(total inhabitants) \n(2015) [2]',
       'Murders and\nNonnegligent\nManslaughter\n(total deaths) \n(2015) [1]',
       'Murders\n(total deaths) \n(2015) [3]',
       'Gun Murders\n(total deaths) \n(2015) [3]',
       'Gun\nOwnership\n(%) \n(2013) [4]',
       'Murder and\nNonnegligent\nManslaughter\nRate\n(per 100,000) \n(2015)',
       'Murder Rate\n(per 100,000) \n(2015)',
       'Gun\nMurder Rate\n(per 100,000) \n(2015)'],
      dtype='object')

In [27]:
# remove '\n' from column names, then collapse the double spaces this leaves behind
crime = crime.rename(
    columns=lambda name: name.replace('\n', ' ').replace('  ', ' ')
)
crime.columns

Index(['State', 'Population (total inhabitants) (2015) [2]',
       'Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]',
       'Murders (total deaths) (2015) [3]',
       'Gun Murders (total deaths) (2015) [3]', 'Gun Ownership (%) (2013) [4]',
       'Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)',
       'Murder Rate (per 100,000) (2015)',
       'Gun Murder Rate (per 100,000) (2015)'],
      dtype='object')

In [28]:
crime.shape

(62, 9)

In [29]:
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 9 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   State                                                            62 non-null     object 
 1   Population (total inhabitants) (2015) [2]                        51 non-null     object 
 2   Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]  51 non-null     object 
 3   Murders (total deaths) (2015) [3]                                51 non-null     object 
 4   Gun Murders (total deaths) (2015) [3]                            51 non-null     object 
 5   Gun Ownership (%) (2013) [4]                                     51 non-null     float64
 6   Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)   51 non-null     float64
 7   Murder Rate (per 100,000) (2015)               

In [30]:
# remove empty rows: rows with no value in any column other than the first ('State')
# how='all' drops only rows that are entirely empty in those columns; a row with a
# single missing value is kept rather than silently discarded.
# .copy() gives an independent frame so the in-place edits in later cells do not
# act on a slice of the original (avoids SettingWithCopyWarning).
data_columns = crime.columns[1:]
crime = crime.dropna(subset=data_columns, how='all').copy()

In [31]:
# Index by state name (skipped when 'State' is already the index), then inspect.
if 'State' in crime.columns:
    crime = crime.set_index('State')
crime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 8 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Population (total inhabitants) (2015) [2]                        51 non-null     object 
 1   Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]  51 non-null     object 
 2   Murders (total deaths) (2015) [3]                                51 non-null     object 
 3   Gun Murders (total deaths) (2015) [3]                            51 non-null     object 
 4   Gun Ownership (%) (2013) [4]                                     51 non-null     float64
 5   Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)   51 non-null     float64
 6   Murder Rate (per 100,000) (2015)                                 51 non-null     object 
 7   Gun Murder Rate (per 100,000) (2015)     

In [32]:
# check correctness of index values: exactly the 50 states plus the federal district
assert set(crime.index) == states | federal_district

In [33]:
def str_to_float(val: str) -> float:
    """Parse a cell value into a float, ignoring a trailing '[...]' marker.

    Everything from the first '[' onwards is discarded and the remainder is
    stripped of whitespace before parsing, so '12.5 [3]' yields 12.5.

    Args:
        val: the raw cell value. Non-string values are passed to ``float``
            unchanged, so already-numeric input is returned as a float.

    Returns:
        The parsed number, or ``np.nan`` when the value cannot be parsed.
    """
    if isinstance(val, str):
        # Keep only the text before the first '['.
        val = val.split('[', 1)[0].strip()
    try:
        return float(val)
    except (TypeError, ValueError):
        # Unparseable text (or None): report as missing instead of raising.
        return np.nan

# Parse every column that is not numeric yet; numeric columns are left untouched.
# pandas' dtype helper replaces the comparison against the abstract `np.number`,
# which depends on a deprecated implicit dtype conversion in NumPy.
# NOTE(review): if 'State' is already the index here, the [1:] slice skips the first
# data column (Population), which then keeps dtype object — confirm whether that is
# intended before widening the loop.
for column in crime.columns[1:]:
    if not pd.api.types.is_numeric_dtype(crime[column]):
        crime[column] = crime[column].apply(str_to_float)

In [34]:
crime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51 entries, Alabama to Wyoming
Data columns (total 8 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Population (total inhabitants) (2015) [2]                        51 non-null     object 
 1   Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]  48 non-null     float64
 2   Murders (total deaths) (2015) [3]                                47 non-null     float64
 3   Gun Murders (total deaths) (2015) [3]                            48 non-null     float64
 4   Gun Ownership (%) (2013) [4]                                     51 non-null     float64
 5   Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)   51 non-null     float64
 6   Murder Rate (per 100,000) (2015)                                 49 non-null     float64
 7   Gun Murder Rate (per 100,000) (2015)     

In [35]:
# Order by state name and persist the cleaned dataset.
crime = crime.sort_index()
crime_clean_path = os.path.join(data_path, 'crime_clean.csv')
crime.to_csv(crime_clean_path)

## Process Area file

In [36]:
area = pd.read_csv(area_path, sep=';')
area.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   State      50 non-null     object 
 1   TotalRank  50 non-null     int64  
 2   TotalSqMi  50 non-null     float64
 3   TotalKmQ   50 non-null     int64  
 4   LandRank   50 non-null     int64  
 5   LandSqMi   50 non-null     float64
 6   LandKmQ    50 non-null     int64  
 7   LandPer    50 non-null     float64
 8   WaterRank  50 non-null     int64  
 9   WaterSqMi  50 non-null     float64
 10  WaterKmQ   50 non-null     int64  
 11  WaterPer   50 non-null     float64
dtypes: float64(5), int64(6), object(1)
memory usage: 4.8+ KB


In [37]:
area.shape

(50, 12)

In [38]:
# Index by state name (skipped when 'State' is already the index).
if 'State' in area.columns:
    area = area.set_index('State')
# The index must be exactly the 50 states.
assert set(area.index) == states

In [39]:
# Order by state name and persist the cleaned dataset.
area = area.sort_index()
area_clean_path = os.path.join(data_path, 'area_clean.csv')
area.to_csv(area_clean_path)

## Process Income file

In [40]:
# Skip the first row of the sheet and leave out every column whose header contains 'Rank'.
income = pd.read_excel(income_path, skiprows=1, usecols= lambda column: 'Rank' not in column)

In [41]:
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   State       51 non-null     object
 1   Income2017  51 non-null     int64 
 2   Income2016  51 non-null     int64 
 3   Income2015  51 non-null     int64 
 4   Income2014  51 non-null     int64 
 5   Income2013  51 non-null     int64 
 6   Income2012  51 non-null     int64 
 7   Income2011  51 non-null     int64 
 8   Income2010  51 non-null     int64 
 9   Income2009  51 non-null     int64 
 10  Income2008  51 non-null     int64 
 11  Income2007  51 non-null     int64 
dtypes: int64(11), object(1)
memory usage: 4.9+ KB


In [42]:
# Index by state name (skipped when 'State' is already the index) and order by it.
if 'State' in income.columns:
    income = income.set_index('State')
income = income.sort_index()

# The index must be exactly the 50 states plus the federal district.
assert set(income.index) == states | federal_district

income_clean_path = os.path.join(data_path, 'income_clean.csv')
income.to_csv(income_clean_path)

## Processing Region file

In [43]:
# Fields are separated by runs of whitespace, hence the regular-expression separator.
region = pd.read_csv(region_path, sep=r'\s+')
region.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 50
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      50 non-null     object
 1   Abb       50 non-null     object
 2   Region    50 non-null     object
 3   Division  50 non-null     object
dtypes: object(4)
memory usage: 2.0+ KB


In [44]:
# Rename the first column from 'Name' to 'State' so it matches the other datasets.
if region.columns[0] == 'Name':
    region.rename(columns={'Name': 'State'}, inplace=True)

# Guarded like the other datasets: once 'State' is the index it is no longer a column,
# and an unguarded set_index would raise KeyError when the cell is re-run.
if 'State' in region.columns:
    region.set_index('State', inplace=True)
region.sort_index(inplace=True)
# The index must be exactly the 50 states.
assert set(region.index) == set(states)

In [45]:
region.to_csv(os.path.join(data_path, 'region_clean.csv'))

# Reports on datasets

 ## Data Collection Report

Target: Examine the data for completeness: check if you have all the cases you need. If the datasets have different numbers of rows, explain where these differences are.
Check if there are missing values and if that’s the case, list the states that have them.
Data Quality Report: report your findings in the data quality report.

### Dataset Education

Problems addressed:
* The first row is semantically empty, it had to be skipped
* The rank columns were skipped at reading
* After reading the data, the column names were cleaned, with '\n' replaced with space; this would help labels on plots etc
* The column 'State, federal district, or territory' contained leading and trailing spaces, which had to be removed. The column was renamed to 'State'. 
* The row with 'United States' was removed from the dataset. 
* Excepting the State column, the other columns were supposed to contain numbers (percentages); they were converted from string to numbers (e.g. '30.7%' -> 0.307)
* The index was set to state names (column State), the dataset was ordered by index.
* We checked by an assertion that the index names are a subset of the 50 states + federal_district + inhabited_territories

### Dataset Life Expectancy

Problems addressed:
* The state names were prepended with space; we had to strip each value on this column.
* The column LifeExp2018 was found to contain string values. The only case with a value not convertible to float was the row corresponding to 'United States'. We eliminated this row and then converted the column to floating point. 
* After this step, we got 56 rows: 50 states, District of Columbia, and inhabited territories

### Dataset Crime

Problems addressed:
* After reading, the column names were cleaned, with '\n' replaced with space; this would help labels on plots etc
* The last few rows were semantically empty, so we removed them; the emptiness was seen both by eyeballing the source file and by comparing the number of rows (62) with the number of non-null values reported by the info method:
    
 0   State                                                            62 non-null     object 
 1   Population (total inhabitants) (2015) [2]                        51 non-null     object 
 2   Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]  51 non-null     object 
 3   Murders (total deaths) (2015) [3]                                51 non-null     object 
 4   Gun Murders (total deaths) (2015) [3]                            51 non-null     object 
 5   Gun Ownership (%) (2013) [4]                                     51 non-null     float64
 6   Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)   51 non-null     float64
 7   Murder Rate (per 100,000) (2015)                                 51 non-null     object 
 8   Gun Murder Rate (per 100,000) (2015)                             51 non-null     object 
 
The column 'State' was further used as index.

Except for the columns 'Gun Ownership (%) (2013) [4]' and 'Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)', all other columns were of type string. Directly converting them to float did not work; a custom parsing step was needed. Still, there were some cases for which the values could not be converted, and they were replaced with NaN. 

### Dataset Area
We've got 50 rows, corresponding to the states.
There are no issues with data types. No missing value was found.

### Dataset Income
The Income dataset was provided as an Excel file. The 50 states and the District of Columbia are given in this file. The rank columns were skipped during the load of the file. There are no missing values. No special issue was encountered during data processing.

### Dataset Region 
This is a text file, with columns separated by multiple spaces. There are 4 columns of type string.

## Dataset description
Data Description Report: describe the data you acquired, the number of rows of the dataframes, the column you used as index, and the structure of each dataframe (column name, type, and description). Write the data description report. This information can serve other data scientists who want to use your study.

### Dataset Education
Acquired data: a dataframe of shape (51, 3). There are 50 states and 'District of Columbia'

The index is State, the dataset is ordered by index.

All three columns are of type floating point, with non-null values.

The column "% High school graduate or higher" contains the percentage of people with high school or higher education.
The column "% Bachelor's degree or higher" contains the percentage of people with bachelor degree or higher education.
The column "% Advanced degree" contains the percentage of people with an advanced degree.

### Dataset Life Expectancy
Initially data acquired: a dataframe of shape (57, 5).
The columns were:
'State', of type string; name of the state, inhabited territory, or District of Columbia; some values prepended with space
'LifeExp2018', of type string; supposed to contain life expectancy in 2018, the row corresponding to 'United States' contained a value which could not be parsed to float.
'LifeExp2010', of type float64; life expectancy in 2010
'MaleLifeExp', of type float64; the header is self-explanatory
'FemLifeExp', of type float64; the header is self-explanatory

We eliminated the row with State='United States'. The remaining rows correspond to the 50 states, the District of Columbia, and the inhabited territories. 
The column State was further used as index. 

### Dataset Crime

Initially, the number of rows was 62. There were some empty rows. After their removal we got 51 rows: one for each state plus the District of Columbia.
The initial data structure was:
Data columns (total 9 columns):
 \#   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   State                                                            62 non-null     object 
 1   Population (total inhabitants) (2015) [2]                        51 non-null     object 
 2   Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]  51 non-null     object 
 3   Murders (total deaths) (2015) [3]                                51 non-null     object 
 4   Gun Murders (total deaths) (2015) [3]                            51 non-null     object 
 5   Gun Ownership (%) (2013) [4]                                     51 non-null     float64
 6   Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)   51 non-null     float64
 7   Murder Rate (per 100,000) (2015)                                 51 non-null     object 
 8   Gun Murder Rate (per 100,000) (2015)                             51 non-null     object 
The column 'State' was further used as index

The object columns were converted to float, except 'Population (total inhabitants) (2015) [2]', which remained of type object. Not all values could be cast as such; those were replaced with NaNs.

### Dataset Area
The dataset shape is (50, 12). Each row corresponds to a state. The initial columns are:

* State: name of the state                              
* TotalRank: total area rank  
* TotalSqMi: total area in SqMi
* TotalKmQ: total area in KmQ
* LandRank: land area rank
* LandSqMi: land area in SqMi 
* LandKmQ: land area in KmQ
* LandPer: land area percentage 
* WaterRank: water area rank
* WaterSqMi: water area in SqMi
* WaterKmQ: water area in KmQ
* WaterPer: water area percentage
The column 'State' was further used as index

### Income
The Income dataset has the following columns (excluding the rank columns, which were skipped at load time):

* Income2017  51 non-null     int64 
* Income2016  51 non-null     int64 
* Income2015  51 non-null     int64 
* Income2014  51 non-null     int64 
* Income2013  51 non-null     int64 
* Income2012  51 non-null     int64 
* Income2011  51 non-null     int64 
* Income2010  51 non-null     int64 
* Income2009  51 non-null     int64 
* Income2008  51 non-null     int64 
* Income2007  51 non-null     int64 
 
There are no missing values. The column 'State' was further used as index.

### Dataset Region

There are 50 rows in this dataset, one for each state. The data are in the end indexed by column 'Name', renamed as 'State'. The other three columns are:
* Abb: abbreviation of the name of the state
* Region: the region that each state belongs to (Northeast, South, North Central, West)
* Division: state divisions (New England, Middle Atlantic, South Atlantic, East South Central, West South Central, East North Central, West North Central, Mountain, and Pacific)

There are no missing values. 

## Data Quality

### Dataset Education

All the values are known (i.e., no null/nans). 
There are 50 rows (one for each state) and yet another one for District of Columbia.
The cleaned dataset is saved as education_clean.csv

### Dataset Life Expectancy
The value for row United States and column LifeExp2018 was not parseable to a float, but the whole row was eliminated (not a state itself)
All other values were known. 
Unlike the Education dataset, which covers only the 50 states and the District of Columbia, it also contains values for the 5 inhabited territories.
The cleaned dataset was saved as life_expectancy_clean.csv


### Dataset Crime

After removing the trailing empty rows, the dataset remained with 51 records (50 states + District of Columbia). 
There are some missing values in the 8 columns, after trying to convert them to floating points:
Data columns (total 8 columns):

 \# Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Population (total inhabitants) (2015) [2]                        51 non-null     object 
 1   Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]  51 non-null     object 
 2   Murders (total deaths) (2015) [3]                                51 non-null     object 
 3   Gun Murders (total deaths) (2015) [3]                            51 non-null     object 
 4   Gun Ownership (%) (2013) [4]                                     51 non-null     float64
 5   Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)   51 non-null     float64
 6   Murder Rate (per 100,000) (2015)                                 51 non-null     object 
 7   Gun Murder Rate (per 100,000) (2015)                             51 non-null     object 
 
Most of the values were convertible to float; what could not be converted was replaced with NaNs.

The status of the columns is:
 0   Population (total inhabitants) (2015) [2]                        51 non-null     object 
 1   Murders and Nonnegligent Manslaughter (total deaths) (2015) [1]  48 non-null     float64
 2   Murders (total deaths) (2015) [3]                                47 non-null     float64
 3   Gun Murders (total deaths) (2015) [3]                            48 non-null     float64
 4   Gun Ownership (%) (2013) [4]                                     51 non-null     float64
 5   Murder and Nonnegligent Manslaughter Rate (per 100,000) (2015)   51 non-null     float64
 6   Murder Rate (per 100,000) (2015)                                 49 non-null     float64
 7   Gun Murder Rate (per 100,000) (2015)                             49 non-null     float64

The resulting dataset was saved as crime_clean.csv

### Dataset Area

The dataset contains values for all 50 states. There are no missing values. 
The cleaned dataset was saved as area_clean.csv.

### Dataset Income
The dataset contains values for all 50 states and the District of Columbia. There are no missing values. 
The cleaned dataset was saved as income_clean.csv.

### Dataset Region
The dataset contains values for all 50 states. There are no missing values. 
The cleaned dataset was saved as region_clean.csv.