In [1]:
import numpy as np
import pandas as pd
import string

In [2]:
def read_file(path, extension, blank_firstrow=False):
    """Load one raw data file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.
    extension : str
        Format selector: 'csv' (semicolon-separated text), 'txt'
        (tab-separated text) or 'xls' (Excel workbook).
    blank_firstrow : bool, default False
        Only consulted for 'csv': when True the first row of the file is
        skipped before the header is read. The 'xls' branch always skips
        one row and the 'txt' branch never skips any, whatever this flag is.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    ValueError
        If ``extension`` is not one of the supported selectors. (Previously
        this case fell through to ``return df`` with ``df`` unbound.)
    """
    if extension == 'csv':
        # skiprows=None is the pandas default, i.e. "skip nothing".
        skiprows = 1 if blank_firstrow else None
        # escapechar='\n' is kept exactly as before.
        # NOTE(review): presumably this swallows line breaks embedded in
        # quoted header cells -- confirm against the raw files.
        return pd.read_csv(path, sep=';', skiprows=skiprows, escapechar='\n')
    if extension == 'txt':
        return pd.read_csv(path, sep='\t')
    if extension == 'xls':
        return pd.read_excel(path, skiprows=1)
    raise ValueError(
        "Unsupported extension {!r}; expected 'csv', 'txt' or 'xls'".format(extension)
    )

In [3]:
def remove_rank_cols(df):
    """Drop every column whose label contains 'rank' (case-insensitive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without the rank columns.
    """
    # Column labels are not guaranteed to be strings (e.g. integer headers),
    # so coerce each one before lower-casing instead of calling .lower() on
    # the raw label.
    rank_cols = [label for label in df.columns if 'rank' in str(label).lower()]
    df.drop(columns=rank_cols, inplace=True)
    return df

In [4]:
def clean_states_text(states):
    """Normalise state names: trim, strip punctuation, spaces -> underscores.

    Each name has surrounding whitespace removed, then every ASCII
    punctuation character except '_' deleted, then every remaining space
    replaced by '_'.

    Underscores are deliberately preserved. '_' is a member of
    ``string.punctuation``, and the earlier character-by-character loop
    therefore removed underscores it had itself just produced whenever the
    input already contained one (``'New_York'`` became ``'NewYork'``).
    Keeping '_' makes the function idempotent, so already-cleaned names
    pass through unchanged.

    Parameters
    ----------
    states : iterable of str
        Raw state names.

    Returns
    -------
    list of str
        Cleaned names, in the same order as the input.
    """
    invalid_chars = string.punctuation.replace('_', '')
    # One translation table, built once, deletes all invalid characters in a
    # single pass per name.
    delete_invalid = str.maketrans('', '', invalid_chars)
    return [
        name.strip().translate(delete_invalid).replace(' ', '_')
        for name in states
    ]

In [5]:
def remove_summary_row(df):
    """Remove the nationwide 'United_States' summary row(s) from ``df``.

    The match is made against the *values* of the 'State' column. The
    previous ``"United_States" in df['State']`` tested the Series' index
    labels instead, so the summary row was never actually found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'State' column. It is modified in place; matching rows
        are dropped by their index label.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If ``df`` has no 'State' column.
    """
    is_summary = df['State'] == 'United_States'
    if is_summary.any():
        df.drop(index=df.index[is_summary], inplace=True)
    return df

In [6]:
def index_sort(df):
    """Make 'State' the row index of ``df`` and order the rows by it.

    Both steps mutate ``df`` in place; the same object is handed back so the
    call can be used in an assignment.
    """
    df.set_index(keys='State', inplace=True)
    # Sorting along the row axis is sort_index's default.
    df.sort_index(inplace=True)
    return df

In [7]:
def drop_duplicate_rows(df):
    """Keep only the first row for each duplicated index label.

    The duplicate check and the removal now both operate on the index.
    Previously the removal used ``drop_duplicates(subset=["State"])``, which
    raises ``KeyError`` once 'State' has been moved into the index, as it
    has been when this runs after ``index_sort``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index may contain repeated labels.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when the index is already unique, otherwise a new
        frame holding the first occurrence of every index label.
    """
    if df.index.has_duplicates:
        df = df[~df.index.duplicated(keep='first')]
    return df

In [8]:
# Load each raw source file into its own DataFrame.
# The education CSV is read with its first row skipped.
df_edu = read_file('../data/csv/education.csv', 'csv', blank_firstrow=True)
df_reg = read_file('../data/csv/region.txt', 'txt')
df_area = read_file('../data/csv/area.csv', 'csv')
df_exp = read_file('../data/csv/life_expectancy.csv', 'csv')
df_crime = read_file('../data/csv/crime.csv', 'csv')
df_income = read_file('../data/csv/income.xls', 'xls', blank_firstrow=True)

### Cleaning Education file

In [9]:
# Preview the first rows of the education table as loaded, before any cleaning
df_edu.head()

Unnamed: 0,"State,federal district,or territory",% High school graduateor higher,High School rank,% Bachelor's degreeor higher,Bachelor's rank,% Advanced degree,Advanced rank
0,Montana,93.0%,1.0,30.7%,21.0,10.1%,33.0
1,New Hampshire,92.8%,2.0,36.0%,9.0,13.8%,10.0
2,Minnesota,92.8%,3.0,34.8%,11.0,11.8%,18.0
3,Wyoming,92.8%,4.0,26.7%,41.0,9.3%,39.0
4,Alaska,92.4%,5.0,29.0%,28.0,10.4%,29.0


In [10]:
# Drop the rank columns
df_edu = remove_rank_cols(df_edu)
# Give the remaining columns short names
df_edu.rename(
    columns={
        'State,federal district,or territory': 'State',
        '% High school graduateor higher': 'HS_Grad',
        "% Bachelor's degreeor higher": 'BS_Grade',
        '% Advanced degree': 'Adv_Degree',
    },
    inplace=True,
)
# Convert the percentage strings (e.g. '93.0%') to numeric values
for pct_col in ('HS_Grad', 'BS_Grade', 'Adv_Degree'):
    df_edu[pct_col] = pd.to_numeric(df_edu[pct_col].str.replace('%', ''))
# Strip invalid characters from the state names and replace spaces with underscores
state_list = df_edu['State'].tolist()
df_edu['State'] = clean_states_text(state_list)
# Remove the summary row
df_edu = remove_summary_row(df_edu)
# Index by State and sort
df_edu = index_sort(df_edu)
# Drop rows with a duplicated index
df_edu = drop_duplicate_rows(df_edu)

In [11]:
# Display the education table after cleaning (indexed and sorted by State)
df_edu

Unnamed: 0_level_0,HS_Grad,BS_Grade,Adv_Degree
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,85.3,24.5,9.1
Alaska,92.4,29.0,10.4
Arizona,82.1,28.4,10.7
Arkansas,85.6,22.0,7.9
California,82.5,32.6,12.2
Colorado,91.1,39.4,14.6
Connecticut,90.2,38.4,17.0
Delaware,89.3,31.0,12.9
District_of_Columbia,90.3,56.6,32.8
Florida,87.6,28.5,10.3
