## 2. Cleaning Data

### 2.1. Calling libraries

In [None]:
import pandas as pd
import numpy as np

### 2.2. Functions

In [None]:
# Strip null entries (as judged by pandas) out of a list.
def cleanan(input_list):
    """Return a new list with the non-null entries of ``input_list``.

    An entry is kept when ``pd.notnull`` reports it as present; the relative
    order of the kept entries is unchanged and the input is not modified.
    """
    kept = []
    for entry in input_list:
        if pd.notnull(entry):
            kept.append(entry)
    return kept

### 2.3. All the CSVs created in cap_read will be loaded here (they are merged in 2.4)

In [None]:
# Read 'data/pct_rural_df.csv' using its second column as the row index, then
# discard the leftover 'Unnamed: 0' column and preview the first three rows.
rural_df = pd.read_csv('data/pct_rural_df.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
rural_df.head(3)

In [None]:
# Read 'data/pop_hous_df.csv' using its second column as the row index, then
# discard the leftover 'Unnamed: 0' column and preview the first three rows.
housing_df = pd.read_csv('data/pop_hous_df.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
housing_df.head(3)

In [None]:
# Read 'data/indus_dist.csv' using its second column as the row index, then
# discard the leftover 'Unnamed: 0' column and preview the first three rows.
industry_df = pd.read_csv('data/indus_dist.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
industry_df.head(3)

In [None]:
# Read 'data/density_pct.csv' using its second column as the row index, discard
# the leftover 'Unnamed: 0' column, and relabel 'Hous%80-90.1' as 'Pop%80-90'.
# NOTE(review): the '.1' suffix looks like pandas de-duplicating a repeated
# header in the CSV - confirm that this column really holds population figures.
density_df = (
    pd.read_csv('data/density_pct.csv', index_col=1)
    .drop(['Unnamed: 0'], axis=1)
    .rename(columns={'Hous%80-90.1' : 'Pop%80-90'})
)
density_df.head(3)

In [None]:
# Read 'data/farm.csv' using its second column as the row index, then discard
# the leftover 'Unnamed: 0' column and preview the first three rows.
farm0712_df = pd.read_csv('data/farm.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
farm0712_df.head(3)

In [None]:
# Read 'data/home_sold.csv' using its second column as the row index, then
# discard the leftover 'Unnamed: 0' column and preview the first three rows.
home_sale_df = pd.read_csv('data/home_sold.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
home_sale_df.head(3)

In [None]:
# Read 'data/farm2.csv' using its second column as the row index, then discard
# the leftover 'Unnamed: 0' column and preview the first three rows.
farm02_df = pd.read_csv('data/farm2.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
farm02_df.head(3)

In [None]:
# Read 'data/pasture.csv' using its second column as the row index, then
# discard the leftover 'Unnamed: 0' column and preview the first three rows.
pasture0712_df = pd.read_csv('data/pasture.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
pasture0712_df.head(3)

In [None]:
# Read 'data/pasture2.csv' using its second column as the row index, then
# discard the leftover 'Unnamed: 0' column and preview the first three rows.
pasture02_df = pd.read_csv('data/pasture2.csv', index_col=1).drop(['Unnamed: 0'], axis=1)
pasture02_df.head(3)

### 2.4. Merge the dataframes

In [None]:
# Left-join every table onto rural_df by row index, one table at a time, so
# the result keeps exactly the rows of rural_df. Columns whose names collide
# during a merge receive pandas' default '_x' / '_y' suffixes.
# NOTE(review): farm02_df is loaded earlier in this file but is not part of
# this chain - confirm whether leaving it out is intentional.
explan_df = rural_df
for frame in (
    housing_df,
    industry_df,
    density_df,
    home_sale_df,
    farm0712_df,
    pasture0712_df,
    pasture02_df,
):
    explan_df = explan_df.merge(frame, left_index=True, right_index=True, how='left')

In [None]:
# Some 2007 duplicate columns are left behind by the merges; they are the ones
# whose names end in '_y', and they are removed here.
dup_07 = [name for name in explan_df.columns if name.endswith('_y')]
explan_df.drop(dup_07, axis=1, inplace=True)

### 2.5. Create a list of columns

This will take place separately: a file will be created in Excel that simplifies the tables and adds a description to the table for future reference.

In [None]:
# Build the column-dictionary template: one row per column of explan_df, with
# empty 'brief_header' and 'description' fields to be filled in by hand.
#
# The template is built directly on explan_df.columns. The earlier approach
# transposed explan_df and then dropped its first 95 columns, which only gives
# this two-column result when explan_df has exactly 95 rows. It also evaluated
# a null-count expression whose value was discarded; that line is gone.
explan_cols = pd.DataFrame(
    {'brief_header': '', 'description': ''},
    index=explan_df.columns,
)
# Kept for reference: the one-off export of the template.
# explan_cols.to_csv('data/Col_Dict2.csv')

### 2.6. Make a brief heading for the columns + the definition and source of information

In [None]:
# Load the column dictionary and display it.
# NOTE(review): presumably this is the Excel-edited file described in 2.5; the
# cells below rely on it having an 'Unnamed: 0' column plus 'brief_header' and
# 'description' columns - confirm against the CSV.
col_dict = pd.read_csv('data/Col_Dict.csv')
col_dict

In [None]:
# Attach the dictionary entries to the template: the template's index (the
# current column names) is matched against col_dict's 'Unnamed: 0' column, and
# every template row is kept even when there is no match.
label_dict = pd.merge(
    explan_cols,
    col_dict,
    how='left',
    left_index=True,
    right_on='Unnamed: 0',
)

In [None]:
# The '_x' columns are the empty placeholders that came from the template side
# of the merge; only the dictionary's own values are kept.
label_dict = label_dict.drop(['brief_header_x', 'description_x'], axis=1)

In [None]:
# Give the four remaining columns fixed names. The renaming cell below reads
# the first column ('old') as the current column name and the second ('new')
# as its replacement.
label_dict.columns = ['old','new','description','source']

In [None]:
# This block renames the columns. A dictionary will describe what each column means.
# For every column of explan_df, in order, each label_dict row whose first
# field equals the column name contributes its second field as the new name.
old_names = label_dict.iloc[:, 0]
new_names = label_dict.iloc[:, 1]

renam = []
for current in explan_df.columns:
    for old, new in zip(old_names, new_names):
        if current == old:
            renam.append(new)

# The assignment needs exactly one collected name per existing column.
explan_df.columns = renam

In [None]:
# Work on a copy so that explan_df keeps its original NaNs; the copy is the
# frame whose NaNs get filled, which makes it possible to compare the two
# afterwards and confirm that non-NaN values were not touched.
nonan_df = explan_df.copy()

In [None]:
# This block fills the NaNs with a random value from the same column.
# Each missing cell gets its own draw from the column's distinct non-null
# values (cleanan strips the null that .unique() reports), so every distinct
# value is equally likely no matter how often it occurs in the column.
# No seed is set here, so the filled values differ between runs unless the
# NumPy global random state was seeded beforehand.

for c in range(nonan_df.shape[1]):
    column = nonan_df.iloc[:, c]
    # Missing rows are located once per column instead of probing every cell.
    missing = column.isnull()
    if not missing.any():
        continue

    uniq_val = cleanan(list(column.unique()))
    if not uniq_val:
        # np.random.choice would also raise ValueError on an empty pool, but
        # its message does not say which column is at fault.
        raise ValueError(
            'Column {!r} has no non-null values to sample from'.format(nonan_df.columns[c])
        )

    # One draw per missing cell, top to bottom within the column.
    for r, is_missing in enumerate(missing):
        if is_missing:
            nonan_df.iloc[r, c] = np.random.choice(uniq_val)

In [None]:
# Make sure no NaN value is left: list only the columns that still contain
# nulls, together with their null counts (an empty result means none remain).
null_counts = nonan_df.isnull().sum()
null_counts[null_counts != 0]

In [None]:
# Persist the NaN-free table for the next step. No index argument is passed,
# so pandas' default applies and the row index is written as the first column.
nonan_df.to_csv('data/cleaned.csv')

### Next Step: Running PCA