# Data Cleaning

In [886]:
# Import dependencies
import pandas as pd

In [887]:
# Store file names, desired areas, and desired states in lists
file_names = ["lat_lon", "population", "commute",\
             "atlanta_housing", "chicago_housing", "dallas_housing", "denver_housing"]
areas = ["Atlanta", "Chicago", "Dallas", "Denver", "Detroit", "Los Angeles", "Miami", "New York", "Philadelphia"]
areas = sorted(areas)
state = ["GA", "IL", "TX", "CO", "IL", "MI", "CA", "FL", "PA", "NY"]

In [888]:
# define function to read csv files:
def get_file():
    global df
    data = pd.read_csv(f"resources/{file_name}.csv", encoding="ISO-8859-1", header=None, error_bad_lines=False)  
    df = pd.DataFrame(data)
    print(f"\nraw: {file_name}")
    return df

# Latitudes and Longitudes

In [889]:
# extract latitudes and longitudes
file_name = "lat_lon"

# Get raw data from all files and print out
get_file()
df = df.loc[2:, [2,3,5,6]]
columns = ["state", "area", "latitude", "longitude"]
df.columns = columns
print(df.head())

# Get desired data from DataFrame:
df = df.loc[df["area"].isin(areas)]
df = df.sort_values("area")
df = df.groupby("area").first().reset_index()
df = df.iloc[:len(areas),:]
output_columns = ["area", "state", "latitude", "longitude"]
df.columns = output_columns
print(f"\n\nclean: {file_name}")
print(df)
print("\n\nfinal:")
mega_df = df
print(mega_df)


raw: lat_lon
  state           area  latitude  longitude
2    NY       Valhalla   41.0877   -73.7768
3    PA     Pittsburgh   40.4495   -79.9880
4    MO      Bridgeton   38.7667   -90.4201
5    CA  San Francisco   37.7353  -122.3732
6    NY       New York   40.7528   -73.9725


clean: lat_lon
           area state  latitude  longitude
0       Atlanta    GA   33.7490   -84.3880
1       Chicago    IL   41.8119   -87.6873
2        Dallas    TX   32.7155   -96.7684
3        Denver    IA   42.6863   -92.3417
4       Detroit    MI   42.3314   -83.0457
5   Los Angeles    CA   34.0522  -118.2437
6         Miami    FL   25.7743   -80.1937
7      New York    NY   40.7391   -73.9826
8  Philadelphia    PA   39.9767   -75.2586


final:
           area state  latitude  longitude
0       Atlanta    GA   33.7490   -84.3880
1       Chicago    IL   41.8119   -87.6873
2        Dallas    TX   32.7155   -96.7684
3        Denver    IA   42.6863   -92.3417
4       Detroit    MI   42.3314   -83.0457
5   Los 

# Population

In [892]:
# Variables
file_name = "population"

# Get raw data from all files and print out
get_file()
df = df.iloc[3:(3+len(areas)), [2,3]]
print(df)

# Get data for desired areas
columns = ["metro", file_name]
df.columns = columns
df["area"] = df["metro"].str.split('-', 1).str[0]
del df["metro"]
columns = ["area", file_name]
df = df[columns].sort_values("area")
df = df.reset_index(drop=True)
df = df.dropna()
print(f"\n\nclean: {file_name}")
print(df)

mega_df = pd.concat([mega_df, df], axis=1)
mega_df = mega_df.T.drop_duplicates().T
print("\n\nfinal:")
print(mega_df)


raw: population
                                                    2         3
3   Los Angeles-Long Beach-Anaheim, CA Metro Area;...  13189366
4     Denver-Aurora-Lakewood, CO Metro Area; Colorado   2752056
5   Miami-Fort Lauderdale-West Palm Beach, FL Metr...   5926955
6   Atlanta-Sandy Springs-Roswell, GA Metro Area; ...   5612777
7   Chicago-Naperville-Elgin, IL-IN-WI Metro Area ...   8656303
8    Detroit-Warren-Dearborn, MI Metro Area; Michigan   4296731
9   New York-Newark-Jersey City, NY-NJ-PA Metro Ar...  13380318
10  Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...   4076378
11  Dallas-Fort Worth-Arlington, TX Metro Area; Texas   6957123


clean: population
           area population
0       Atlanta    5612777
1       Chicago    8656303
2        Dallas    6957123
3        Denver    2752056
4       Detroit    4296731
5   Los Angeles   13189366
6         Miami    5926955
7      New York   13380318
8  Philadelphia    4076378


final:
           area state latitude longitude pop

# Commute

In [893]:
# Define function to extract latitudes and longitudes
def commute():
    file_name = "commute"
    global df
    global mega_df
    # Get raw data from all files and print out
    get_file()
    df = df.loc[2:, [2,3]]
    print(df)
    # Get data for desired areas
    columns = ["metro", file_name]
    df.columns = columns
    df.loc[range(len(areas)), columns]
    df["area"] = df["metro"].str.split('-', 1).str[0]
    del df["metro"]
    columns = ["area", file_name]
    df = df[columns].sort_values("area")
    df = df.reset_index(drop=True)
    df = df.loc[range(len(areas)), :]
    print(f"\n\nclean: {file_name}")
    print(df)
    mega_df = pd.concat([mega_df, df], axis=1)
    mega_df = mega_df.T.drop_duplicates().T
    print("\n\nfinal:")
    print(mega_df)

In [894]:
# Global variables
file_name = "commute"

# Call functions and get latitudes and longitudes
commute()


raw: commute
                                                    2          3
2   Los Angeles-Long Beach-Anaheim, CA Metro Area;...  171087545
3     Denver-Aurora-Lakewood, CO Metro Area; Colorado   36188125
4   Miami-Fort Lauderdale-West Palm Beach, FL Metr...   74035650
5   Atlanta-Sandy Springs-Roswell, GA Metro Area; ...   75949245
6   Chicago-Naperville-Elgin, IL-IN-WI Metro Area ...  125086155
7    Detroit-Warren-Dearborn, MI Metro Area; Michigan   49061055
8   New York-Newark-Jersey City, NY-NJ-PA Metro Ar...  225622415
9   Philadelphia-Camden-Wilmington, PA-NJ-DE-MD Me...   54031170
10  Dallas-Fort Worth-Arlington, TX Metro Area; Texas   88686850


clean: commute
           area    commute
0       Atlanta   75949245
1       Chicago  125086155
2        Dallas   88686850
3        Denver   36188125
4       Detroit   49061055
5   Los Angeles  171087545
6         Miami   74035650
7      New York  225622415
8  Philadelphia   54031170


final:
           area state latitude longitude

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
