In [1]:
import pandas as pd

<img src='http://media4.s-nbcnews.com/j/newscms/2016_36/1685951/ss-160826-twip-05_8cf6d4cb83758449fd400c7c3d71aa1f.nbcnews-ux-2880-1000.jpg'>

In [2]:
#load the 2014 US states table straight from its URL into a dataframe
states = pd.read_csv(
    'https://raw.githubusercontent.com/plotly/datasets/master/2014_usa_states.csv'
)

In [3]:
#preview the first five rows of the states dataframe
states.head(n=5)

Unnamed: 0,rank,state,postal,pop
0,1,Alabama,AL,4849377.0
1,2,Alaska,AK,736732.0
2,3,Arizona,AZ,6731484.0
3,4,Arkansas,AR,2966369.0
4,5,California,CA,38802500.0


In [4]:
#load the February 2011 US airport traffic table from its URL
airports = pd.read_csv(
    'https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv'
)

In [5]:
#preview the first five rows of the airports dataframe
airports.head(n=5)

Unnamed: 0,iata,airport,city,state,country,lat,long,cnt
0,ORD,Chicago O'Hare International,Chicago,IL,USA,41.979595,-87.904464,25129
1,ATL,William B Hartsfield-Atlanta Intl,Atlanta,GA,USA,33.640444,-84.426944,21925
2,DFW,Dallas-Fort Worth International,Dallas-Fort Worth,TX,USA,32.895951,-97.0372,20662
3,PHX,Phoenix Sky Harbor International,Phoenix,AZ,USA,33.434167,-112.008056,17290
4,DEN,Denver Intl,Denver,CO,USA,39.858408,-104.667002,13781


In [6]:
#load a third table: the 2011 US agricultural exports data
agriculture = pd.read_csv(
    'https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv'
)

In [7]:
#preview the first five rows of the agriculture dataframe
agriculture.head(n=5)

Unnamed: 0,code,state,category,total exports,beef,pork,poultry,dairy,fruits fresh,fruits proc,total fruits,veggies fresh,veggies proc,total veggies,corn,wheat,cotton
0,AL,Alabama,state,1390.63,34.4,10.6,481.0,4.06,8.0,17.1,25.11,5.5,8.9,14.33,34.9,70.0,317.61
1,AK,Alaska,state,13.31,0.2,0.1,0.0,0.19,0.0,0.0,0.0,0.6,1.0,1.56,0.0,0.0,0.0
2,AZ,Arizona,state,1463.17,71.3,17.9,0.0,105.48,19.3,41.0,60.27,147.5,239.4,386.91,7.3,48.7,423.95
3,AR,Arkansas,state,3586.02,53.2,29.4,562.9,3.53,2.2,4.7,6.88,4.4,7.1,11.45,69.5,114.5,665.44
4,CA,California,state,16472.88,228.7,11.1,225.4,929.95,2791.8,5944.6,8736.4,803.2,1303.5,2106.79,34.6,249.3,1064.95


In [8]:
#count the distinct values in the 'category' column
#if every row really is a state, we expect exactly one
agriculture['category'].nunique()

2

In [9]:
#two distinct categories is one more than we want, so list the actual values
#whatever differs between them is a data cleaning issue
agriculture['category'].unique()

array(['state', ' state'], dtype=object)

In [10]:
#we won't use the category column, but let's walk through using a lambda to fix it
#the lambda strips leading and trailing whitespace from a value, so ' state' becomes 'state'
#strip() is used rather than replace(' ', '') so that spaces inside a multi-word value are kept
#map applies the lambda to every entry of the column, and we save the result over the column itself
#(the vectorized equivalent is agriculture.category.str.strip())
agriculture['category'] = agriculture.category.map(lambda x: x.strip())

In [11]:
#re-count the distinct categories to confirm the cleanup worked
#a single remaining value means it did
agriculture['category'].nunique()

1

In [12]:
#does the 'state' column have a similar problem?
#count its distinct values; 50 of them means no duplicated spellings
agriculture['state'].nunique()

50

In [15]:
#place the airports and states dataframes side by side (axis=1 joins them column-wise)
new_df = pd.concat(objs=[airports, states], axis=1)

In [16]:
#count the null values in each column of the concatenated dataframe
new_df.isnull().sum(axis=0)

iata         0
airport      0
city         1
state        1
country      0
lat          0
long         0
cnt          0
rank       169
state      169
postal     169
pop        169
dtype: int64

In [17]:
#the two dataframes we concatenated were of different lengths, so once pandas ran out of rows on the states dataframe
#it started to fill in null values
#print() is called with parentheses so the cell runs on both Python 2 and Python 3
print(new_df.head())
new_df.tail()

  iata                            airport               city state country  \
0  ORD       Chicago O'Hare International            Chicago    IL     USA   
1  ATL  William B Hartsfield-Atlanta Intl            Atlanta    GA     USA   
2  DFW    Dallas-Fort Worth International  Dallas-Fort Worth    TX     USA   
3  PHX   Phoenix Sky Harbor International            Phoenix    AZ     USA   
4  DEN                        Denver Intl             Denver    CO     USA   

         lat        long    cnt  rank       state postal         pop  
0  41.979595  -87.904464  25129   1.0     Alabama     AL   4849377.0  
1  33.640444  -84.426944  21925   2.0      Alaska     AK    736732.0  
2  32.895951  -97.037200  20662   3.0     Arizona     AZ   6731484.0  
3  33.434167 -112.008056  17290   4.0    Arkansas     AR   2966369.0  
4  39.858408 -104.667002  13781   5.0  California     CA  38802500.0  


Unnamed: 0,iata,airport,city,state,country,lat,long,cnt,rank,state.1,postal,pop
216,EAU,Chippewa Valley Regional,Eau Claire,WI,USA,44.865257,-91.485072,48,,,,
217,DBQ,Dubuque Municipal,Dubuque,IA,USA,42.402959,-90.709167,48,,,,
218,RST,Rochester International,Rochester,MN,USA,43.908826,-92.497987,37,,,,
219,UTM,Tunica Municipal Airport,Tunica,MS,USA,34.681499,-90.348816,32,,,,
220,BIL,Billings Logan Intl,Billings,MT,USA,45.807662,-108.542861,23,,,,


In [18]:
#alternative: merge the two dataframes on the state code instead
#airports holds the code in 'state', states holds it in 'postal'
new_df = airports.merge(states, left_on='state', right_on='postal')

In [19]:
#this time we have no nulls!
#isnull().any() reports, for each column, whether it contains at least one null value
#(checking new_df.sum().isnull() instead would not work: sum() skips nulls by default,
#so a null inside a column never shows up in that column's sum)
new_df.isnull().any()

iata       False
airport    False
city       False
state_x    False
country    False
lat        False
long       False
cnt        False
rank       False
state_y    False
postal     False
pop        False
dtype: bool

In [20]:
#instead, pandas filled in state information for every single airport, even if they were from the same state
#print() is called with parentheses so the cell runs on both Python 2 and Python 3
print(new_df.head())
new_df.tail()

  iata                       airport         city state_x country        lat  \
0  ORD  Chicago O'Hare International      Chicago      IL     USA  41.979595   
1  MDW                Chicago Midway      Chicago      IL     USA  41.785983   
2  MLI                     Quad City       Moline      IL     USA  41.448526   
3  BMI     Central Illinois Regional  Bloomington      IL     USA  40.477986   
4  PIA       Greater Peoria Regional       Peoria      IL     USA  40.664243   

        long    cnt  rank   state_y postal         pop  
0 -87.904464  25129    14  Illinois     IL  12880580.0  
1 -87.752424   6979    14  Illinois     IL  12880580.0  
2 -90.507539    451    14  Illinois     IL  12880580.0  
3 -88.915953    426    14  Illinois     IL  12880580.0  
4 -89.693306    404    14  Illinois     IL  12880580.0  


Unnamed: 0,iata,airport,city,state_x,country,lat,long,cnt,rank,state_y,postal,pop
215,FCA,Glacier Park Intl,Kalispell,MT,USA,48.311405,-114.255069,90,27,Montana,MT,1023579.0
216,BIL,Billings Logan Intl,Billings,MT,USA,45.807662,-108.542861,23,27,Montana,MT,1023579.0
217,BTV,Burlington International,Burlington,VT,USA,44.473004,-73.150312,91,47,Vermont,VT,626562.0
218,LWB,Greenbrier Valley,Lewisburg,WV,USA,37.858306,-80.399472,56,50,West Virginia,WV,1850326.0
219,CRW,Yeager,Charleston,WV,USA,38.373151,-81.59319,56,50,West Virginia,WV,1850326.0


## Merged!

<img src='http://wdy.h-cdn.co/assets/16/05/980x490/landscape-1454612525-baby-pandas.jpg'>

Concatenation takes two or more separate objects and places them together, either side by side (as columns) or stacked one on top of another (as rows), so that they can be treated as one object. 

A join (merge) in pandas combines the columns of two DataFrames by matching their rows, either on the index or on a key column. 

In [21]:
#suppose we only want the airport codes merged with the entire states dataframe
#we still need a key to merge on, so we keep the 'state' column from airports alongside 'iata'
#a single pair of brackets around a column name returns a series;
#passing a list of column names (the double brackets) returns a new dataframe instead
another_one = airports[['iata', 'state']].merge(states, left_on='state', right_on='postal')

In [22]:
#look at the first 25 rows of the merged result
another_one.head(n=25)

Unnamed: 0,iata,state_x,rank,state_y,postal,pop
0,ORD,IL,14,Illinois,IL,12880580.0
1,MDW,IL,14,Illinois,IL,12880580.0
2,MLI,IL,14,Illinois,IL,12880580.0
3,BMI,IL,14,Illinois,IL,12880580.0
4,PIA,IL,14,Illinois,IL,12880580.0
5,CMI,IL,14,Illinois,IL,12880580.0
6,SPI,IL,14,Illinois,IL,12880580.0
7,ATL,GA,11,Georgia,GA,10097343.0
8,SAV,GA,11,Georgia,GA,10097343.0
9,CSG,GA,11,Georgia,GA,10097343.0


## Independent practice

In [24]:
##Merge a few columns from the states dataframe with the agriculture dataframe. 




In [None]:


##Concatenate the airports and agriculture dataframes, first on axis=0, then on axis=1