# Combining DataFrames with Pandas

## Objectives
* Combine data from multiple files into a single dataframe using `merge` and `concat`
* Combine two dataframes using a unique ID found in both dataframes
* Employ `to_csv` to export a dataframe in CSV format
* Join dataframes using common fields (join keys)

In [12]:
import pandas as pd
import os as os

# Work from the lesson directory so the relative data paths below resolve
os.chdir('/home/meelyn/Documents/cloud_development/sw-python-ecology-lesson')

# Load the surveys and species tables.
# Both reads share the same missing-value handling: pandas' built-in list of
# NA markers is switched off and only empty strings are treated as NA.
csv_na_options = {'keep_default_na': False, 'na_values': [""]}
surveys_df = pd.read_csv('data/surveys.csv', **csv_na_options)
species_df = pd.read_csv('data/species.csv', **csv_na_options)

print(species_df)

   species_id             genus          species     taxa
0          AB        Amphispiza        bilineata     Bird
1          AH  Ammospermophilus          harrisi   Rodent
2          AS        Ammodramus       savannarum     Bird
3          BA           Baiomys          taylori   Rodent
4          CB   Campylorhynchus  brunneicapillus     Bird
5          CM       Calamospiza      melanocorys     Bird
6          CQ        Callipepla         squamata     Bird
7          CS          Crotalus       scutalatus  Reptile
8          CT     Cnemidophorus           tigris  Reptile
9          CU     Cnemidophorus        uniparens  Reptile
10         CV          Crotalus          viridis  Reptile
11         DM         Dipodomys         merriami   Rodent
12         DO         Dipodomys            ordii   Rodent
13         DS         Dipodomys      spectabilis   Rodent
14         DX         Dipodomys              sp.   Rodent
15         EO           Eumeces        obsoletus  Reptile
16         GS 

# Concatenating Data Frames

## We can use the `concat` function in pandas to append either columns or rows from one dataframe to another

In [18]:
# Take the first 10 rows of the surveys table
survey_sub = surveys_df.iloc[:10]
print('survey sub\n', survey_sub)

# Take the last 10 rows; these still carry their original row labels
last_rows = surveys_df.iloc[-10:]
print('survey sub last 10\n', last_rows)

# Renumber the index of the second dataframe from 0 so that it appends properly.
# drop=True discards the old index values (the original row numbers) instead of
# keeping them as a new column.
survey_sub_last10 = last_rows.reset_index(drop=True)
print('survey sub last 10 with new index\n', survey_sub_last10)

survey sub
    record_id  month  day  year  plot_id species_id sex  hindfoot_length  \
0          1      7   16  1977        2         NL   M             32.0   
1          2      7   16  1977        3         NL   M             33.0   
2          3      7   16  1977        2         DM   F             37.0   
3          4      7   16  1977        7         DM   M             36.0   
4          5      7   16  1977        3         DM   M             35.0   
5          6      7   16  1977        1         PF   M             14.0   
6          7      7   16  1977        2         PE   F              NaN   
7          8      7   16  1977        1         DM   M             37.0   
8          9      7   16  1977        1         DM   F             34.0   
9         10      7   16  1977        6         PF   F             20.0   

   weight  
0     NaN  
1     NaN  
2     NaN  
3     NaN  
4     NaN  
5     NaN  
6     NaN  
7     NaN  
8     NaN  
9     NaN  
survey sub last 10
        rec

In [19]:
# When we concatenate dataframes we need to specify the axis to stack along.
## axis=0 tells pandas to stack the second dataframe under the first one;
### make sure both dataframes have the same columns and the same column
### formats in both datasets.
## axis=1 places the columns of the second dataframe to the right of the first;
### rows are matched on their index labels, so only stack horizontally when
### pairing up those rows actually makes sense.

# Stack dataframes on top of each other.
# Each input keeps its own row labels, so index values repeat in the result.
vertical_stack = pd.concat([survey_sub, survey_sub_last10], axis=0)
print('vertical stack\n', vertical_stack)

# Place dataframes side by side.
# axis=1 must be given explicitly: pd.concat defaults to axis=0, which would
# just produce a second vertical stack.
horizontal_stack = pd.concat([survey_sub, survey_sub_last10], axis=1)

vertical stack
    record_id  month  day  year  plot_id species_id  sex  hindfoot_length  \
0          1      7   16  1977        2         NL    M             32.0   
1          2      7   16  1977        3         NL    M             33.0   
2          3      7   16  1977        2         DM    F             37.0   
3          4      7   16  1977        7         DM    M             36.0   
4          5      7   16  1977        3         DM    M             35.0   
5          6      7   16  1977        1         PF    M             14.0   
6          7      7   16  1977        2         PE    F              NaN   
7          8      7   16  1977        1         DM    M             37.0   
8          9      7   16  1977        1         DM    F             34.0   
9         10      7   16  1977        6         PF    F             20.0   
0      35540     12   31  2002       15         PB    F             26.0   
1      35541     12   31  2002       15         PB    F             24.0