In [3]:
# Indexing, slicing, and subsetting dataframes in Python

# Loading our data
import pandas as pd
import os

# Project root, i.e. the directory that contains `data/surveys.csv`.
# Set the SW_ECOLOGY_LESSON_DIR environment variable to point at your own
# checkout; the fallback is the path this notebook was originally written with.
PROJECT_DIR = os.environ.get(
    'SW_ECOLOGY_LESSON_DIR',
    '/home/meelyn/Documents/cloud_development/sw-python-ecology-lesson',
)

# Path of the survey data, relative to the project root
SURVEYS_CSV = os.path.join('data', 'surveys.csv')

# getting working directory
cwd = os.getcwd()
print(cwd) # the kernel may start somewhere other than the project dir (it was originally src)

# set working directory, but only when the data file is not already reachable
# from the current one - this keeps the cell safe to run more than once
if not os.path.exists(SURVEYS_CSV):
    os.chdir(PROJECT_DIR)

# Read in the survey CSV
surveys_df = pd.read_csv(SURVEYS_CSV)

/home/meelyn/Documents/cloud_development/sw-python-ecology-lesson


In [8]:
# Indexing and slicing

## a dataframe can be indexed and sliced by label (column heading), by numeric range, or by specific x,y index location

# Label-based selection: square brackets [] pull a subset out of a python object
head_rows = surveys_df.head() # first 5 rows
print('surveys head\n', head_rows)

species_ids = surveys_df['species_id'] # just the species_id column
print('species only\n', species_ids)


surveys head
    record_id  month  day  year  plot_id species_id sex  hindfoot_length  \
0          1      7   16  1977        2         NL   M             32.0   
1          2      7   16  1977        3         NL   M             33.0   
2          3      7   16  1977        2         DM   F             37.0   
3          4      7   16  1977        7         DM   M             36.0   
4          5      7   16  1977        3         DM   M             35.0   

   weight  
0     NaN  
1     NaN  
2     NaN  
3     NaN  
4     NaN  
species only
 0         NL
1         NL
2         DM
3         DM
4         DM
        ... 
35544     AH
35545     AH
35546     RM
35547     DO
35548    NaN
Name: species_id, Length: 35549, dtype: object


In [14]:
# can select multiple columns, useful to reorganize data columns
# NOTE: a cell only displays its last expression, so only the second selection below is shown
## select the species and plot columns from the dataframe
surveys_df[['species_id', 'plot_id']] # outer brackets do the selecting; the inner brackets are a list of column names

surveys_df[['plot_id', 'species_id']] # columns come back in the order listed, so plot_id is first here

# surveys_df['speciess'] # raises a KeyError because no column has that name

Unnamed: 0,plot_id,species_id
0,2,NL
1,3,NL
2,2,DM
3,7,DM
4,3,DM
...,...,...
35544,15,AH
35545,15,AH
35546,10,RM
35547,7,DO


In [15]:
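# python has reserved keywords that cannot be used as variable names, e.g.:
## True, False, and, or, not
# built-in names such as list are NOT reserved, but should still be avoided as variable names because reusing them hides the built-in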

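# also remember that list() is the built-in constructor for python lists (ordered, mutable sequences - the closest built-in analogue to an array)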

In [20]:
# Extracting Range based Subsets: Slicing

# python uses 0-based indexing - first element in an object is at position 0

# create a list of numbers:
a = [1, 2, 3, 4, 5]
a[0] # 1 (first element)
a[2] # 3 (third element)
a[4] # 5 (last element; the only value displayed, because a cell shows just its final expression)

# a[len(a)] # raises an IndexError: len(a) is 5, but the valid positions are 0 through 4

5

In [24]:
# Slicing a subset of rows in python
# NOTE: only the last expression in the cell is displayed, so just the result of surveys_df[-1:] appears in the output
surveys_df[0:3] # slicing includes the start bound (0) but does NOT include the stop bound (3), so this slice gets positions 0,1,2

# select first 5 rows (positions 0,1,2,3,4)
surveys_df[0:5] # remember that stop bound is NOT included!

# select the last row of the dataframe
surveys_df[-1:] # slice starts at the last row and runs to the end of the dataframe

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
35548,35549,12,31,2002,5,,,,


In [27]:
# Copying objects vs referencing objects in python

# Plain assignment with '=' does not copy anything:
# ref_surveys_df becomes a second name for the very same object as surveys_df
ref_surveys_df = surveys_df

# The copy() method builds a true copy, i.e. a separate dataframe object
true_copy_surveys_df = surveys_df.copy()

# Show the copy first, then the reference
print(true_copy_surveys_df)
print(ref_surveys_df)

       record_id  month  day  year  plot_id species_id  sex  hindfoot_length  \
0              1      7   16  1977        2         NL    M             32.0   
1              2      7   16  1977        3         NL    M             33.0   
2              3      7   16  1977        2         DM    F             37.0   
3              4      7   16  1977        7         DM    M             36.0   
4              5      7   16  1977        3         DM    M             35.0   
...          ...    ...  ...   ...      ...        ...  ...              ...   
35544      35545     12   31  2002       15         AH  NaN              NaN   
35545      35546     12   31  2002       15         AH  NaN              NaN   
35546      35547     12   31  2002       10         RM    F             15.0   
35547      35548     12   31  2002        7         DO    M             36.0   
35548      35549     12   31  2002        5        NaN  NaN              NaN   

       weight  
0         NaN  
1      

In [30]:
# Assign value '0' to the first three rows of data in dataframe
# ref_surveys_df was bound to the same object as surveys_df with '=', so the change is visible through both names
ref_surveys_df[0:3] = 0
print(ref_surveys_df)
print(surveys_df) # both ref_surveys_df and the original surveys_df now show 0 in rows 0-2, because they are one and the same object

       record_id  month  day  year  plot_id species_id  sex  hindfoot_length  \
0              0      0    0     0        0          0    0              0.0   
1              0      0    0     0        0          0    0              0.0   
2              0      0    0     0        0          0    0              0.0   
3              4      7   16  1977        7         DM    M             36.0   
4              5      7   16  1977        3         DM    M             35.0   
...          ...    ...  ...   ...      ...        ...  ...              ...   
35544      35545     12   31  2002       15         AH  NaN              NaN   
35545      35546     12   31  2002       15         AH  NaN              NaN   
35546      35547     12   31  2002       10         RM    F             15.0   
35547      35548     12   31  2002        7         DO    M             36.0   
35548      35549     12   31  2002        5        NaN  NaN              NaN   

       weight  
0         0.0  
1      

In [32]:
# create a clean dataframe from the original data csv file
# re-reading binds surveys_df to a brand-new object; ref_surveys_df still refers to the dataframe whose first three rows were set to 0
surveys_df = pd.read_csv('data/surveys.csv')

In [None]:
# Slicing subsets of rows and columns in python

