In [1]:
# Data types and formats

## pandas has two main types of data: text and numerics

# numeric data types: integers and floats
## integers = whole numbers
## floats = decimals

# text - aka string in Python or object in pandas (this won't get confusing)

# pandas and base Python use slightly different names for data types:

# pandas type   |  base python                         |  notes 
#----------------------------------------------------------------
# object        |  string                              |  most general
# int64         |  int                                 |  64 refers to memory (64 bits) allocated to hold character
# float64       |  float                               |  contains numbers and NaNs, pandas will default to float64
# datetime64    |  n/a (see datetime module in python) |
# timedelta[ns] |  n/a (see datetime module in python) |



In [3]:
# Checking the format of our data

import pandas as pd
import os as os

# pd.set_option('display.max_columns', 10)
# pd.set_option('display.max_rows', 10)

# getting working directory
cwd = os.getcwd() 
print(cwd) # was originally set to src, had to change it to project dir

# set working directory
os.chdir('/home/meelyn/Documents/cloud_development/sw-python-ecology-lesson') # check and see before running this again

surveys_df = pd.read_csv('data/surveys.csv')

type(surveys_df) # returns pandas.core.frame.DataFrame

/home/meelyn/Documents/cloud_development/sw-python-ecology-lesson/src


pandas.core.frame.DataFrame

In [6]:
# checking individual column types
surveys_df['sex'].dtype # returns dtype('0') which stands for object, which in pandas (and everything else) is a string (text)

dtype('O')

In [8]:
surveys_df['record_id'].dtype # returns dtype('int64') which stands for integer (64) bit

dtype('int64')

In [10]:
# view data types for all columns in data frame
surveys_df.dtypes

record_id            int64
month                int64
day                  int64
year                 int64
plot_id              int64
species_id          object
sex                 object
hindfoot_length    float64
weight             float64
dtype: object

In [12]:
# Working with Integers and Floats
print(5+5) # integer
print(24-4) # integer
print(5/9) # float
print(10/3) # float

10
20
0.5555555555555556
3.3333333333333335


In [16]:
# convert a to an integer
a = 7.83
print(int(a))

# convert b to a float
b=7
print(float(b)) # returns 7.0

7
7.0


In [18]:
# working with our survey data
# convert record_id field to floating point values
surveys_df['record_id'] = surveys_df['record_id'].astype('float64') # like dplyr, need to save the modified variable as itself
surveys_df['record_id'].dtype # checking record_id type, it is now dtype('float64')

dtype('float64')

In [23]:
# Challenge - changing types
# convert column plot_id to floats
surveys_df['plot_id'] = surveys_df['plot_id'].astype('float')
print(surveys_df['plot_id'].dtype)

# try converting weight to an integer
print('weight before', surveys_df['weight'].dtype)
surveys_df['weight'] = surveys_df['weight'].astype('int')
print('weight after', surveys_df['weight'].dtype) # pandas cannot convert types from float to int if the column contains NaN values

float64
weight before float64


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [4]:
# missing data values - NaN

## python will calculate mean of variables with NaN in columns
surveys_df['weight'].mean()

42.672428212991356

In [15]:
# Where4 are the NaN's?\
print('number of NaN\'s:', len(surveys_df[surveys_df['weight'].isna()])) # 3266 rows have a weight of NaN

# how many rows have weight values
print('number of rows with weight above 0:', len(surveys_df[surveys_df['weight'] > 0]))

# total number of rows:
print('total number of rows:', len(surveys_df))

print(3266+32283)

number of NaN's: 3266
number of rows with weight above 0: 32283
total number of rows: 35549
35549


In [13]:
# replace all the NaNs with zeroes using the .fillna() method (after making a copy of the data)

df1 = surveys_df.copy() # only way to make a true copy, points to a new reference

# fill all NaN values with 0
df1['weight'] = df1['weight'].fillna(0)

In [16]:
# NaN and 0 yield different analysis results. you get a different mean with 0s instead of NaNs
df1['weight'].mean() # 38.75 instead of 42.67

38.751976145601844

In [18]:
# fill NaN values with any value that we choose. below fills all NaN vlaues with a mean weight for all values

df1['weight'] = surveys_df['weight'].fillna(surveys_df['weight'].mean())
print(df1['weight'])

0        42.672428
1        42.672428
2        42.672428
3        42.672428
4        42.672428
           ...    
35544    42.672428
35545    42.672428
35546    14.000000
35547    51.000000
35548    42.672428
Name: weight, Length: 35549, dtype: float64


In [31]:
# Challenge - Counting
## count the number of missing values per column
for x in surveys_df.columns:
    missing = len(surveys_df[surveys_df[x].isna()])
    # print(missing)
    print('column', x, 'is missing', missing, 'values')


## another solution:
for x in surveys_df.columns:
        missing = len(surveys_df[pd.isnull(surveys_df[x])])
        print('column', x, 'is missing', missing, 'values')

column record_id is missing 0 values
column month is missing 0 values
column day is missing 0 values
column year is missing 0 values
column plot_id is missing 0 values
column species_id is missing 763 values
column sex is missing 2511 values
column hindfoot_length is missing 4111 values
column weight is missing 3266 values
column record_id is missing 0 values
column month is missing 0 values
column day is missing 0 values
column year is missing 0 values
column plot_id is missing 0 values
column species_id is missing 763 values
column sex is missing 2511 values
column hindfoot_length is missing 4111 values
column weight is missing 3266 values


In [33]:
# writing out data to csv
surveys_df = pd.read_csv('data/surveys.csv') # reads csv

df_na = surveys_df.dropna() # create a new dataframe that drops all the NaNs from it

print(df_na)

# write dataframe to csv
df_na.to_csv('data_clean/surveys_complete.csv', index=False) # to_csv is command, 
#index=False is so that pandas doesn't include the index number for each line

       record_id  month  day  year  plot_id species_id sex  hindfoot_length  \
62            63      8   19  1977        3         DM   M             35.0   
63            64      8   19  1977        7         DM   M             37.0   
64            65      8   19  1977        4         DM   F             34.0   
65            66      8   19  1977        4         DM   F             35.0   
66            67      8   19  1977        7         DM   M             35.0   
...          ...    ...  ...   ...      ...        ...  ..              ...   
35540      35541     12   31  2002       15         PB   F             24.0   
35541      35542     12   31  2002       15         PB   F             26.0   
35542      35543     12   31  2002       15         PB   F             27.0   
35546      35547     12   31  2002       10         RM   F             15.0   
35547      35548     12   31  2002        7         DO   M             36.0   

       weight  
62       40.0  
63       48.0  
64 