In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import random

%matplotlib inline 

In [2]:
# Load the exported Lobelia spicata observations and preview the first rows.
df_spicata = pd.read_csv("../data/observations_spicata.csv")
df_spicata.head(n=5)

Unnamed: 0,id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,user_name,created_at,updated_at,...,taxon_supertribe_name,taxon_tribe_name,taxon_subtribe_name,taxon_genus_name,taxon_genushybrid_name,taxon_species_name,taxon_hybrid_name,taxon_subspecies_name,taxon_variety_name,taxon_form_name
0,82406,"May 20, 2012 16:44",2012-05-20,2012-05-20 21:44:00 UTC,Central Time (US & Canada),604,eric_hunt,Eric Hunt,2012-05-23 22:43:55 UTC,2020-12-09 10:50:50 UTC,...,,,,Lobelia,,Lobelia spicata,,,,
1,82408,"May 20, 2012 16:19",2012-05-20,2012-05-20 21:19:00 UTC,Central Time (US & Canada),604,eric_hunt,Eric Hunt,2012-05-23 22:46:36 UTC,2020-12-09 10:50:51 UTC,...,,,,Lobelia,,Lobelia spicata,,,,
2,87039,Sun Jun 03 2012 09:52:31 GMT-0400 (EDT),2012-06-03,2012-06-03 13:52:31 UTC,Eastern Time (US & Canada),477,loarie,Scott Loarie,2012-06-04 05:25:45 UTC,2019-07-02 19:37:38 UTC,...,,,,Lobelia,,Lobelia spicata,,,,
3,92772,"June 16, 2012 05:40",2012-06-16,2012-06-16 12:40:00 UTC,Pacific Time (US & Canada),477,loarie,Scott Loarie,2012-06-18 18:04:31 UTC,2015-10-08 14:36:10 UTC,...,,,,Lobelia,,Lobelia spicata,,,,
4,195645,2008-07-06,2008-07-06,,Eastern Time (US & Canada),12610,susanelliott,Susan Elliott,2013-02-10 18:04:29 UTC,2020-02-19 21:15:23 UTC,...,,,,Lobelia,,Lobelia spicata,,,,


In [3]:
# Summarize the columns, non-null counts and dtypes of the spicata observations.
df_spicata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4547 entries, 0 to 4546
Data columns (total 67 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                4547 non-null   int64  
 1   observed_on_string                4538 non-null   object 
 2   observed_on                       4538 non-null   object 
 3   time_observed_at                  4437 non-null   object 
 4   time_zone                         4547 non-null   object 
 5   user_id                           4547 non-null   int64  
 6   user_login                        4547 non-null   object 
 7   user_name                         3079 non-null   object 
 8   created_at                        4547 non-null   object 
 9   updated_at                        4547 non-null   object 
 10  quality_grade                     4547 non-null   object 
 11  license                           3489 non-null   object 
 12  url   

In [4]:
# Number of spicata observations with a positional accuracy of 30 m or less.
# Rows with a missing positional accuracy compare as False and are not counted.
df_spicata["positional_accuracy"].le(30).sum()

2304

In [5]:
# Unique IDs of the users who posted a spicata observation with a
# positional accuracy of 30 or less, followed by how many such users exist.
is_accurate = df_spicata["positional_accuracy"] <= 30
spicata_users = df_spicata.loc[is_accurate, "user_id"].unique().tolist()
len(spicata_users)

1317

In [6]:
# First batch of 10 user IDs, chosen at random using random.shuffle.
# ALL HAVE BEEN INCLUDED.
randomized_users10 = [
    635041,
    2588524,
    923056,
    18434,
    318468,
    1549697,
    3583533,
    1679129,
    2047965,
    2570804,
]

In [7]:
# Take the first batch of sampled users out of the candidate pool and show
# how many candidates remain.
# A set-based filter is used instead of list.remove() so that the builtin
# `id` is not shadowed by a loop variable and the cell can be re-run safely:
# IDs that are no longer in the pool are simply skipped rather than raising
# ValueError.
already_sampled = set(randomized_users10)
spicata_users = [user for user in spicata_users if user not in already_sampled]
len(spicata_users)

1307

In [8]:
# Second batch of 10 random user IDs. HAS BEEN INCLUDED.
next10 = [
    787855,
    1892152,
    542981,
    656158,
    2248142,
    780600,
    4659453,
    2336149,
    3512034,
    1773265,
]

In [10]:
# Read the per-user CSV exports (spicata_0001.csv ... spicata_0020.csv) and
# stack them into one data frame.
# The file number is zero-padded to four digits by the format spec, so no
# per-range branching is needed, and the frames are concatenated in a single
# call instead of growing the frame inside the loop (which re-copies all
# previously loaded rows on every iteration).
N_USER_FILES = 20
user_files = [f"../data/spicata_{number:04d}.csv" for number in range(1, N_USER_FILES + 1)]

# ignore_index=True yields a fresh 0..n-1 index, equivalent to
# reset_index(drop=True) on the stacked result.
data_sp = pd.concat([pd.read_csv(path) for path in user_files], ignore_index=True)

  data_sp = pd.concat([data_sp, pd.read_csv(file)])


In [11]:
# Double-checking that the combined set has the expected number of users
# (one per CSV file that was read in).
data_sp["user_id"].nunique()

20

In [12]:
# Summarize the columns, non-null counts and dtypes of the combined per-user data.
data_sp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24056 entries, 0 to 24055
Data columns (total 67 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                24056 non-null  int64  
 1   observed_on_string                23892 non-null  object 
 2   observed_on                       23892 non-null  object 
 3   time_observed_at                  23335 non-null  object 
 4   time_zone                         24056 non-null  object 
 5   user_id                           24056 non-null  int64  
 6   user_login                        24056 non-null  object 
 7   user_name                         19869 non-null  object 
 8   created_at                        24056 non-null  object 
 9   updated_at                        24056 non-null  object 
 10  quality_grade                     24056 non-null  object 
 11  license                           22470 non-null  object 
 12  url 

### Dropping Columns and Rows

In [13]:
# Columns of interest; every other column is discarded from data_sp.
to_keep = [
    "id",
    "time_observed_at",
    "user_id",
    "created_at",
    "quality_grade",
    "num_identification_agreements",
    "num_identification_disagreements",
    "captive_cultivated",
    "latitude",
    "longitude",
    "positional_accuracy",
    "public_positional_accuracy",
    "geoprivacy",
    "taxon_geoprivacy",
    "coordinates_obscured",
    "species_guess",
    "scientific_name",
    "common_name",
    "taxon_kingdom_name",
    "taxon_genus_name",
    "taxon_species_name",
]
data_sp = data_sp[to_keep]

In [14]:
# Remove rows that have no observation timestamp, then report how many
# null timestamps are left (expected: 0).
data_sp.dropna(subset=["time_observed_at"], inplace=True)
remaining_null_times = int(data_sp["time_observed_at"].isna().sum())
print(f'Number of null time_observed_at entries = {remaining_null_times}')

Number of null time_observed_at entries = 0


In [15]:
# Drop rows with a missing latitude, then confirm that none remain.
# The confirmation message now names the column that is actually checked
# (it previously said "time_observed_at", copied from the preceding cell).
# NOTE(review): only latitude is filtered here; the data_sp.info() output
# further down shows one longitude value still missing. Confirm whether that
# row should be dropped as well.
data_sp.dropna(subset=['latitude'], inplace=True)
print(f'Number of null latitude entries = {data_sp["latitude"].isnull().sum()}')

Number of null time_observed_at entries = 0


### Dummy and Boolean

In [16]:
# Inspect the values present in geoprivacy before converting it to a 0/1 flag.
data_sp["geoprivacy"].value_counts() # to boolean

obscured    1163
Name: geoprivacy, dtype: int64

In [17]:
# Encode geoprivacy as a 0/1 flag: 1 where the value equals "obscured",
# 0 for anything else (missing values included).
# NOTE: re-running this cell after the conversion maps every row to 0,
# because no value equals "obscured" any more.
data_sp["geoprivacy"] = data_sp["geoprivacy"].eq("obscured").astype("int64")

In [18]:
# Confirm the 0/1 encoding of geoprivacy.
data_sp["geoprivacy"].value_counts()

0    22060
1     1163
Name: geoprivacy, dtype: int64

In [19]:
# Inspect the values present in taxon_geoprivacy before dummy-encoding it.
data_sp["taxon_geoprivacy"].value_counts() # to dummy

open        2829
obscured     159
Name: taxon_geoprivacy, dtype: int64

In [20]:
# One indicator column per taxon_geoprivacy value.
taxon_geoprivacy_dum = pd.get_dummies(data_sp["taxon_geoprivacy"])
# Append the indicator columns to the data set and discard the original
# taxon_geoprivacy column in the same step.
data_sp = pd.concat([data_sp, taxon_geoprivacy_dum], axis=1).drop(columns=["taxon_geoprivacy"])


In [21]:
# Inspect the kingdom distribution; dummy columns will be made for
# Animalia, Plantae and Fungi.
data_sp["taxon_kingdom_name"].value_counts() # dummy Animalia, Plantae and Fungi

Animalia     11994
Plantae      10011
Fungi         1137
Protozoa        18
Chromista       10
Bacteria         3
Viruses          1
Name: taxon_kingdom_name, dtype: int64

In [22]:
# Dummy-encode the kingdom, keeping indicator columns only for Animalia,
# Fungi and Plantae; rows of any other kingdom get 0 in all three.
all_kingdom_dummies = pd.get_dummies(data_sp["taxon_kingdom_name"])
kingdom_dum = all_kingdom_dummies[["Animalia", "Fungi", "Plantae"]]
# Append the indicators and discard the original kingdom column.
data_sp = pd.concat([data_sp, kingdom_dum], axis=1).drop(columns=["taxon_kingdom_name"])

In [23]:
# Review columns, non-null counts and dtypes after dropping rows and encoding.
data_sp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23223 entries, 0 to 24055
Data columns (total 24 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                23223 non-null  int64  
 1   time_observed_at                  23223 non-null  object 
 2   user_id                           23223 non-null  int64  
 3   created_at                        23223 non-null  object 
 4   quality_grade                     23223 non-null  object 
 5   num_identification_agreements     23223 non-null  int64  
 6   num_identification_disagreements  23223 non-null  int64  
 7   captive_cultivated                23223 non-null  bool   
 8   latitude                          23223 non-null  float64
 9   longitude                         23222 non-null  float64
 10  positional_accuracy               19142 non-null  float64
 11  public_positional_accuracy        19735 non-null  float64
 12  geop

### Imputation

In [24]:
# Categorical columns that may contain missing values: replace the nulls
# with the placeholder "not stated", then confirm that none are left.
cat_with_null = [
    "species_guess",
    "scientific_name",
    "common_name",
    "taxon_genus_name",
    "taxon_species_name",
]
data_sp[cat_with_null] = data_sp[cat_with_null].fillna("not stated")
remaining_cat_nulls = int(data_sp[cat_with_null].isna().sum().sum())
print(f'Number of null entries in stated columns = {remaining_cat_nulls}')

Number of null entries in stated columns = 0


In [25]:
# Summary statistics of both accuracy columns (2 decimals) before imputation.
data_sp[["positional_accuracy", "public_positional_accuracy"]].describe().round(2)

Unnamed: 0,positional_accuracy,public_positional_accuracy
count,19142.0,19735.0
mean,466.6,2244.71
std,21983.79,22657.92
min,1.0,1.0
25%,5.0,5.0
50%,8.0,10.0
75%,20.0,30.0
max,2412154.0,2412154.0


I chose to impute the mean into the null values of both of these columns. There are relatively few null values (4,081 of 23,223 rows for `positional_accuracy` and 3,488 for `public_positional_accuracy`), and imputing the mean will at least preserve each column's mean. Additionally, imputing a value near the median would risk placing those rows within the range of positional accuracies that are considered good for research, whereas sticking with the mean signals "poor" positional accuracy, which is the better assumption to make.

In [26]:
# Fill the missing values of each accuracy column with that column's mean.
# Both means are computed up front, before any value is filled in.
PA_mean = data_sp["positional_accuracy"].mean()
PPA_mean = data_sp["public_positional_accuracy"].mean()
for accuracy_col, col_mean in (
    ("positional_accuracy", PA_mean),
    ("public_positional_accuracy", PPA_mean),
):
    data_sp[accuracy_col] = data_sp[accuracy_col].fillna(col_mean)

In [27]:
# Summary statistics of both accuracy columns (2 decimals) after imputation.
data_sp[["positional_accuracy", "public_positional_accuracy"]].describe().round(2)

Unnamed: 0,positional_accuracy,public_positional_accuracy
count,23223.0,23223.0
mean,466.6,2244.71
std,19958.83,20887.09
min,1.0,1.0
25%,5.0,5.0
50%,10.0,10.0
75%,159.5,247.5
max,2412154.0,2412154.0


In [28]:
# Parse both timestamp columns into pandas datetimes.
for timestamp_col in ("time_observed_at", "created_at"):
    data_sp[timestamp_col] = pd.to_datetime(data_sp[timestamp_col])

In [29]:
# Confirm the dtypes after the datetime conversion.
data_sp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23223 entries, 0 to 24055
Data columns (total 24 columns):
 #   Column                            Non-Null Count  Dtype              
---  ------                            --------------  -----              
 0   id                                23223 non-null  int64              
 1   time_observed_at                  23223 non-null  datetime64[ns, UTC]
 2   user_id                           23223 non-null  int64              
 3   created_at                        23223 non-null  datetime64[ns, UTC]
 4   quality_grade                     23223 non-null  object             
 5   num_identification_agreements     23223 non-null  int64              
 6   num_identification_disagreements  23223 non-null  int64              
 7   captive_cultivated                23223 non-null  bool               
 8   latitude                          23223 non-null  float64            
 9   longitude                         23222 non-null  float64    

In [30]:
# Number of rows in the combined set whose species is Lobelia spicata.
data_sp["taxon_species_name"].eq("Lobelia spicata").sum()

41