## Code Snippets for Data Preprocessing



In [24]:
# Imports
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt


In [3]:
# Read the raw data
dfRaw = pd.read_csv('/Users/daniellemckenney/Programming/erasmusCourses/ML/mlProjectsBirbs/mlProjectBirds/dataset.csv', sep="\t", on_bad_lines="warn",parse_dates=['eventDate'])

In [4]:
# Print some useful stats: 
print(dfRaw.info())
print(dfRaw.describe())
print(dfRaw.shape)
print(dfRaw.columns)
# Number of unique locations
print(dfRaw['locality'].value_counts())
# All entries should be within Spain
print(dfRaw['countryCode'].value_counts())
# No data is missing in the relevant columns:
print(dfRaw.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753579 entries, 0 to 753578
Data columns (total 50 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   gbifID                            753579 non-null  int64         
 1   datasetKey                        753579 non-null  object        
 2   occurrenceID                      753579 non-null  object        
 3   kingdom                           753579 non-null  object        
 4   phylum                            753579 non-null  object        
 5   class                             753579 non-null  object        
 6   order                             753579 non-null  object        
 7   family                            753579 non-null  object        
 8   genus                             753579 non-null  object        
 9   species                           753579 non-null  object        
 10  infraspecificEpithet            

NameError: name 'df' is not defined

In [5]:
# Function Definitions
def cleanRawData(rawData, relevantColumns):
    # returns a dataframe with only the relevent columns and with invalid location entries removed
    df = rawData.filter(items=relevantColumns)
    df = df[(df['decimalLatitude'] != 0) & (df['decimalLongitude'] != 0)]
    return df

def fillInCounts0(data, dateCombination):
    # Fill in count=0 in the grouped summary
    # Get all unique combinations of the date combination
    existing_combinations = data[dateCombination].drop_duplicates()

    # Create a DataFrame with all unique species
    all_species_combinations = pd.DataFrame({'species': data['species'].unique()})

    # Cross join (cartesian product) to get all combinations of 'species', 'eventDate', and 'locality'
    all_combinations = pd.merge(existing_combinations.assign(key=1), all_species_combinations.assign(key=1), on='key').drop('key', axis=1)

    # Merge with the original DataFrame to get counts
    on = dateCombination.append('species')
    result_df = pd.merge(all_combinations, data, on=on, how='left').fillna(0)
    return result_df

In [35]:
relevantColumns = ['species','locality','decimalLatitude', 'decimalLongitude', 'eventDate']

df = cleanRawData(dfRaw, relevantColumns)

def groupByDates(df):
    # Group by 'species', 'eventDate', and 'locality' and count the rows in each group
    grouped_eventDate_df = df.groupby(['species', 'eventDate', 'locality']).size().reset_index(name='count')
    # Fill in count=0 on days where a locality was visited but no entry was made for that species
    grouped_eventDate_df = fillInCounts0(grouped_eventDate_df, ['eventDate', 'locality'])
    # print(grouped_eventDate_df[10:])

    grouped_eventDate_df['weekOfYear'] = grouped_eventDate_df['eventDate'].dt.isocalendar().week
    grouped_eventDate_df['month'] = grouped_eventDate_df['eventDate'].dt.month
    grouped_eventDate_df['quarter'] = grouped_eventDate_df['eventDate'].dt.to_period('Q')
    grouped_eventDate_df['year'] = grouped_eventDate_df['eventDate'].dt.year

    grouped_weekOfYear_df = grouped_eventDate_df.groupby(['species', 'weekOfYear', 'year', 'locality']).size().reset_index(name='count')
    grouped_weekOfYear_df = fillInCounts0(grouped_eventDate_df, ['weekOfYear', 'year', 'locality'])

    grouped_month_df = grouped_eventDate_df.groupby(['species', 'month', 'year', 'locality']).size().reset_index(name='count')
    grouped_weekOfYear_df = fillInCounts0(grouped_eventDate_df, ['month', 'year', 'locality'])

    grouped_quarter_df = grouped_eventDate_df.groupby(['species', 'quarter', 'locality']).size().reset_index(name='count')
    grouped_weekOfYear_df = fillInCounts0(grouped_eventDate_df, ['quarter', 'locality'])

    grouped_year_df = grouped_eventDate_df.groupby(['species', 'year', 'locality']).size().reset_index(name='count')
    grouped_weekOfYear_df = fillInCounts0(grouped_eventDate_df, ['year', 'locality'])

    return grouped_eventDate_df, grouped_weekOfYear_df, grouped_month_df, grouped_quarter_df, grouped_year_df

# print(grouped_weekOfYear_df[10:])
# print(grouped_month_df[10:])
# print(grouped_quarter_df[10:])
# print(grouped_year_df[10:])

In [36]:
chiffChaff=  df[(df['species'] =='Phylloscopus collybita')]
eD, w,m,q,y = groupByDates(chiffChaff)
y


       eventDate             locality                 species  count
10    1962-04-19          Castromocho  Phylloscopus collybita      2
11    1962-10-25        Casa de campo  Phylloscopus collybita      7
12    1962-10-29        Casa de campo  Phylloscopus collybita      1
13    1962-10-29              Pechina  Phylloscopus collybita      1
14    1962-10-30             Paiporta  Phylloscopus collybita      1
...          ...                  ...                     ...    ...
72403 2019-08-17   Florida de liebana  Phylloscopus collybita      1
72404 2019-08-18          Congostrina  Phylloscopus collybita      1
72405 2019-08-29     PLAYA DE LOS 600  Phylloscopus collybita      2
72406 2019-09-01          Congostrina  Phylloscopus collybita      1
72407 2019-09-03  Arroyo del infierno  Phylloscopus collybita      1

[72398 rows x 4 columns]


Unnamed: 0,species,year,locality,count
0,Phylloscopus collybita,1960,Chipiona,2
1,Phylloscopus collybita,1962,Casa de campo,7
2,Phylloscopus collybita,1962,Castromocho,4
3,Phylloscopus collybita,1962,Gergal,1
4,Phylloscopus collybita,1962,Paiporta,2
...,...,...,...,...
16102,Phylloscopus collybita,2019,Valencia,3
16103,Phylloscopus collybita,2019,Valladolid,1
16104,Phylloscopus collybita,2019,Villacaparra,3
16105,Phylloscopus collybita,2019,Villalonga,1


In [38]:
chiffChaff=  df[(df['species'] =='Turdus merula')]
eD, w,m,q,y = groupByDates(chiffChaff)
y

       eventDate              locality        species  count
10    1961-02-22              Juslibol  Turdus merula      1
11    1961-07-10     Huerto del venado  Turdus merula      1
12    1961-07-11     Huerto del venado  Turdus merula      1
13    1961-07-20     Puente de segovia  Turdus merula      1
14    1961-07-20           Río valsaín  Turdus merula      1
...          ...                   ...            ...    ...
84219 2019-09-14         Fuentelencina  Turdus merula      2
84220 2019-09-14                  Noja  Turdus merula      1
84221 2019-09-14            Puenteceso  Turdus merula      6
84222 2019-09-15                  Noja  Turdus merula      1
84223 2019-09-16  P. NAT. DE GRAZALEMA  Turdus merula      1

[84214 rows x 4 columns]


Unnamed: 0,species,year,locality,count
0,Turdus merula,1951,Elizondo,1
1,Turdus merula,1953,Villarreal de urrenchua,1
2,Turdus merula,1960,Astigarraga,1
3,Turdus merula,1960,Burjasot,1
4,Turdus merula,1960,Garriguella,1
...,...,...,...,...
20508,Turdus merula,2019,Villamanta,5
20509,Turdus merula,2019,Villamantilla,4
20510,Turdus merula,2019,Villasandino,1
20511,Turdus merula,2019,Zapata,5


In [37]:
eD, w,m,q,y = groupByDates(df)
y

        eventDate      locality                 species  count
10     1964-08-30       Milagro           Turdus merula    0.0
11     1964-08-30       Milagro       Turdus philomelos    0.0
12     1964-09-15  Alcantarilla         Merops apiaster    1.0
13     1964-09-15  Alcantarilla  Phylloscopus collybita    0.0
14     1964-09-15  Alcantarilla           Turdus merula    0.0
...           ...           ...                     ...    ...
592987 2019-08-13    Villamanta       Turdus philomelos    1.0
592988 2019-09-09      Anguiano         Merops apiaster    0.0
592989 2019-09-09      Anguiano  Phylloscopus collybita    0.0
592990 2019-09-09      Anguiano           Turdus merula    0.0
592991 2019-09-09      Anguiano       Turdus philomelos    1.0

[592982 rows x 4 columns]


Unnamed: 0,species,year,locality,count
0,Merops apiaster,1951,Elizondo,1
1,Merops apiaster,1953,Villarreal de urrenchua,1
2,Merops apiaster,1959,San sebastian,1
3,Merops apiaster,1960,Astigarraga,1
4,Merops apiaster,1960,Bajamar,1
...,...,...,...,...
115751,Turdus philomelos,2019,Villamanta,6
115752,Turdus philomelos,2019,Villamantilla,4
115753,Turdus philomelos,2019,Villasandino,2
115754,Turdus philomelos,2019,Zapata,7


In [30]:
# Explor the data grouped by week for chiff chaffs
chiffChaffWeek=  grouped_weekOfYear_df[(grouped_weekOfYear_df['species'] =='Phylloscopus collybita')]

canalVell =  chiffChaffWeek[(chiffChaffWeek['locality'] =="Canal vell")]

# View the locations where there are more than 600 entries
locals =chiffChaffWeek['locality'].value_counts()
filtered_locals = locals[locals > 600]
filtered_locals



locality
Canal vell                               1656
Traibuenas                               1410
Montiver                                 1364
Manecorro                                1002
Laguna de san juan                        972
Parque el alamillo                        898
P. NAT. EL HONDO                          872
Estanys jorda                             857
P. NAC. DEL ARCHIPIELAGO DE CABRERA       851
Las minas                                 807
Delta del llobregat                       785
Sebes                                     765
P. NAT. DESEMBOCADURA DEL GUADALHORCE     762
RIVAS-VACIAMADRID                         684
Name: count, dtype: int64

In [81]:
print(grouped_weekOfYear_df[10:])

                                                                        eventDate  \
species           weekOfYear year locality                                          
Merops apiaster   1          1960 A sainza                                      0   
                                  A xunqueira                                   0   
                                  A xunqueira de alba                           0   
                                  ACEBUCHAL-P. NAT. MASISMAS DEL ODIEL          0   
                                  AIGUABARREIG TER-BRUGENT                      0   
...                                                                           ...   
Turdus philomelos 53         2016 Zumarraga                                     0   
                                  Zumaya                                        0   
                                  Zurbano                                       0   
                                  Zuriain                        

In [25]:
# Checking locality vs coordinates
# filtered_df = df[df['locality'].isin(['Localidad confidencial'])]
# print(filtered_df.filter(items=['locality', 'decimalLatitude','decimalLongitude', 'species', 'eventDate']))
# It looks like all of the coordinates of the same locality match

In [28]:
# Heatmap
gdf = gpd.GeoDataFrame(df, 
                       geometry=gpd.points_from_xy(df['decimalLongitude'], df['decimalLatitude']),
                       crs='EPSG:4326')  # Assuming WGS 84 coordinate reference system

spain = gpd.read_file('/Users/daniellemckenney/Programming/erasmusCourses/ML/mlProjectsBirbs/mlProjectBirds/shapefile.shp')
species_spain = gpd.sjoin(gdf, spain, how='inner', op='within')
heatmap_data = species_spain.groupby(['species', 'NAME']).size().unstack(fill_value=0)
fig, ax = plt.subplots(figsize=(12, 8))
heatmap_data.plot(kind='bar', stacked=True, ax=ax)
plt.title('Species Distribution in Spain')
plt.xlabel('Species')
plt.ylabel('Frequency')
plt.show()



  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  species_spain = gpd.sjoin(gdf, spain, how='inner', op='within')


KeyError: 'NAME'