# Day 3: Exercises from the course "Machine Learning using Python (MLUP01)"

**Sessions**: (3) Introduction to Machine Learning

**Instructor**: Gabriel Rodrigues Palma

**Objective**: This day focuses on the theoretical foundations of machine learning, detailing the application of learning algorithms in preparation for the practical examples in Python

## Importing modules

In [1]:
import numpy as np

## Tutorial for basic NumPy

#### Basic Array Creation
Create arrays from your ecological measurements

In [3]:
# Daily temperature readings (°C)
temperatures = np.array([18.5, 19.2, 17.8, 20.1, 22.3, 21.7])
print(temperatures)
# array([18.5, 19.2, 17.8, 20.1, 22.3, 21.7])

# Access individual measurements
print(f"Day 1 temperature: {temperatures[0]}°C")
# Day 1 temperature: 18.5°C

[18.5 19.2 17.8 20.1 22.3 21.7]
Day 1 temperature: 18.5°C


#### Multi-dimensional Ecological Data
Create species-by-site abundance matrices

In [4]:
# Species abundance data: 3 species across 4 sites
# Rows = species (Oak, Pine, Birch), Columns = sites (A, B, C, D)
species_abundance = np.array([[15, 8, 22, 12],   # Oak counts
                             [6, 18, 4, 20],    # Pine counts  
                             [11, 3, 16, 7]])   # Birch counts

print(species_abundance)
# array([[15,  8, 22, 12],
#        [ 6, 18,  4, 20],
#        [11,  3, 16,  7]])

# Access Pine abundance at site C (row 1, column 2)
pine_site_c = species_abundance[1, 2] 
print(f"Pine count at site C: {pine_site_c}")
# Pine count at site C: 4

[[15  8 22 12]
 [ 6 18  4 20]
 [11  3 16  7]]
Pine count at site C: 4


#### Array Attributes for Ecological Data Analysis
Understanding your ecological dataset structure

In [5]:
# Environmental monitoring data: 5 sensors, 3 days, 4 measurements/day
sensor_data = np.array([[[23.1, 24.2, 25.0, 23.8],  # Sensor 1, Day 1
                        [24.0, 25.1, 26.2, 24.9],   # Sensor 1, Day 2
                        [22.8, 23.9, 24.7, 23.5]],  # Sensor 1, Day 3
                       # ... data for sensors 2-5
                       ])

print(f"Number of dimensions: {sensor_data.ndim}")
print(f"Shape (sensors, days, measurements): {sensor_data.shape}")
print(f"Total measurements: {sensor_data.size}")
print(f"Data type: {sensor_data.dtype}")


Number of dimensions: 3
Shape (sensors, days, measurements): (1, 3, 4)
Total measurements: 12
Data type: float64


#### Creating Standard Ecological Arrays
Initialize Arrays for Data Collection

In [6]:
# Create arrays for field data collection
# 50 empty biodiversity measurements
biodiversity_scores = np.zeros(50)

# 3x10 matrix for species presence/absence (3 transects, 10 quadrats each)
presence_absence = np.zeros((3, 10), dtype=int)

# pH measurements from 1-14 scale, 20 evenly spaced points
ph_range = np.linspace(1.0, 14.0, num=20)
print(ph_range)
# array([ 1.        ,  1.68421053,  2.36842105, ..., 13.31578947, 14.        ])


[ 1.          1.68421053  2.36842105  3.05263158  3.73684211  4.42105263
  5.10526316  5.78947368  6.47368421  7.15789474  7.84210526  8.52631579
  9.21052632  9.89473684 10.57894737 11.26315789 11.94736842 12.63157895
 13.31578947 14.        ]


#### Generate Sampling Protocols

In [7]:
# Random sampling for ecological surveys
rng = np.random.default_rng(42)  # Reproducible results for research

# Random quadrat selection (10 quadrats from 100 possible locations)
quadrat_ids = rng.integers(1, 101, size=10)
print(f"Selected quadrats: {quadrat_ids}")

# Random GPS coordinates for sampling points
latitude = rng.uniform(45.0, 46.0, size=20)  # Lat range for study area
longitude = rng.uniform(-74.0, -73.0, size=20)  # Long range for study area


Selected quadrats: [ 9 78 66 44 44 86  9 70 21 10]


#### Data Operations
Species Richness and Diversity Calculations

In [8]:
# Species abundance data across sites
oak_counts = np.array([15, 8, 22, 12, 18])
pine_counts = np.array([6, 18, 4, 20, 14])
birch_counts = np.array([11, 3, 16, 7, 9])

# Calculate total abundance per site
total_abundance = oak_counts + pine_counts + birch_counts
print(f"Total abundance per site: {total_abundance}")

# Species richness (number of species present)
species_matrix = np.array([oak_counts, pine_counts, birch_counts])
species_presence = species_matrix > 0  # Convert to presence/absence
richness = np.sum(species_presence, axis=0)
print(f"Species richness per site: {richness}")


Total abundance per site: [32 29 42 39 41]
Species richness per site: [3 3 3 3 3]


In [10]:
# Temperature data analysis
daily_temps = np.array([18.5, 19.2, 17.8, 20.1, 22.3, 21.7, 19.9, 18.6])

# Basic statistics
print(f"Mean temperature: {daily_temps.mean():.1f}°C")
print(f"Temperature range: {daily_temps.max() - daily_temps.min():.1f}°C")
print(f"Standard deviation: {daily_temps.std():.1f}°C")

# Identify extreme temperatures
hot_days = daily_temps > 21.0
cold_days = daily_temps < 19.0
print(f"Hot days (>21°C): {daily_temps[hot_days]}")
print(f"Cold days (<19°C): {daily_temps[cold_days]}")

Mean temperature: 19.8°C
Temperature range: 4.5°C
Standard deviation: 1.5°C
Hot days (>21°C): [22.3 21.7]
Cold days (<19°C): [18.5 17.8 18.6]


#### Preparing Data for Machine Learning
Feature Matrix Creation

In [11]:
# Create feature matrix for species distribution modeling
# Features: elevation, temperature, precipitation, aspect
n_locations = 100
rng = np.random.default_rng(42)

elevation = rng.normal(1200, 300, n_locations)
temperature = rng.normal(15, 5, n_locations)  
precipitation = rng.normal(800, 200, n_locations)
aspect = rng.uniform(0, 360, n_locations)

# Combine into feature matrix (samples × features)
features = np.column_stack([elevation, temperature, precipitation, aspect])
print(f"Feature matrix shape: {features.shape}")
print("First 5 locations:")
print(features[:5])

# Create binary response variable (species presence/absence)
# Complex ecological relationship
suitability_score = (elevation - 1000)**2 + (temperature - 18)**2 + \
                   (precipitation - 900)**2
species_presence = (suitability_score < 50000).astype(int)
print(f"Species presence rate: {species_presence.mean():.2%}")


Feature matrix shape: (100, 4)
First 5 locations:
[[1291.41512393   13.10918723  867.51490976   83.72517859]
 [ 888.00476813   21.49614149 1081.49637226  132.30425164]
 [1425.13535874   13.21868014  818.11698138  131.90128194]
 [1482.16941492   18.68757784  928.78775866  117.89840319]
 [ 614.6894434    10.3319116   389.96557979  136.6070687 ]]
Species presence rate: 32.00%


In [12]:
# Split data for machine learning
n_train = int(0.7 * n_locations)  # 70% for training

# Shuffle indices for random sampling
indices = np.arange(n_locations)
rng.shuffle(indices)

train_indices = indices[:n_train]
test_indices = indices[n_train:]

# Training and testing sets
X_train = features[train_indices]
y_train = species_presence[train_indices]
X_test = features[test_indices]
y_test = species_presence[test_indices]

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training presence rate: {y_train.mean():.2%}")


Training set: 70 samples
Test set: 30 samples
Training presence rate: 32.86%
