# Data Analysis of Coral Reef bleaching

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## Load dataset and Inital Exploration

In [3]:
data = pd.read_csv("../Data/Datasets/Reef Check Data Raw.csv")
data.head()

Unnamed: 0,Reef ID,Reef Name,Longitude Degrees,Longitude Minutes,Longitude Seconds,Longitude Cardinal Direction,Latitude Degrees,Latitude Minutes,Latitude Seconds,Latitude Cardinal Direction,...,Year,Date,Depth,Organism Code,S1,S2,S3,S4,Errors?,What errors?
0,103.10.28.1E.10.50.46.1N,Koh Mano (Minor),103.0,10.0,28.1,E,10.0,50.0,46.1,N,...,2003,19-Feb-03,4.5,Bleaching (% of colony),0.0,0.0,0.0,0.0,False,
1,103.10.28.1E.10.50.46.1N,Koh Mano (Minor),103.0,10.0,28.1,E,10.0,50.0,46.1,N,...,2003,19-Feb-03,4.5,Bleaching (% of population),0.0,0.0,0.0,0.0,False,
2,103.11.35.5E.10.49.32N,Koh Mano (south),103.0,11.0,35.5,E,10.0,49.0,32.0,N,...,2003,28-Feb-03,4.5,Bleaching (% of colony),0.0,0.0,0.0,0.0,False,
3,103.11.35.5E.10.49.32N,Koh Mano (south),103.0,11.0,35.5,E,10.0,49.0,32.0,N,...,2003,28-Feb-03,4.5,Bleaching (% of population),0.0,0.0,0.0,0.0,False,
4,103.11.79.5E.10.48.2.7N,Koh Ta Team,103.0,11.0,47.7,E,10.0,48.0,2.7,N,...,2003,24-Feb-03,5.0,Bleaching (% of colony),0.0,0.0,0.0,0.0,False,


In [4]:
data.shape

(18211, 24)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18211 entries, 0 to 18210
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Reef ID                       18211 non-null  object 
 1   Reef Name                     18211 non-null  object 
 2   Longitude Degrees             18165 non-null  float64
 3   Longitude Minutes             18165 non-null  float64
 4   Longitude Seconds             18161 non-null  float64
 5   Longitude Cardinal Direction  18179 non-null  object 
 6   Latitude Degrees              18165 non-null  float64
 7   Latitude Minutes              18165 non-null  float64
 8   Latitude Seconds              18165 non-null  float64
 9   Latitude Cardinal Direction   18179 non-null  object 
 10  Ocean                         18171 non-null  object 
 11  Country                       18211 non-null  object 
 12  State/Province/Island         16258 non-null  object 
 13  C

In [6]:
data.columns

Index(['Reef ID', 'Reef Name', 'Longitude Degrees', 'Longitude Minutes',
       'Longitude Seconds', 'Longitude Cardinal Direction', 'Latitude Degrees',
       'Latitude Minutes', 'Latitude Seconds', 'Latitude Cardinal Direction',
       'Ocean', 'Country', 'State/Province/Island', 'City/Town', 'Year',
       'Date', 'Depth', 'Organism Code', 'S1', 'S2', 'S3', 'S4', 'Errors?',
       'What errors?'],
      dtype='object')

In [7]:
## Number of mising values
data.isnull().sum()

Reef ID                             0
Reef Name                           0
Longitude Degrees                  46
Longitude Minutes                  46
Longitude Seconds                  50
Longitude Cardinal Direction       32
Latitude Degrees                   46
Latitude Minutes                   46
Latitude Seconds                   46
Latitude Cardinal Direction        32
Ocean                              40
Country                             0
State/Province/Island            1953
City/Town                        4308
Year                                0
Date                                0
Depth                               0
Organism Code                       0
S1                                 15
S2                                 22
S3                                137
S4                                211
Errors?                             0
What errors?                    14307
dtype: int64

In [8]:
## Number of duplicated rows
data.duplicated().sum()

0

In [10]:
crit_cols = ['Reef ID', 'Reef Name', 'Longitude Degrees',
       'Longitude Cardinal Direction', 'Latitude Degrees',
       'Latitude Cardinal Direction',
       'Country', 'Depth', 'Organism Code', 'S1', 'S2', 'S3', 'S4']

data = data.dropna(subset=crit_cols)
data = data.drop_duplicates()

## Standardizing Data

In [13]:
data["Latitude"] = data["Latitude Degrees"].astype(str) + " " + data["Latitude Minutes"].astype(str) + " " + data["Latitude Seconds"].astype(str) + " " + data["Latitude Cardinal Direction"].astype(str)

data["Longitude"] = data["Longitude Degrees"].astype(str) + " " + data["Longitude Minutes"].astype(str) + " " + data["Longitude Seconds"].astype(str) + " " + data["Longitude Cardinal Direction"].astype(str)

data[["Latitude", "Longitude"]].head()

Unnamed: 0,Latitude,Longitude
0,10.0 50.0 46.1 N,103.0 10.0 28.1 E
1,10.0 50.0 46.1 N,103.0 10.0 28.1 E
2,10.0 49.0 32.0 N,103.0 11.0 35.5 E
3,10.0 49.0 32.0 N,103.0 11.0 35.5 E
4,10.0 48.0 2.7 N,103.0 11.0 47.7 E


In [14]:
data["Depth"] = pd.to_numeric(data["Depth"], errors="coerce")
data["Depth"].fillna(data["Depth"].mean(), inplace=True)

data[["Depth"]].head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Depth"].fillna(data["Depth"].mean(), inplace=True)


Unnamed: 0,Depth
0,4.5
1,4.5
2,4.5
3,4.5
4,5.0


In [15]:
data["Year"] = pd.to_numeric(data["Year"], errors="coerce")
data["Year"].dropna(inplace=True)

data[["Year"]].head()

Unnamed: 0,Year
0,2003
1,2003
2,2003
3,2003
4,2003


In [17]:
def extract_bleaching(row):
    return row["S1"]+ row["S2"] + row["S3"] + row["S4"]

data["Bleaching Percentage"] = data.apply(extract_bleaching, axis=1)
data["Bleaching Percentage"].fillna(data["Bleaching Percentage"].mean(), inplace=True)
data["Bleaching Percentage"] = data["Bleaching Percentage"].apply(lambda x: min(x, 100))

data[["Bleaching Percentage"]].head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Bleaching Percentage"].fillna(data["Bleaching Percentage"].mean(), inplace=True)


Unnamed: 0,Bleaching Percentage
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [18]:
data[["S1", "S2", "S3", "S4"]].head()

Unnamed: 0,S1,S2,S3,S4
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
