In this notebook I merge data from two papers on predictors of deforestation on the Pacific islands: Atkinson et al. (2016) and Rolett and Diamond (2004).

In [10]:
import os, sys, yaml
print("Working directory: ", os.getcwd())
# The loaded YAML document is iterated as a sequence of mappings, which are
# merged into a single flat dict. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes instead of being leaked.
with open(os.path.join("..", "conf.yaml")) as conf_file:
    conf = {k: v for d in yaml.safe_load(conf_file) for k, v in d.items()}
# Put the directory named under the "src" key (relative to the parent
# directory) at the front of the import path.
sys.path.insert(0, os.path.join("..", conf["src"]))

import pandas as pd

# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

Working directory:  /home/patryk/pCloud/PROJECTS/pacific-deforestation/notebooks


Run the scripts that download the data from the publishers' websites and perform preliminary cleaning.

In [11]:
# Switch to the parent directory, run the download and cleaning scripts from
# there (their paths are relative to it), then switch back to the previous
# working directory.
%cd ..
%run "code/download_data.py"
%run "code/clean_data.py"
%cd -

def read_file(filename):
    """Load a CSV dataset into a DataFrame, using its first column as the index.

    The file is looked up in the directory configured under
    ``conf["data"]["int"]``, joined onto the parent directory (``..``).
    """
    csv_path = os.path.join("..", conf["data"]["int"], filename)
    frame = pd.read_csv(csv_path, index_col=0)
    return frame

# Load the two datasets produced by the cleaning script, in this order:
# data1 <- Atkinson2016.csv, data2 <- Rollet2004.csv.
data1, data2 = (read_file(name) for name in ("Atkinson2016.csv", "Rollet2004.csv"))

/home/patryk/pCloud/PROJECTS/pacific-deforestation
Downloading Atkinson2016.xlsx
Downloading Rollet2004.xls
Downloading ne_50m_land.shp
Downloading ne_50m_land.shx
Downloading ne_50m_land.prj
Creating dataset Atkinson2016.csv
Creating dataset: Rollet2004.csv
/home/patryk/pCloud/PROJECTS/pacific-deforestation/notebooks


In [12]:
# Preview the first rows of dataset 1 (loaded from Atkinson2016.csv).
data1.head()

Unnamed: 0,Island,Language,Replacement,Deforestation,Latitude,Longitude,AbsLat,Area,Isolation,Elevation,Age=3,Rainfall,Makatea,Dust,Tephra,Irrigation,Dry,Arboriculture,Elite Ownership,Individual Ownership
0,Aitutaki,Rarotongan1,4.0,4.0,-18.84635,-159.781895,18.84635,18.0,187.0,124,1.0,1894,0.0,210,1,1,0,0,0.0,0.0
1,Alofi,FutunaEast1,3.0,3.0,-14.333333,-178.03333,14.3333,29.0,2.5,417,,2700,0.0,205,3,0,1,0,0.0,1.0
2,Aneityum,AnejomAneityum,3.0,3.5,-20.186491,169.823914,20.186491,160.0,70.0,852,0.0,2290,0.0,370,3,1,0,0,0.0,0.0
3,Atiu,Rarotongan2,4.0,4.0,-19.994482,-158.117552,19.994482,28.0,22.0,71,1.0,1970,0.53,210,1,1,0,0,0.0,0.0
4,Bora Bora,TahitianModern1,4.0,3.0,-16.497324,-151.737499,16.497324,38.0,20.0,727,1.0,2248,0.0,182,1,1,1,1,1.0,1.0


I am not interested in performing a phylogenetic analysis requiring the information about languages. Hence, I will drop the language column.

Since there is no variability in cultural variables within islands, it is safe to call ".mean()" which will ignore NaNs for some language groups and will average locations of language groups.

In [13]:
# Collapse the per-language rows into one row per island by averaging.
# errors="ignore" keeps this cell re-runnable: on a second run "Language" is
# already gone (and "Island" is the index name, which groupby still accepts),
# so the result is unchanged instead of raising KeyError.
data1 = data1.drop(columns=["Language"], errors="ignore").groupby("Island").mean()
data1.sort_index()

Unnamed: 0_level_0,Replacement,Deforestation,Latitude,Longitude,AbsLat,Area,Isolation,Elevation,Age=3,Rainfall,Makatea,Dust,Tephra,Irrigation,Dry,Arboriculture,Elite Ownership,Individual Ownership
Island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Aitutaki,4.0,4.0,-18.84635,-159.781895,18.84635,18.0,187.0,124,1.0,1894,0.0,210,1,1,0,0,0.0,0.0
Alofi,3.0,3.0,-14.333333,-178.03333,14.3333,29.0,2.5,417,,2700,0.0,205,3,0,1,0,0.0,1.0
Aneityum,3.0,3.5,-20.186491,169.823914,20.186491,160.0,70.0,852,0.0,2290,0.0,370,3,1,0,0,0.0,0.0
Atiu,4.0,4.0,-19.994482,-158.117552,19.994482,28.0,22.0,71,1.0,1970,0.53,210,1,1,0,0,0.0,0.0
Bora Bora,4.0,3.0,-16.497324,-151.737499,16.497324,38.0,20.0,727,1.0,2248,0.0,182,1,1,1,1,1.0,1.0
Bougainville,2.0,2.0,-5.970405,155.135193,5.970405,8591.0,50.0,2591,0.0,3009,0.0,550,3,0,0,0,0.0,1.0
Choiseul,2.0,2.0,-6.884147,156.809235,6.884147,2966.0,45.0,970,1.0,3454,0.0,550,3,0,0,0,0.0,0.0
Easter,,5.0,-27.112312,-109.354477,27.112312,166.0,1600.0,510,0.0,1198,0.0,5,1,0,1,0,0.0,0.0
Efate,3.0,3.0,-17.659071,168.397064,17.659071,915.0,80.0,647,0.0,2293,0.0,370,3,0,0,0,0.0,0.0
Eiao,4.0,3.0,-7.992597,-140.692806,7.992597,52.0,5.0,577,1.0,1000,0.0,138,1,0,0,1,0.0,1.0


In [14]:
# Display dataset 2 (loaded from Rollet2004.csv) ordered by its index.
data2.sort_index()

Unnamed: 0_level_0,Archipelago,Replacement,Deforestation,Area,Area 50,Isolation,Isolation 75,Elevation,Latitude,Rainfall,Age,Makatea,Dust,Tephra
Island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Aitutaki,Cooks,4.0,4.0,18.0,18.0,187.0,187.0,124,-19.0,1894.0,3.0,0.0,210,1
Alofi,W Polynesia,3.0,3.0,17.5,63.2,2.5,2.5,417,-19.0,,,0.0,205,3
Aneityum,Vanuatu,3.0,3.5,160.0,160.0,70.0,70.0,852,-20.2,2290.0,2.0,0.0,370,3
Atiu,Cooks,4.0,4.0,28.0,48.0,22.0,22.0,71,-20.0,1970.0,3.0,0.53,210,1
Bora Bora,Societies,4.0,3.0,38.0,432.0,20.0,20.0,727,-16.5,2248.0,3.0,0.0,182,1
Bougainville,Solomons,2.0,2.0,8591.0,12471.0,50.0,200.0,2591,-6.12,3009.0,2.0,0.0,550,3
Choiseul,Solomons,2.0,2.0,2966.0,11698.0,45.0,45.0,970,-7.05,3454.0,3.0,0.0,550,3
Easter,SE Polynesia,5.0,5.0,166.0,166.0,1600.0,1600.0,510,-27.0,1198.0,2.5,0.0,5,1
Efate,Vanuatu,3.0,3.0,915.0,985.0,80.0,110.0,647,-17.37,2293.0,2.0,0.0,370,3
Eiao,Marquesas,4.0,3.0,52.0,400.0,5.0,50.0,577,-8.0,1000.0,3.0,0.0,138,1


I want to see if the datasets contain the same islands, or maybe there are some extra elements.

In [15]:
# Index labels present in one dataset but missing from the other.
extra_elements_1 = set(data1.index).difference(data2.index)
extra_elements_2 = set(data2.index).difference(data1.index)

print(extra_elements_1, extra_elements_2, sep="\n")

{'South Island (W coast)'}
{'Pitcairn', 'Kahoolawe'}


Now I want to compare the two datasets for the islands that appear in both of them.

In [16]:
# Keep only the rows whose labels occur in both datasets.
data1_to_compare = data1.drop(extra_elements_1, axis="index")
data2_to_compare = data2.drop(extra_elements_2, axis="index")

# Dataset 1 minus dataset 2; columns in which every value is NaN are discarded.
compared = data1_to_compare.sub(data2_to_compare).dropna(axis="columns", how="all")

compared

Unnamed: 0_level_0,Area,Deforestation,Dust,Elevation,Isolation,Latitude,Makatea,Rainfall,Replacement,Tephra
Island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Aitutaki,0.0,0.0,0,0,0.0,0.15365,0.0,0.0,0.0,0
Alofi,11.5,0.0,0,0,0.0,4.666667,0.0,,0.0,0
Aneityum,0.0,0.0,0,0,0.0,0.013509,0.0,0.0,0.0,0
Atiu,0.0,0.0,0,0,0.0,0.005518,0.0,0.0,0.0,0
Bora Bora,0.0,0.0,0,0,0.0,0.002676,0.0,0.0,0.0,0
Bougainville,0.0,0.0,0,0,0.0,0.149595,0.0,0.0,0.0,0
Choiseul,0.0,0.0,0,0,0.0,0.165853,0.0,0.0,0.0,0
Easter,0.0,0.0,0,0,0.0,-0.112312,0.0,0.0,,0
Efate,0.0,0.0,0,0,0.0,-0.289071,0.0,0.0,0.0,0
Eiao,0.0,0.0,0,0,0.0,0.007403,0.0,0.0,0.0,0


There are some discrepancies in area and latitude columns that need to be resolved.

In [17]:
# Side-by-side frame: dataset 1 values (_1), dataset 2 values (_2) and their
# difference (_diff).
combined = pd.concat(
    [
        data1_to_compare.add_suffix("_1"),
        data2_to_compare.add_suffix("_2"),
        compared.add_suffix("_diff"),
    ],
    axis=1,
    sort=False,
)

# For each variable, print the rows whose absolute difference exceeds its threshold.
for column, threshold in (("Area", 0), ("Latitude", 1.)):
    exceeds = combined[column + "_diff"].abs() > threshold
    print(combined.loc[exceeds, [column + "_1", column + "_2", column + "_diff"]])

        Area_1  Area_2  Area_diff
Alofi     29.0    17.5       11.5
Futuna    62.0    45.7       16.3
                    Latitude_1  Latitude_2  Latitude_diff
Alofi               -14.333333      -19.00       4.666667
Futuna              -14.282683      -19.00       4.717317
Necker               23.574293      -23.30      46.874293
New Britain (lee.)   -5.200151       -6.22       1.019849
New Caledonia       -20.863565      -22.00       1.136435
Nihoa                23.060721      -23.00      46.060721
Niue                -19.050000      -18.00      -1.050000


According to the French Wikipedia, the values from dataset 2 (Rollet 2004) for areas of Futuna and Alofi islands are more accurate ([fr.wikipedia.org/wiki/Wallis-et-Futuna](https://fr.wikipedia.org/wiki/Wallis-et-Futuna)).

Generally, the latitude data from dataset 1 is more trustworthy. Latitudes for Necker and Nihoa were mistakenly marked as "S" in dataset 2 (which did not influence the paper's results, since the absolute value of latitude was used). The latitudes for Futuna island and the nearby Alofi were confused in dataset 2 with the latitude of West Futuna island (which could have influenced the paper's results).

Finally, I create a dataset I will use for further analysis.

In [18]:
# Build the final dataset: dataset 1 is the base, supplemented with the rows
# and columns that only dataset 2 provides.
data = data1.copy()
# Add the two extra elements from dataset 2. Dataset 2 lacks several of
# dataset 1's columns, and `.loc` with a list containing missing labels raises
# KeyError in pandas >= 1.0, so select the rows first and `reindex` the
# columns (the missing ones are filled with NaN).
extra_rows = data2.loc[["Kahoolawe", "Pitcairn"]].reindex(columns=data.columns)
data = pd.concat([data, extra_rows], axis=0, sort=True)
# Add some columns from dataset 2
imported_columns = ["Archipelago", "Area 50", "Isolation 75", "Age"]
data = pd.concat([data, data2[imported_columns]], axis=1, sort=False)
# Age=3 and AbsLat columns are redundant
data = data.drop(columns=["Age=3", "AbsLat"])

# Fix the South Island (W coast) entry by importing data for E coast
data.loc["South Island (W coast)", imported_columns] = data2.loc["South Island (E coast)", imported_columns]
# Fix areas of Futuna and Alofi
data.loc[["Futuna", "Alofi"], "Area"] = data2.loc[["Futuna", "Alofi"], "Area"]
# Fix Pitcairn and Kahoolawe
data.loc["Pitcairn", ["Latitude", "Longitude"]] = [-25.07, -130.1]
data.loc["Kahoolawe", ["Latitude", "Longitude"]] = [20.55, -156.6]
# For New Caledonia, the closest "island" of size at least 75% of the island's size is Australia.
data.loc["New Caledonia", "Isolation 75"] = 1400

# Round the coordinates (it is meaningless to have so many decimal points)
data[["Latitude", "Longitude"]] = data[["Latitude", "Longitude"]].round(2)

# Organize the columns
output_cols = ["Deforestation", "Replacement"]
env_cols = ["Latitude", "Longitude", "Area", "Area 50", "Isolation", "Isolation 75",
            "Elevation", "Rainfall", "Dust", "Makatea", "Age", "Tephra"]
cult_cols = ["Irrigation", "Dry", "Arboriculture", "Elite Ownership", "Individual Ownership"]
numeric_cols = output_cols + env_cols + cult_cols

# Select and order the columns, cast everything except "Archipelago" to float,
# name the index and sort, all in one chain. Casting via `astype` on the
# selection (rather than assigning back into a column subset of a freshly
# sliced frame) avoids SettingWithCopyWarning, and `rename_axis` replaces the
# in-place index rename.
data = (
    data[["Archipelago"] + numeric_cols]
    .astype({col: float for col in numeric_cols})
    .rename_axis("Island")
    .sort_values(by="Archipelago")
)

data.to_csv(os.path.join("..", conf["data"]["cln"], "pacific-deforestation.csv"))
data

Unnamed: 0_level_0,Archipelago,Deforestation,Replacement,Latitude,Longitude,Area,Area 50,Isolation,Isolation 75,Elevation,Rainfall,Dust,Makatea,Age,Tephra,Irrigation,Dry,Arboriculture,Elite Ownership,Individual Ownership
Island,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Tubuai,Australs,4.0,4.0,-23.37,-149.47,45.0,45.0,180.0,210.0,422.0,2030.0,174.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
Rurutu,Australs,4.0,4.0,-22.47,-151.34,32.0,32.0,150.0,210.0,389.0,1899.0,181.0,0.1,2.5,1.0,1.0,0.0,0.0,0.0,0.0
Rimatara,Australs,4.0,4.0,-22.65,-152.81,9.0,9.0,150.0,150.0,83.0,1660.0,187.0,0.1,3.0,1.0,1.0,0.0,0.0,0.0,0.0
Rapa,Australs,4.0,4.0,-27.6,-144.34,40.0,40.0,600.0,600.0,650.0,2738.0,156.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
Raivavae,Australs,4.0,4.0,-23.87,-147.66,18.0,18.0,180.0,180.0,437.0,1841.0,168.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
New Britain (lee.),Bismarcks,2.0,2.0,-5.2,150.48,35742.0,43784.0,30.0,95.0,2439.0,4156.0,550.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0
New Ireland,Bismarcks,2.0,2.0,-3.14,151.57,7174.0,44482.0,30.0,30.0,2399.0,3302.0,550.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
New Britain (wind.),Bismarcks,2.0,2.0,-6.12,149.5,35742.0,43784.0,30.0,95.0,2439.0,6076.0,550.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0
Rarotonga,Cooks,3.0,4.0,-21.23,-159.78,67.0,67.0,165.0,165.0,653.0,2021.0,210.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
Mitiaro,Cooks,4.0,4.0,-19.87,-157.7,22.0,70.0,22.0,22.0,15.0,1828.0,210.0,0.74,3.0,1.0,0.0,1.0,0.0,0.0,0.0
