In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import numpy as np  # This is for general numerical operations
import seaborn as sns  # This allows us to efficiently and beautifully plot
import os
import geopandas as gpd
import palettable as pltt
from seaborn import palplot



In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)


In [20]:
# --------------------------
# 1. Load the csv file and import the right semicolon format
# --------------------------
file = "export2\deals.csv"

df_deals = pd.read_csv(
    file,
    sep=";",          # semicolon-separated
    engine="python",  # needed for multiline fields
    encoding="utf-8",
    )

# --------------------------
# 2. Basic structural exploration
# --------------------------
# print("===== FIRST 5 ROWS =====")
# print(df_deals.head())

print("===== COLUMN NAMES =====")
print(list(df_deals.columns), "\n")

print("===== DATAFRAME INFO =====")
print(df_deals.info(), "\n")

print("===== SHAPE OF DATA (rows, columns) =====")
print(df_deals.shape, "\n")

print("===== MISSING VALUES PER COLUMN =====")
print(df_deals.isna().sum(), "\n")


===== COLUMN NAMES =====
['Deal ID', 'Is public', 'Deal scope', 'Deal size', 'Target country', 'Current size under contract', 'Current size in operation (production)', 'Current negotiation status', 'Current implementation status', 'Fully updated', 'Created at', 'Top parent companies', 'Intended size (in ha)', 'Size under contract (leased or purchased area, in ha)', 'Size in operation (production, in ha)', 'Comment on land area', 'Intention of investment', 'Comment on intention of investment', 'Carbon offset project', 'Comment on carbon offset project', 'Nature of the deal', 'Comment on nature of the deal', 'Negotiation status', 'Comment on negotiation status', 'Implementation status', 'Comment on implementation status', 'Purchase price', 'Purchase price currency', 'Purchase price area type', 'Purchase price area', 'Comment on purchase price', 'Annual leasing fee', 'Annual leasing fee currency', 'Annual leasing fee type', 'Annual leasing fee area', 'Comment on leasing fees', 'Contract f

In [21]:
#getting number of deals made per country
df_deals.groupby(['Target country'], group_keys = True)[['Deal ID']].count()

Unnamed: 0_level_0,Deal ID
Target country,Unnamed: 1_level_1
Argentina,65
Belize,4
Bolivia,5
Brazil,141
Chile,54
Colombia,29
Costa Rica,3
Cuba,2
Ecuador,1
Guatemala,4


In [22]:
#To look at how the Intention of investment column is structured (and subsequently filter by sector)

df_deals['Intention of investment'] = df_deals['Intention of investment'].str.strip('1234567890#current.|')
df_deals['Intention of investment'].head()

# df_deals['1', '2'] = df_deals['Intention of investment'].str.split(',', 1, expand = True)

0                Tourism
1                Tourism
2             Food crops
3    Food crops, Tourism
4             Food crops
Name: Intention of investment, dtype: object

In [23]:
#Counting the intention of investment based on description
Tourism = df_deals[df_deals['Intention of investment'].str.contains('Tourism')]
print(f'There are {len(Tourism)} deals regarding tourism')
Conservation = df_deals[df_deals['Intention of investment'].str.contains('Conservation' or 'conservation')]
print(f'There are {len(Conservation)} deals regarding conservation')
Food_crops = df_deals[df_deals['Intention of investment'].str.contains('Food' or 'crops' or 'Agriculture')]
print(f'There are {len(Food_crops)} deals regarding agricultural practices')
Livestock = df_deals[df_deals['Intention of investment'].str.contains('Livestock')]
print(f'There are {len(Livestock)} deals regarding livestock practices')
Forestry = df_deals[df_deals['Intention of investment'].str.contains('Timber')]
print(f'There are {len(Forestry)} deals regarding foresting practices')
Biofuels = df_deals[df_deals['Intention of investment'].str.contains('biofuels')]
print(f'There are {len(Biofuels)} deals regarding biofuel industry')
Wind_energy = df_deals[df_deals['Intention of investment'].str.contains('Wind')]
print(f'There are {len(Wind_energy)} deals regarding the wind energy industry')
Other_energy = df_deals[df_deals['Intention of investment'].str.contains('Renewable')]
print(f'There are {len(Other_energy)} deals regarding other renewable energy industries')
Solar_energy = df_deals[df_deals['Intention of investment'].str.contains('Solar')]
print(f'There are {len(Solar_energy)} deals regarding the solar energy industry')

There are 11 deals regarding tourism
There are 0 deals regarding conservation
There are 180 deals regarding agricultural practices
There are 75 deals regarding livestock practices
There are 92 deals regarding foresting practices
There are 50 deals regarding biofuel industry
There are 47 deals regarding the wind energy industry
There are 12 deals regarding other renewable energy industries
There are 16 deals regarding the solar energy industry


In [24]:
#Grouping countries and accumulating the deal sizes to get a national value of deals
dealsize_per_country = df_deals.groupby(['Target country'], group_keys = True)[['Deal size']].sum()
dealsize_per_country.head()

Unnamed: 0_level_0,Deal size
Target country,Unnamed: 1_level_1
Argentina,1426385.86
Belize,51984.0
Bolivia,34450.0
Brazil,3776830.87
Chile,329118.01


In [None]:
#filtering out the necessary columns
deals_clean = df_deals[['Deal ID', 'Target country', 'Intention of investment', 'Operating company: Country of registration/origin']]
deals_clean.head()

Unnamed: 0,Deal ID,Target country,Intention of investment,Operating company: Country of registration/origin
0,710,Argentina,Tourism,Argentina
1,717,Argentina,Tourism,Argentina
2,724,Argentina,Food crops,Ukraine
3,727,Argentina,"Food crops, Tourism",Argentina
4,729,Argentina,Food crops,Argentina


In [26]:
# Importing a csv containing contract data of the land acquisition

df_contracts = pd.read_csv(
    "export2\contracts.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)

df_contracts.head()


Unnamed: 0,ID,Deal ID,Contract number,Contract date,Contract expiration date,Duration of the agreement,Comment on contract
0,1w4V2aI9,724,,,,5.0,
1,tGSS5q4E,778,,,,25.0,
2,0fXI63az,847,,,,30.0,
3,d24WOchE,849,,,,90.0,
4,PDMXavWg,851,,,,25.0,


In [27]:
df_contracts_clean = df_contracts[['ID', 'Deal ID', 'Duration of the agreement']]

In [28]:
df_contracts_clean.head()

Unnamed: 0,ID,Deal ID,Duration of the agreement
0,1w4V2aI9,724,5.0
1,tGSS5q4E,778,25.0
2,0fXI63az,847,30.0
3,d24WOchE,849,90.0
4,PDMXavWg,851,25.0


In [17]:
# Importing a csv containing the information on investors within the Land Matrix
df_investors = pd.read_csv(
    "export2\investors.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)

df_investors.head()

Unnamed: 0,Investor ID,Name,Country of registration/origin,Classification,Investor homepage,Opencorporates link,Comment,Action comment
0,2,Government of Bangladesh,Bangladesh,State-/government (owned) company,,,,
1,11,China Asean Resources Ltd.,"China, Hong Kong Special Administrative Region",Stock-exchange listed company,,,,
2,12,Agro Forestry Research,China,,,,,
3,14,Amira Nature Foods Ltd (ANFIF),United Arab Emirates,Stock-exchange listed company,https://www.amira.net/,,Amira Nature Foods Ltd is a food company prima...,
4,18,Bigimexco,Vietnam,Private company,,,,


In [None]:
investor_countries = df_investors.groupby(['Country of registration/origin'], group_keys = True)['Name'].count()
investor_countries #maybe we can add this to a new dataframe that counts deals per country

Country of registration/origin
Afghanistan                                               1
Albania                                                   6
Algeria                                                  21
Angola                                                   58
Argentina                                               428
Armenia                                                   1
Aruba                                                     1
Australia                                                88
Austria                                                  18
Bahamas                                                   1
Bahrain                                                   6
Bangladesh                                               41
Barbados                                                  3
Belarus                                                   7
Belgium                                                  25
Belize                                                    9
Benin    

In [32]:
investor_countries_clean = df_investors[['Investor ID', 'Country of registration/origin', 'Classification']]
investor_countries_clean.head()

Unnamed: 0,Investor ID,Country of registration/origin,Classification
0,2,Bangladesh,State-/government (owned) company
1,11,"China, Hong Kong Special Administrative Region",Stock-exchange listed company
2,12,China,
3,14,United Arab Emirates,Stock-exchange listed company
4,18,Vietnam,Private company


In [56]:
#Importing the involvements dataframe to be able to merge the other dataframes together
df_involvements = pd.read_csv(
    "export2\involvements.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)
df_involvements.head()

involvements_clean = df_involvements[['Investor ID Upstream', 'Involvement ID']]


In [None]:
#merging the involvements dataframe with the investors dataframe
involvements_investors = pd.merge(
    involvements_clean,
    investor_countries_clean,
    left_on='Investor ID Upstream',
    right_on='Investor ID',
    how='left'
)
involvements_investors.head()



Unnamed: 0,Investor ID Upstream,Involvement ID,Investor ID,Country of registration/origin,Classification
0,12,10,12,China,
1,14,11,14,United Arab Emirates,Stock-exchange listed company
2,20,13,20,United States of America,Investment fund
3,25,16,25,India,
4,27,17,27,United States of America,
