In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import numpy as np  # This is for general numerical operations
import seaborn as sns  # This allows us to efficiently and beautifully plot
import os
import geopandas as gpd
import palettable as pltt
from seaborn import palplot



In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)


In [3]:

#Load the csv file and import the right semicolon format


file = "export3\deals.csv"

df_deals = pd.read_csv(
    file,
    sep=";",          # semicolon-separated
    engine="python",  # needed for multiline fields
    encoding="utf-8",
    )


# Basic structural exploration

print("===== FIRST 5 ROWS =====")
print(df_deals.head())

print("===== COLUMN NAMES =====")
print(list(df_deals.columns), "\n")

print("===== DATAFRAME INFO =====")
print(df_deals.info(), "\n")

print("===== SHAPE OF DATA (rows, columns) =====")
print(df_deals.shape, "\n")

print("===== MISSING VALUES PER COLUMN =====")
print(df_deals.isna().sum(), "\n")


===== FIRST 5 ROWS =====
   Deal ID Is public     Deal scope  Deal size Target country  \
0       11       Yes       domestic     9380.0       Cambodia   
1       12       Yes       domestic     7000.0       Cambodia   
2       14       Yes       domestic     2400.0       Cambodia   
3       16       Yes       domestic     9863.0       Cambodia   
4       17       Yes  transnational     6523.0       Cambodia   

   Current size under contract  Current size in operation (production)  \
0                       9380.0                                     NaN   
1                       7000.0                                     NaN   
2                       2400.0                                  2000.0   
3                       9863.0                                  1500.0   
4                       6523.0                                     NaN   

    Current negotiation status Current implementation status  \
0  Concluded (Contract signed)                           NaN   
1  Conclude

In [4]:
#getting number of deals made per country
df_deals.groupby(['Target country'], group_keys = True)[['Deal ID']].count()

Unnamed: 0_level_0,Deal ID
Target country,Unnamed: 1_level_1
Afghanistan,1
Albania,3
Algeria,9
Angola,33
Argentina,425
Bangladesh,48
Belarus,2
Belize,7
Benin,6
Bolivia,23


In [9]:
#To look at how the Intention of investment column is structured (and subsequently filter by sector)

df_deals['Intention of investment'] = df_deals['Intention of investment'].str.strip('-,1234567890#current.|')

# deals_clean = df_deals.dropna(axis = 1)
# deals_clean

df_deals['Intention of investment'].head(500)

# df_deals['1', '2'] = df_deals['Intention of investment'].str.split(',', 1, expand = True)

0           Livestock, Agriculture unspecified, Industry
1      Non-food agricultural commodities, Timber plan...
2                                  Food crops, Livestock
3           Livestock, Non-food agricultural commodities
4                       Biomass for biofuels, Food crops
5         Timber plantation for wood and fiber, Industry
6          Food crops, Non-food agricultural commodities
7          Non-food agricultural commodities, Food crops
8                                                Tourism
9                                             Food crops
10                                  Biomass for biofuels
11                               Agriculture unspecified
12                               Agriculture unspecified
13                               Agriculture unspecified
14                                  Forestry unspecified
15     Biomass for biofuels, Non-food agricultural co...
16     Biomass for biofuels, Non-food agricultural co...
17                     Non-food

In [None]:
df_deals['investment_intent_norm'] = (
    df_deals['Intention of investment']
    .str.lower()
    .str.strip()
)

df_deals['investment_intent_norm'] = (
    df_deals['investment_intent_norm']
    .fillna('unknown')
)

df_deals['investment_intent_list'] = (
    df_deals['investment_intent_norm']
    .str.split(',')
)

df_deals['investment_intent_list'] = df_deals['investment_intent_list'].apply(
    lambda x: [i.strip() for i in x]
)

df_deals['investment_intent_list'].head()

0     [livestock,  agriculture unspecified,  industry]
1    [non-food agricultural commodities,  timber pl...
2                             [food crops,  livestock]
3      [livestock,  non-food agricultural commodities]
4                  [biomass for biofuels,  food crops]
Name: investment_intent_list, dtype: object

In [25]:
intent_map = {
    'food agriculture': [
        'agriculture', 'food crops', 'cereals', 'rice', 'wheat'
    ],
    'industrial agriculture': [
        'industrial agriculture', 'livestock', 'plantation', 'palm oil',
        'rubber', 'sugar', 'cotton'
    ],
    'forestry': [
        'forestry', 'timber', 'logging', 'tree plantation'
    ],
    'bioenergy & carbon': [
        'biofuels', 'biomass', 'carbon sequestration', 'carbon credits'
    ],
    'extractives': [
        'mining', 'oil', 'gas', 'hydrocarbons'
    ],
    'infrastructure & tourism': [
        'tourism', 'infrastructure', 'real estate'
    ],
    'conservation': [
        'conservation', 'protected area', 'nature reserve'
    ]
}

def map_intent(intents):
    categories = set()
    for intent in intents:
        for category, keywords in intent_map.items():
            if intent in keywords:
                categories.add(category)
    if not categories:
        categories.add('unknown / unclear')
    return list(categories)

df_deals['intent_category'] = df_deals['investment_intent_list'].apply(map_intent)
intent_exploded = df_deals.explode('intent_category')
df_deals[['intent_category','investment_intent_list' ]].head()


Unnamed: 0,intent_category,investment_intent_list
0,[industrial agriculture],"[livestock, agriculture unspecified, industry]"
1,[unknown / unclear],"[non-food agricultural commodities, timber pl..."
2,[food agriculture],"[food crops, livestock]"
3,[industrial agriculture],"[livestock, non-food agricultural commodities]"
4,[unknown / unclear],"[biomass for biofuels, food crops]"


In [18]:
# Counting the intention of investment based on description
# Tourism = df_deals[df_deals['Intention of investment'].str.contains('Tourism', na=False)]
# print(f'There are {len(Tourism)} deals regarding tourism')
# Conservation = df_deals[df_deals['Intention of investment'].str.contains('Conservation' or 'conservation', na=False)]
# print(f'There are {len(Conservation)} deals regarding conservation')
# Food_crops = df_deals[df_deals['Intention of investment'].str.contains('Food' or 'crops' or 'Agriculture', na=False)]
# print(f'There are {len(Food_crops)} deals regarding agricultural practices')
# Livestock = df_deals[df_deals['Intention of investment'].str.contains('Livestock', na=False)]
# print(f'There are {len(Livestock)} deals regarding livestock practices')
# Forestry = df_deals[df_deals['Intention of investment'].str.contains('Timber' or 'Forest' or 'Forestry', na=False)]
# print(f'There are {len(Forestry)} deals regarding foresting practices')
# Biofuels = df_deals[df_deals['Intention of investment'].str.contains('biofuels', na=False)]
# print(f'There are {len(Biofuels)} deals regarding biofuel industry')
# Wind_energy = df_deals[df_deals['Intention of investment'].str.contains('Wind', na=False)]
# print(f'There are {len(Wind_energy)} deals regarding the wind energy industry')
# Other_energy = df_deals[df_deals['Intention of investment'].str.contains('Renewable', na=False)]
# print(f'There are {len(Other_energy)} deals regarding other renewable energy industries')
# Solar_energy = df_deals[df_deals['Intention of investment'].str.contains('Solar', na=False)]
# print(f'There are {len(Solar_energy)} deals regarding the solar energy industry')
# Mining = df_deals[df_deals['Intention of investment'].str.contains('mining' or 'Mining', na=False)]
# print(f'There are {len(Mining)} deals regarding the mining industry')
# Oil = df_deals[df_deals['Intention of investment'].str.contains('oil' or 'Oil', na=False)]
# print(f'There are {len(Oil)} deals regarding the oil industry')
# Unspecified = df_deals[df_deals['Intention of investment'].str.contains('Industry' or 'Oth', na=False)]
# print(f'There are {len(Unspecified)} deals with no clear reason for investment ')

In [82]:
#Grouping countries and accumulating the deal sizes to get a national value of deals
dealsize_per_country = df_deals.groupby(['Target country'], group_keys = True)[['Deal size']].sum()
dealsize_per_country.head()

Unnamed: 0_level_0,Deal size
Target country,Unnamed: 1_level_1
Afghanistan,24.28
Albania,6718.36
Algeria,210711.0
Angola,422065.0
Argentina,10744304.96


In [84]:
#filtering out the necessary columns
deals_clean = df_deals[['Deal ID', 'Target country','Deal size',  'Intention of investment', 'Created at', 'Operating company: Country of registration/origin']]
deals_clean.head()

Unnamed: 0,Deal ID,Target country,Deal size,Intention of investment,Created at,Operating company: Country of registration/origin
0,11,Cambodia,9380.0,"Livestock, Agriculture unspecified, Industry",2013-02-15T15:58:58+00:00,Cambodia
1,12,Cambodia,7000.0,"Non-food agricultural commodities, Timber plan...",2013-02-15T15:58:58+00:00,Cambodia
2,14,Cambodia,2400.0,"Food crops, Livestock",2013-02-15T15:58:58+00:00,Cambodia
3,16,Cambodia,9863.0,"Livestock, Non-food agricultural commodities",2013-02-15T15:58:59+00:00,Cambodia
4,17,Cambodia,6523.0,"Biomass for biofuels, Food crops",2013-02-15T15:58:59+00:00,Cambodia


In [26]:
# Importing a csv containing contract data of the land acquisition

df_contracts = pd.read_csv(
    "export2\contracts.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)

df_contracts.head()


Unnamed: 0,ID,Deal ID,Contract number,Contract date,Contract expiration date,Duration of the agreement,Comment on contract
0,1w4V2aI9,724,,,,5.0,
1,tGSS5q4E,778,,,,25.0,
2,0fXI63az,847,,,,30.0,
3,d24WOchE,849,,,,90.0,
4,PDMXavWg,851,,,,25.0,


In [27]:
df_contracts_clean = df_contracts[['ID', 'Deal ID', 'Duration of the agreement']]

In [28]:
df_contracts_clean.head()

Unnamed: 0,ID,Deal ID,Duration of the agreement
0,1w4V2aI9,724,5.0
1,tGSS5q4E,778,25.0
2,0fXI63az,847,30.0
3,d24WOchE,849,90.0
4,PDMXavWg,851,25.0


In [89]:
# Importing a csv containing the information on investors within the Land Matrix
df_investors = pd.read_csv(
    "export3\investors.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)

df_investors.head()

Unnamed: 0,Investor ID,Name,Country of registration/origin,Classification,Investor homepage,Opencorporates link,Comment,Action comment
0,2,Government of Bangladesh,Bangladesh,State-/government (owned) company,,,,
1,11,China Asean Resources Ltd.,"China, Hong Kong Special Administrative Region",Stock-exchange listed company,,,,
2,12,Agro Forestry Research,China,,,,,
3,14,Amira Nature Foods Ltd (ANFIF),United Arab Emirates,Stock-exchange listed company,https://www.amira.net/,,Amira Nature Foods Ltd is a food company prima...,
4,18,Bigimexco,Vietnam,Private company,,,,


In [85]:
investor_countries = df_investors.groupby(['Country of registration/origin'], group_keys = True)['Name'].count()
investor_countries #maybe we can add this to a new dataframe that counts deals per country

Country of registration/origin
Afghanistan                                               1
Albania                                                   6
Algeria                                                  21
Angola                                                   58
Argentina                                               428
Armenia                                                   1
Aruba                                                     1
Australia                                                88
Austria                                                  18
Bahamas                                                   1
Bahrain                                                   6
Bangladesh                                               41
Barbados                                                  3
Belarus                                                   7
Belgium                                                  25
Belize                                                    9
Benin    

In [86]:
investor_countries_clean = df_investors[['Investor ID', 'Country of registration/origin', 'Classification']]
investor_countries_clean.head()

Unnamed: 0,Investor ID,Country of registration/origin,Classification
0,2,Bangladesh,State-/government (owned) company
1,11,"China, Hong Kong Special Administrative Region",Stock-exchange listed company
2,12,China,
3,14,United Arab Emirates,Stock-exchange listed company
4,18,Vietnam,Private company


In [87]:
#Importing the involvements dataframe to be able to merge the other dataframes together
df_involvements = pd.read_csv(
    "export3\involvements.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)
df_involvements.head()

involvements_clean = df_involvements[['Investor ID Upstream', 'Involvement ID']]


In [88]:
#merging the involvements dataframe with the investors dataframe
involvements_investors = pd.merge(
    involvements_clean,
    investor_countries_clean,
    left_on='Investor ID Upstream',
    right_on='Investor ID',
    how='left'
)
involvements_investors.head()



Unnamed: 0,Investor ID Upstream,Involvement ID,Investor ID,Country of registration/origin,Classification
0,12,10,12.0,China,
1,14,11,14.0,United Arab Emirates,Stock-exchange listed company
2,20,13,20.0,United States of America,Investment fund
3,25,16,25.0,India,
4,27,17,27.0,United States of America,
