In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import numpy as np  # This is for general numerical operations
import seaborn as sns  # This allows us to efficiently and beautifully plot
import os
import geopandas as gpd
import palettable as pltt
from seaborn import palplot



In [5]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)


In [6]:

#Load the csv file and import the right semicolon format


file = "deals.csv"

df_deals = pd.read_csv(
    file,
    sep=";",          # semicolon-separated
    engine="python",  # needed for multiline fields
    encoding="utf-8",
    )


# Basic structural exploration

print("===== FIRST 5 ROWS =====")
print(df_deals.head())

print("===== COLUMN NAMES =====")
print(list(df_deals.columns), "\n")

print("===== DATAFRAME INFO =====")
print(df_deals.info(), "\n")

print("===== SHAPE OF DATA (rows, columns) =====")
print(df_deals.shape, "\n")

print("===== MISSING VALUES PER COLUMN =====")
print(df_deals.isna().sum(), "\n")


===== FIRST 5 ROWS =====
   Deal ID Is public     Deal scope  Deal size Target country  \
0       11       Yes       domestic     9380.0       Cambodia   
1       12       Yes       domestic     7000.0       Cambodia   
2       14       Yes       domestic     2400.0       Cambodia   
3       16       Yes       domestic     9863.0       Cambodia   
4       17       Yes  transnational     6523.0       Cambodia   

   Current size under contract  Current size in operation (production)  \
0                       9380.0                                     NaN   
1                       7000.0                                     NaN   
2                       2400.0                                  2000.0   
3                       9863.0                                  1500.0   
4                       6523.0                                     NaN   

    Current negotiation status Current implementation status  \
0  Concluded (Contract signed)                           NaN   
1  Conclude

In [7]:
#getting number of deals made per country
df_deals.groupby(['Target country'], group_keys = True)[['Deal ID']].count()

Unnamed: 0_level_0,Deal ID
Target country,Unnamed: 1_level_1
Afghanistan,1
Albania,3
Algeria,9
Angola,33
Argentina,425
Bangladesh,48
Belarus,2
Belize,7
Benin,6
Bolivia,23


In [8]:
#To look at how the Intention of investment column is structured (and subsequently filter by sector)

df_deals['Intention of investment'] = df_deals['Intention of investment'].str.strip('-,1234567890#current.|')

# deals_clean = df_deals.dropna(axis = 1)
# deals_clean

df_deals['Intention of investment'].head(500)

# df_deals['1', '2'] = df_deals['Intention of investment'].str.split(',', 1, expand = True)

0           Livestock, Agriculture unspecified, Industry
1      Non-food agricultural commodities, Timber plan...
2                                  Food crops, Livestock
3           Livestock, Non-food agricultural commodities
4                       Biomass for biofuels, Food crops
5         Timber plantation for wood and fiber, Industry
6          Food crops, Non-food agricultural commodities
7          Non-food agricultural commodities, Food crops
8                                                Tourism
9                                             Food crops
10                                  Biomass for biofuels
11                               Agriculture unspecified
12                               Agriculture unspecified
13                               Agriculture unspecified
14                                  Forestry unspecified
15     Biomass for biofuels, Non-food agricultural co...
16     Biomass for biofuels, Non-food agricultural co...
17                     Non-food

### LEO's code

In [10]:
value_counts = df_deals["Intention of investment"].value_counts(dropna=False)

print("\n===== UNIQUE VALUES IN 'Intention of investment' WITH COUNTS =====")
print(value_counts.to_string())

print("\nTotal unique values:", value_counts.shape[0])


===== UNIQUE VALUES IN 'Intention of investment' WITH COUNTS =====
Intention of investment
Food crops                                                                                                                                                                                                                                                        1216
Mining                                                                                                                                                                                                                                                             733
Forest logging / management for wood and fib                                                                                                                                                                                                                       511
Wind farm                                                                                                              

In [12]:
import pandas as pd
import re

# ============================================================
# CLEAN + HARMONISE: Land Matrix "Intention of investment"
# Produces:
#   - df_deals["intention_clean"]  (cleaned string)
#   - df_deals["intention_list"]   (list of labels per deal)
#   - intention_binary             (multi-hot dummies for fine labels)
#   - intention_binary[broad_cols] (multi-hot dummies for broad sectors)
# ============================================================

# ---- Assumes df_deals already exists. If not, load it here:
# df_deals = pd.read_csv("land_matrix_simplified.csv", engine="python", on_bad_lines="skip")

COL = "Intention of investment"

print("===== STEP 0: BASIC CHECKS =====")
print("Rows:", len(df_deals))
print(f"Column '{COL}' exists? ->", COL in df_deals.columns)
print("Missing values in column:", df_deals[COL].isna().sum())

# ============================================================
# STEP 1 — Remove embedded metadata garbage (e.g., |2021#current#...)
# ============================================================

def clean_intention_string(x):
    """
    Removes Land Matrix export artifacts like:
      'Food crops|2021-12-31#current#500#Food crops'
    by deleting everything that starts with '|' and continues with metadata characters.
    """
    if pd.isna(x):
        return pd.NA

    s = str(x).strip()
    if s == "":
        return pd.NA

    # Remove *all* '|' metadata chunks
    # Example: "...|2021-12-31#current#500#..." -> "..."
    s = re.sub(r"\|[0-9#.\-a-zA-Z]*", "", s)

    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()

    return s if s != "" else pd.NA


df_deals["intention_clean"] = df_deals[COL].apply(clean_intention_string)

print("\n===== STEP 1: METADATA CLEANING CHECK =====")
rows_with_pipe_before = df_deals[COL].astype(str).str.contains(r"\|", na=False).sum()
rows_with_pipe_after  = df_deals["intention_clean"].astype(str).str.contains(r"\|", na=False).sum()
print("Rows containing '|' BEFORE cleaning:", rows_with_pipe_before)
print("Rows containing '|' AFTER cleaning :", rows_with_pipe_after)

# ============================================================
# STEP 2 — Fix truncations (Oth, Conservatio, Fodd, ...)
# ============================================================

recode_map = {
    "Oth": "Other",
    "Fodd": "Fodder",
    "Conservatio": "Conservation",
    "Land speculatio": "Land speculation",
    "Oil / Gas extractio": "Oil / Gas extraction"
}

df_deals["intention_clean"] = df_deals["intention_clean"].replace(recode_map)

print("\n===== STEP 2: TRUNCATION FIX CHECK =====")
for bad, good in recode_map.items():
    bad_count = (df_deals["intention_clean"] == bad).sum()
    good_count = (df_deals["intention_clean"] == good).sum()
    print(f"'{bad}' remaining:", bad_count, "| mapped-to", f"'{good}' count now:", good_count)

# ============================================================
# STEP 3 — Split multi-label rows into lists
# ============================================================

def split_intentions(x):
    if pd.isna(x):
        return pd.NA
    parts = [p.strip() for p in str(x).split(",")]
    parts = [p for p in parts if p != ""]
    return parts if len(parts) > 0 else pd.NA

df_deals["intention_list"] = df_deals["intention_clean"].apply(split_intentions)

print("\n===== STEP 3: MULTI-LABEL CHECK =====")
multi_label_rows = df_deals["intention_clean"].astype(str).str.contains(",", na=False).sum()
print("Rows that are multi-label (contain ','):", multi_label_rows)




===== STEP 0: BASIC CHECKS =====
Rows: 6621
Column 'Intention of investment' exists? -> True
Missing values in column: 21

===== STEP 1: METADATA CLEANING CHECK =====
Rows containing '|' BEFORE cleaning: 229
Rows containing '|' AFTER cleaning : 0

===== STEP 2: TRUNCATION FIX CHECK =====
'Oth' remaining: 0 | mapped-to 'Other' count now: 77
'Fodd' remaining: 0 | mapped-to 'Fodder' count now: 2
'Conservatio' remaining: 0 | mapped-to 'Conservation' count now: 16
'Land speculatio' remaining: 0 | mapped-to 'Land speculation' count now: 15
'Oil / Gas extractio' remaining: 0 | mapped-to 'Oil / Gas extraction' count now: 36

===== STEP 3: MULTI-LABEL CHECK =====
Rows that are multi-label (contain ','): 1691


In [14]:
import pandas as pd
import re

col = "intention_clean"

# =========================
# 1) BASIC COUNTS
# =========================
print("\n===== BASIC OVERVIEW =====")
print("Rows:", len(df_deals))
print("Missing (<NA>) in intention_clean:", df_deals[col].isna().sum())

# =========================
# 2) FULL UNIQUE COUNTS (TOP 30)
# =========================
print("\n===== TOP 30 RAW VALUES (MOST COMMON) =====")
vc_all = df_deals[col].value_counts(dropna=False)
print(vc_all.head(30).to_string())
print("\nTotal unique values (incl. NA):", vc_all.shape[0])

# =========================
# 3) SPLIT INTO SINGLE vs MULTI-LABEL
# =========================
is_multi = df_deals[col].astype("string").str.contains(",", na=False)

print("\n===== SINGLE vs MULTI-LABEL =====")
print("Single-label rows:", (~is_multi).sum())
print("Multi-label rows :", is_multi.sum())

print("\n===== TOP 20 SINGLE-LABEL CATEGORIES =====")
vc_single = df_deals.loc[~is_multi, col].value_counts(dropna=False)
print(vc_single.head(20).to_string())

print("\n===== TOP 20 MULTI-LABEL COMBINATIONS =====")
vc_multi = df_deals.loc[is_multi, col].value_counts(dropna=False)
print(vc_multi.head(20).to_string())

# =========================
# 4) "SUSPICIOUS" VALUES LIST
#    (helps you spot remaining cleaning needed)
# =========================
print("\n===== SUSPICIOUS / NEEDS REVIEW (TOP 50) =====")
s = df_deals[col].astype("string")

suspicious_mask = (
    s.str.contains(r"\bOth\b|\bFodd\b|\bConservatio\b", na=False) |   # truncations
    s.str.contains(r"\b(\w+)\s+\1\b", na=False) |                    # duplicated words: "crops crops"
    s.str.contains(r"logging\s*/\s*management.*logging\s*/\s*management", na=False) |  # repeated long phrase
    s.str.contains(r"\bpark\s+park\b|\bcrops\s+crops\b", na=False)   # obvious repeats
)

vc_suspicious = df_deals.loc[suspicious_mask, col].value_counts(dropna=False)
print(vc_suspicious.head(50).to_string())
print("\nSuspicious rows total:", suspicious_mask.sum())

# =========================
# 5) OPTIONAL: TOKEN-LEVEL VIEW (MOST USEFUL FOR MERGING)
#    This answers: "How often does each label appear anywhere?"
# =========================
print("\n===== TOKEN-LEVEL COUNTS (EACH LABEL COUNTED SEPARATELY) =====")
tokens = (
    df_deals[col]
      .dropna()
      .astype("string")
      .str.split(r"\s*,\s*")          # split multi-label combos
      .explode()
      .str.strip()
)

token_counts = tokens.value_counts()
print(token_counts.head(30).to_string())
print("\nTotal unique tokens:", token_counts.shape[0])




===== BASIC OVERVIEW =====
Rows: 6621
Missing (<NA>) in intention_clean: 22

===== TOP 30 RAW VALUES (MOST COMMON) =====
intention_clean
Food crops                                                                            1221
Mining                                                                                 737
Forest logging / management for wood and fib                                           511
Wind farm                                                                              312
Solar park                                                                             295
Agriculture unspecified                                                                278
Non-food agricultural commodities                                                      269
Timber plantation for wood and fib                                                     254
Food crops, Livestock                                                                  245
Biomass for biofuels                       

  s.str.contains(r"\b(\w+)\s+\1\b", na=False) |                    # duplicated words: "crops crops"


In [15]:
import re
import pandas as pd

# ============================================
# STEP 1 — CANONICAL SECTOR DICTIONARY
# ============================================

CANONICAL_SECTORS = {
    "Food crops": "Food crops",
    "Non-food agricultural commodities": "Non-food agricultural commodities",
    "Livestock": "Livestock",
    "Biomass for biofuels": "Biomass for biofuels",
    "Fodder": "Fodder",
    "Mining": "Mining",
    "Oil / Gas extraction": "Oil / Gas extraction",
    "Forest logging / management for wood and fib": "Forest logging / management",
    "Forest logging / management for wood and fiber": "Forest logging / management",
    "Timber plantation for wood and fib": "Timber plantation",
    "Timber plantation for wood and fiber": "Timber plantation",
    "Wind farm": "Wind farm",
    "Solar park": "Solar park",
    "Renewable energy unspecified": "Renewable energy",
    "Biomass for energy generation (agriculture)": "Biomass energy (agriculture)",
    "Biomass for energy generation (forestry)": "Biomass energy (forestry)",
    "For carbon sequestration/REDD": "Carbon sequestration / REDD",
    "Industry": "Industry",
    "Tourism": "Tourism",
    "Land speculation": "Land speculation",
    "Agriculture unspecified": "Agriculture unspecified",
    "Forestry unspecified": "Forestry unspecified",
    "Conservation": "Conservation",
    "Other": "Other",
}

# ============================================
# STEP 2 — CLEAN TOKEN FUNCTION
# ============================================

def clean_intention_tokens(x):
    if pd.isna(x):
        return []

    s = str(x)

    # Remove metadata after |
    s = re.sub(r"\|.*", "", s)

    # Split multi-label rows
    parts = re.split(r"\s*,\s*", s)

    clean = []
    for p in parts:
        p = p.strip()

        # Fix truncations / typos
        p = p.replace("Oth", "Other")
        p = p.replace("Fodd", "Fodder")
        p = p.replace("Conservatio", "Conservation")
        p = p.replace("extractio", "extraction")
        p = re.sub(r"\b(\w+)\s+\1\b", r"\1", p)  # crops crops → crops

        for key, val in CANONICAL_SECTORS.items():
            if key.lower() in p.lower():
                clean.append(val)

    return sorted(set(clean))

# ============================================
# STEP 3 — APPLY TO DATAFRAME
# ============================================

df_deals["sector_list"] = df_deals["intention_clean"].apply(clean_intention_tokens)

print("\n===== SAMPLE CLEANED SECTOR LISTS =====")
print(df_deals["sector_list"].head(20))

print("\n===== UNIQUE CANONICAL TOKENS =====")
print(pd.Series([s for row in df_deals["sector_list"] for s in row]).value_counts())



===== SAMPLE CLEANED SECTOR LISTS =====
0        [Agriculture unspecified, Industry, Livestock]
1     [Non-food agricultural commodities, Timber pla...
2                               [Food crops, Livestock]
3        [Livestock, Non-food agricultural commodities]
4                    [Biomass for biofuels, Food crops]
5                         [Industry, Timber plantation]
6       [Food crops, Non-food agricultural commodities]
7       [Food crops, Non-food agricultural commodities]
8                                             [Tourism]
9                                          [Food crops]
10                               [Biomass for biofuels]
11                            [Agriculture unspecified]
12                            [Agriculture unspecified]
13                            [Agriculture unspecified]
14                               [Forestry unspecified]
15    [Biomass for biofuels, Non-food agricultural c...
16    [Biomass for biofuels, Forestry unspecified, N...
17     

In [16]:
print("\n===== UNIQUE VALUES IN sector_list COLUMN =====")
print(df_deals["sector_list"].value_counts(dropna=False))

print("\n===== EXAMPLES OF MULTI-SECTOR ROWS =====")
print(df_deals[df_deals["sector_list"].apply(lambda x: isinstance(x, list) and len(x) > 1)]["sector_list"].head(20))



===== UNIQUE VALUES IN sector_list COLUMN =====
sector_list
[Food crops]                                                                                                                             1254
[Mining]                                                                                                                                  739
[Forest logging / management]                                                                                                             524
[Wind farm]                                                                                                                               314
[Solar park]                                                                                                                              302
[Agriculture unspecified]                                                                                                                 283
[Non-food agricultural commodities]                                                    

In [17]:
print("\n===== FIRST 50 VALUES OF sector_list =====")
print(df_deals["sector_list"].head(50).to_string())

print("\n===== LAST 50 VALUES OF sector_list =====")
print(df_deals["sector_list"].tail(50).to_string())


===== FIRST 50 VALUES OF sector_list =====
0        [Agriculture unspecified, Industry, Livestock]
1     [Non-food agricultural commodities, Timber pla...
2                               [Food crops, Livestock]
3        [Livestock, Non-food agricultural commodities]
4                    [Biomass for biofuels, Food crops]
5                         [Industry, Timber plantation]
6       [Food crops, Non-food agricultural commodities]
7       [Food crops, Non-food agricultural commodities]
8                                             [Tourism]
9                                          [Food crops]
10                               [Biomass for biofuels]
11                            [Agriculture unspecified]
12                            [Agriculture unspecified]
13                            [Agriculture unspecified]
14                               [Forestry unspecified]
15    [Biomass for biofuels, Non-food agricultural c...
16    [Biomass for biofuels, Forestry unspecified, N...
17  

In [9]:
df_deals['investment_intent_norm'] = (
    df_deals['Intention of investment']
    .str.lower()
    .str.strip()
)

df_deals['investment_intent_norm'] = (
    df_deals['investment_intent_norm']
    .fillna('unknown')
)

df_deals['investment_intent_list'] = (
    df_deals['investment_intent_norm']
    .str.split(',')
)

df_deals['investment_intent_list'] = df_deals['investment_intent_list'].apply(
    lambda x: [i.strip() for i in x]
)

df_deals['investment_intent_list'].head()

0       [livestock, agriculture unspecified, industry]
1    [non-food agricultural commodities, timber pla...
2                              [food crops, livestock]
3       [livestock, non-food agricultural commodities]
4                   [biomass for biofuels, food crops]
Name: investment_intent_list, dtype: object

In [25]:
intent_map = {
    'food agriculture': [
        'agriculture', 'food crops', 'cereals', 'rice', 'wheat'
    ],
    'industrial agriculture': [
        'industrial agriculture', 'livestock', 'plantation', 'palm oil',
        'rubber', 'sugar', 'cotton'
    ],
    'forestry': [
        'forestry', 'timber', 'logging', 'tree plantation'
    ],
    'bioenergy & carbon': [
        'biofuels', 'biomass', 'carbon sequestration', 'carbon credits'
    ],
    'extractives': [
        'mining', 'oil', 'gas', 'hydrocarbons'
    ],
    'infrastructure & tourism': [
        'tourism', 'infrastructure', 'real estate'
    ],
    'conservation': [
        'conservation', 'protected area', 'nature reserve'
    ]
}

def map_intent(intents):
    categories = set()
    for intent in intents:
        for category, keywords in intent_map.items():
            if intent in keywords:
                categories.add(category)
    if not categories:
        categories.add('unknown / unclear')
    return list(categories)

df_deals['intent_category'] = df_deals['investment_intent_list'].apply(map_intent)
intent_exploded = df_deals.explode('intent_category')
df_deals[['intent_category','investment_intent_list' ]].head()


Unnamed: 0,intent_category,investment_intent_list
0,[industrial agriculture],"[livestock, agriculture unspecified, industry]"
1,[unknown / unclear],"[non-food agricultural commodities, timber pl..."
2,[food agriculture],"[food crops, livestock]"
3,[industrial agriculture],"[livestock, non-food agricultural commodities]"
4,[unknown / unclear],"[biomass for biofuels, food crops]"


In [18]:
# Counting the intention of investment based on description
# Tourism = df_deals[df_deals['Intention of investment'].str.contains('Tourism', na=False)]
# print(f'There are {len(Tourism)} deals regarding tourism')
# Conservation = df_deals[df_deals['Intention of investment'].str.contains('Conservation' or 'conservation', na=False)]
# print(f'There are {len(Conservation)} deals regarding conservation')
# Food_crops = df_deals[df_deals['Intention of investment'].str.contains('Food' or 'crops' or 'Agriculture', na=False)]
# print(f'There are {len(Food_crops)} deals regarding agricultural practices')
# Livestock = df_deals[df_deals['Intention of investment'].str.contains('Livestock', na=False)]
# print(f'There are {len(Livestock)} deals regarding livestock practices')
# Forestry = df_deals[df_deals['Intention of investment'].str.contains('Timber' or 'Forest' or 'Forestry', na=False)]
# print(f'There are {len(Forestry)} deals regarding foresting practices')
# Biofuels = df_deals[df_deals['Intention of investment'].str.contains('biofuels', na=False)]
# print(f'There are {len(Biofuels)} deals regarding biofuel industry')
# Wind_energy = df_deals[df_deals['Intention of investment'].str.contains('Wind', na=False)]
# print(f'There are {len(Wind_energy)} deals regarding the wind energy industry')
# Other_energy = df_deals[df_deals['Intention of investment'].str.contains('Renewable', na=False)]
# print(f'There are {len(Other_energy)} deals regarding other renewable energy industries')
# Solar_energy = df_deals[df_deals['Intention of investment'].str.contains('Solar', na=False)]
# print(f'There are {len(Solar_energy)} deals regarding the solar energy industry')
# Mining = df_deals[df_deals['Intention of investment'].str.contains('mining' or 'Mining', na=False)]
# print(f'There are {len(Mining)} deals regarding the mining industry')
# Oil = df_deals[df_deals['Intention of investment'].str.contains('oil' or 'Oil', na=False)]
# print(f'There are {len(Oil)} deals regarding the oil industry')
# Unspecified = df_deals[df_deals['Intention of investment'].str.contains('Industry' or 'Oth', na=False)]
# print(f'There are {len(Unspecified)} deals with no clear reason for investment ')

In [82]:
#Grouping countries and accumulating the deal sizes to get a national value of deals
dealsize_per_country = df_deals.groupby(['Target country'], group_keys = True)[['Deal size']].sum()
dealsize_per_country.head()

Unnamed: 0_level_0,Deal size
Target country,Unnamed: 1_level_1
Afghanistan,24.28
Albania,6718.36
Algeria,210711.0
Angola,422065.0
Argentina,10744304.96


In [84]:
#filtering out the necessary columns
deals_clean = df_deals[['Deal ID', 'Target country','Deal size',  'Intention of investment', 'Created at', 'Operating company: Country of registration/origin']]
deals_clean.head()

Unnamed: 0,Deal ID,Target country,Deal size,Intention of investment,Created at,Operating company: Country of registration/origin
0,11,Cambodia,9380.0,"Livestock, Agriculture unspecified, Industry",2013-02-15T15:58:58+00:00,Cambodia
1,12,Cambodia,7000.0,"Non-food agricultural commodities, Timber plan...",2013-02-15T15:58:58+00:00,Cambodia
2,14,Cambodia,2400.0,"Food crops, Livestock",2013-02-15T15:58:58+00:00,Cambodia
3,16,Cambodia,9863.0,"Livestock, Non-food agricultural commodities",2013-02-15T15:58:59+00:00,Cambodia
4,17,Cambodia,6523.0,"Biomass for biofuels, Food crops",2013-02-15T15:58:59+00:00,Cambodia


In [26]:
# Importing a csv containing contract data of the land acquisition

df_contracts = pd.read_csv(
    "export2\contracts.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)

df_contracts.head()


Unnamed: 0,ID,Deal ID,Contract number,Contract date,Contract expiration date,Duration of the agreement,Comment on contract
0,1w4V2aI9,724,,,,5.0,
1,tGSS5q4E,778,,,,25.0,
2,0fXI63az,847,,,,30.0,
3,d24WOchE,849,,,,90.0,
4,PDMXavWg,851,,,,25.0,


In [27]:
df_contracts_clean = df_contracts[['ID', 'Deal ID', 'Duration of the agreement']]

In [28]:
df_contracts_clean.head()

Unnamed: 0,ID,Deal ID,Duration of the agreement
0,1w4V2aI9,724,5.0
1,tGSS5q4E,778,25.0
2,0fXI63az,847,30.0
3,d24WOchE,849,90.0
4,PDMXavWg,851,25.0


In [89]:
# Importing a csv containing the information on investors within the Land Matrix
df_investors = pd.read_csv(
    "export3\investors.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)

df_investors.head()

Unnamed: 0,Investor ID,Name,Country of registration/origin,Classification,Investor homepage,Opencorporates link,Comment,Action comment
0,2,Government of Bangladesh,Bangladesh,State-/government (owned) company,,,,
1,11,China Asean Resources Ltd.,"China, Hong Kong Special Administrative Region",Stock-exchange listed company,,,,
2,12,Agro Forestry Research,China,,,,,
3,14,Amira Nature Foods Ltd (ANFIF),United Arab Emirates,Stock-exchange listed company,https://www.amira.net/,,Amira Nature Foods Ltd is a food company prima...,
4,18,Bigimexco,Vietnam,Private company,,,,


In [85]:
investor_countries = df_investors.groupby(['Country of registration/origin'], group_keys = True)['Name'].count()
investor_countries #maybe we can add this to a new dataframe that counts deals per country

Country of registration/origin
Afghanistan                                               1
Albania                                                   6
Algeria                                                  21
Angola                                                   58
Argentina                                               428
Armenia                                                   1
Aruba                                                     1
Australia                                                88
Austria                                                  18
Bahamas                                                   1
Bahrain                                                   6
Bangladesh                                               41
Barbados                                                  3
Belarus                                                   7
Belgium                                                  25
Belize                                                    9
Benin    

In [86]:
investor_countries_clean = df_investors[['Investor ID', 'Country of registration/origin', 'Classification']]
investor_countries_clean.head()

Unnamed: 0,Investor ID,Country of registration/origin,Classification
0,2,Bangladesh,State-/government (owned) company
1,11,"China, Hong Kong Special Administrative Region",Stock-exchange listed company
2,12,China,
3,14,United Arab Emirates,Stock-exchange listed company
4,18,Vietnam,Private company


In [87]:
#Importing the involvements dataframe to be able to merge the other dataframes together
df_involvements = pd.read_csv(
    "export3\involvements.csv",
    sep=";",         
    engine="python",  
    encoding="utf-8",
)
df_involvements.head()

involvements_clean = df_involvements[['Investor ID Upstream', 'Involvement ID']]


In [88]:
#merging the involvements dataframe with the investors dataframe
involvements_investors = pd.merge(
    involvements_clean,
    investor_countries_clean,
    left_on='Investor ID Upstream',
    right_on='Investor ID',
    how='left'
)
involvements_investors.head()



Unnamed: 0,Investor ID Upstream,Involvement ID,Investor ID,Country of registration/origin,Classification
0,12,10,12.0,China,
1,14,11,14.0,United Arab Emirates,Stock-exchange listed company
2,20,13,20.0,United States of America,Investment fund
3,25,16,25.0,India,
4,27,17,27.0,United States of America,
