# Merging and Cleaning ICIJ Data   

This notebook merges and cleans the raw ICIJ datasets so they can be used for later analysis.

In [5]:
## importing packages and libraries that we've used in class
import pandas as pd
import numpy as np
import random
import re
import string
import requests
import plotnine 
from plotnine import *
import matplotlib.pyplot as plt
import yaml


## sklearn imports
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

## Show the result of every top-level expression in a cell, not only the last one
## (this is why cells below can display several values without wrapping them in print)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



### Code to merge datasets

In [6]:

# Load the relationship (edge) table; each row links a 'start' node to an 'end' node
rels = pd.read_csv("relationships.csv")

# Read each node file, and add a 'node_type' column recording which file a row came from
addresses      = pd.read_csv("nodes-addresses.csv"     ).assign(node_type="address")
entities       = pd.read_csv("nodes-entities.csv"      ).assign(node_type="entity")
intermediaries = pd.read_csv("nodes-intermediaries.csv").assign(node_type="intermediary")
officers       = pd.read_csv("nodes-officers.csv"      ).assign(node_type="officer")
others         = pd.read_csv("nodes-others.csv"        ).assign(node_type="other")

# Stack node files into a single dataset
nodes = pd.concat(
    [addresses, entities, intermediaries, officers, others],
    ignore_index=True,
    sort=False
)

# Remember the edge count so we can detect a left join that multiplies rows
n_relationships = len(rels)

# Merge in the node metadata twice: once for the 'start' node, once for the 'end' node.
# Every node column is suffixed with the side name, which brings in e.g.
# name_start / countries_start / node_type_start and name_end / countries_end / node_type_end.
# After suffixing, the node key is called node_id_<side> on both frames, so `on=` is enough.
for side in ("start", "end"):
    rels = rels.merge(
        nodes.add_suffix(f"_{side}"),
        on=f"node_id_{side}",
        how="left"
    )

# A left join only adds rows when a node_id occurs more than once in `nodes`;
# report it instead of failing so the notebook still runs end to end.
if len(rels) != n_relationships:
    print(
        f"WARNING: merge changed the row count from {n_relationships} to {len(rels)}; "
        "check `nodes` for duplicated node_id values."
    )

# DataFrame.info() prints its summary itself and returns None,
# so it must not be wrapped in print() (that would emit a stray 'None').
rels.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3339267 entries, 0 to 3339266
Data columns (total 54 columns):
 #   Column                          Dtype 
---  ------                          ----- 
 0   node_id_start                   int64 
 1   node_id_end                     int64 
 2   rel_type                        object
 3   link                            object
 4   status                          object
 5   start_date                      object
 6   end_date                        object
 7   sourceID                        object
 8   address_start                   object
 9   name_start                      object
 10  countries_start                 object
 11  country_codes_start             object
 12  sourceID_start                  object
 13  valid_until_start               object
 14  note_start                      object
 15  node_type_start                 object
 16  original_name_start             object
 17  former_name_start               object
 18  ju

In [7]:
# Preview the first five rows, then report how many rows the merged table holds
rels.head(5)
print("Number of rows in merged relationships dataset:")
rels.shape[0]



Unnamed: 0,node_id_start,node_id_end,rel_type,link,status,start_date,end_date,sourceID,address_start,name_start,...,internal_id_end,incorporation_date_end,inactivation_date_end,struck_off_date_end,dorm_date_end,status_end,service_provider_end,ibcRUC_end,type_end,closed_date_end
0,10002580,14106952,registered_address,registered address,,,,Panama Papers,STEMBRIDGE TRUST (IRELAND) LIMITED 5 THE COURT...,FUSION TRADING LIMITED,...,,,,,,,,,,
1,10004460,14101133,registered_address,registered address,,,,Panama Papers,MF CORPORATE (UK) LIMITED 520 S. 7TH STREET S...,MF CORPORATE (UK) LIMITED,...,,,,,,,,,,
2,10023813,14105100,registered_address,registered address,,,,Panama Papers,REIG INVESTMENTS LLP INVISION HOUSE WILBURY WA...,REIG INVESTMENTS LLP,...,,,,,,,,,,
3,10023840,14100712,registered_address,registered address,,,,Panama Papers,MARCUSSI INTERNATIONAL LLP INVISION HOUSE WILB...,MARCUSSI INTERNATIONAL LLP,...,,,,,,,,,,
4,10010428,14093957,registered_address,registered address,,,,Panama Papers,COLLYER BRISTOW LLP SOLICITORS 4 BEDFORD ROW L...,PULSAR HOLDINGS CORPORATION,...,,,,,,,,,,


Number of rows in merged relationships dataset:


3339267

### Saving Uncleaned Merged Dataset

In [8]:
# Persist the merged (still uncleaned) table; the dataframe index is not written out
rels.to_csv(path_or_buf="rels.csv", index=False)

### Creating Uncleaned Sample Dataset to experiment on

In [9]:
# Draw 300,000 rows at random from the uncleaned merge.
# The fixed random_state makes the draw repeatable for the same input.
rels_sample_dirty = rels.sample(n=300_000, random_state=42)

# Write the "dirty" sample to disk without the dataframe index
rels_sample_dirty.to_csv(path_or_buf="rels_sample_dirty.csv", index=False)

# Cleaning Merged Dataset

In [10]:
# Create an independent copy of the merged dataset to be cleaned.
# A plain `df = rels` would only bind a second name to the same object, so the
# in-place cleaning steps below (column renaming, drop_duplicates, column drops)
# would silently modify `rels` as well.
# NOTE: this temporarily holds two full tables in memory.
df = rels.copy()

In [11]:
# Standardize column names, one step at a time:
#   1. trim leading/trailing whitespace and lowercase
#   2. remove punctuation (anything that is neither a word character nor whitespace)
#   3. turn each run of whitespace into a single underscore
trimmed_lower = df.columns.str.strip().str.lower()
without_punct = trimmed_lower.str.replace(r'[^\w\s]', '', regex=True)
df.columns = without_punct.str.replace(r'\s+', '_', regex=True)

# Show the resulting column labels
print("Standardized columns:")
print(df.columns.tolist())

Standardized columns:
['node_id_start', 'node_id_end', 'rel_type', 'link', 'status', 'start_date', 'end_date', 'sourceid', 'address_start', 'name_start', 'countries_start', 'country_codes_start', 'sourceid_start', 'valid_until_start', 'note_start', 'node_type_start', 'original_name_start', 'former_name_start', 'jurisdiction_start', 'jurisdiction_description_start', 'company_type_start', 'internal_id_start', 'incorporation_date_start', 'inactivation_date_start', 'struck_off_date_start', 'dorm_date_start', 'status_start', 'service_provider_start', 'ibcruc_start', 'type_start', 'closed_date_start', 'address_end', 'name_end', 'countries_end', 'country_codes_end', 'sourceid_end', 'valid_until_end', 'note_end', 'node_type_end', 'original_name_end', 'former_name_end', 'jurisdiction_end', 'jurisdiction_description_end', 'company_type_end', 'internal_id_end', 'incorporation_date_end', 'inactivation_date_end', 'struck_off_date_end', 'dorm_date_end', 'status_end', 'service_provider_end', 'ibcruc_

In [12]:
## converting date columns to datetime format

# Column names that are expected to contain date information:
# incorporation dates for start/end nodes and start/end dates for relationships.
date_cols = ['incorporation_date_start', 'incorporation_date_end', 'start_date', 'end_date']

# Restrict to the columns that actually exist, and use the same list for both the
# conversion and the report below (indexing df with a missing name raises KeyError).
present_date_cols = [col for col in date_cols if col in df.columns]

for col in present_date_cols:
    n_before = df[col].notna().sum()

    # errors='coerce' replaces every value that cannot be parsed with NaT (Not a Time)
    # instead of raising, so the column always ends up with a datetime dtype.
    df[col] = pd.to_datetime(df[col], errors='coerce')

    # Coercion is silent; report how many previously non-missing values were lost.
    # NOTE(review): if the raw files mix several date formats, a large number here
    # may mean valid dates are being discarded - confirm against the raw data.
    n_lost = n_before - df[col].notna().sum()
    if n_lost:
        print(f"{col}: {n_lost} non-missing values could not be parsed and were set to NaT")

# Converted columns show up with a datetime64 dtype; unparseable entries inside
# them appear as NaT values, not as a different column dtype.
print("Data types after parsing dates:")
print(df[present_date_cols].dtypes)




Data types after parsing dates:
incorporation_date_start    datetime64[ns]
incorporation_date_end      datetime64[ns]
start_date                  datetime64[ns]
end_date                    datetime64[ns]
dtype: object


In [13]:
# Remember the row count before de-duplication so the number removed can be reported.
initial_count = df.shape[0]

# Drop rows that are identical in every column, keeping the first occurrence.
# inplace=True mutates `df` itself rather than returning a new dataframe.
df.drop_duplicates(keep="first", inplace=True)

# Rows removed = rows before - rows after.
n_dropped = initial_count - df.shape[0]
print(f"Dropped {n_dropped} duplicate rows.")


Dropped 8177 duplicate rows.


In [14]:
# Select all columns with the 'object' data type, which typically holds text.
text_cols = df.select_dtypes(include='object').columns.tolist()

for col in text_cols:
    # Collapse runs of whitespace into a single space, then trim both ends.
    cleaned = (
        df[col]
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # The .str methods return NaN for every element that is not a string, so an
    # object column holding mixed types (e.g. numbers next to text) would lose
    # its non-string values. Fall back to the original value wherever the cleaned
    # result is missing: real NaNs stay NaN, non-string values are preserved.
    df[col] = cleaned.where(cleaned.notna(), df[col])

# NOTE(review): normalising whitespace can make rows identical that previously
# differed only in spacing; duplicates are dropped before this step, so such rows
# are not removed - confirm whether a second de-duplication is wanted.


In [15]:
## Just in case: drop columns with 100% missing values

# One boolean per column: True when every value in that column is missing (NaN/NaT)
all_missing = df.isna().all(axis=0)

# Keep only the True entries; their index labels are the names of the empty columns
empty_cols = all_missing[all_missing].index.tolist()

# Remove those columns from `df` itself (inplace=True, no new dataframe is returned)
df.drop(columns=empty_cols, inplace=True)

# Report what was removed; an empty list means no column was entirely missing
print(f"Dropped empty columns: {empty_cols}")



Dropped empty columns: []


In [16]:
# Preview the first five rows of the cleaned dataset
df.head(5)

Unnamed: 0,node_id_start,node_id_end,rel_type,link,status,start_date,end_date,sourceid,address_start,name_start,...,internal_id_end,incorporation_date_end,inactivation_date_end,struck_off_date_end,dorm_date_end,status_end,service_provider_end,ibcruc_end,type_end,closed_date_end
0,10002580,14106952,registered_address,registered address,,NaT,NaT,Panama Papers,STEMBRIDGE TRUST (IRELAND) LIMITED 5 THE COURT...,FUSION TRADING LIMITED,...,,NaT,,,,,,,,
1,10004460,14101133,registered_address,registered address,,NaT,NaT,Panama Papers,MF CORPORATE (UK) LIMITED 520 S. 7TH STREET SU...,MF CORPORATE (UK) LIMITED,...,,NaT,,,,,,,,
2,10023813,14105100,registered_address,registered address,,NaT,NaT,Panama Papers,REIG INVESTMENTS LLP INVISION HOUSE WILBURY WA...,REIG INVESTMENTS LLP,...,,NaT,,,,,,,,
3,10023840,14100712,registered_address,registered address,,NaT,NaT,Panama Papers,MARCUSSI INTERNATIONAL LLP INVISION HOUSE WILB...,MARCUSSI INTERNATIONAL LLP,...,,NaT,,,,,,,,
4,10010428,14093957,registered_address,registered address,,NaT,NaT,Panama Papers,COLLYER BRISTOW LLP SOLICITORS 4 BEDFORD ROW L...,PULSAR HOLDINGS CORPORATION,...,,NaT,,,,,,,,


# Saving the full merged and cleaned dataset

In [17]:
## Write the merged and cleaned dataset to disk (the dataframe index is not written out)
df.to_csv(path_or_buf="ICIJ_Merged.csv", index=False)

### Saving new sample dataset after merged dataset has been cleaned

In [18]:
# Draw 300,000 rows at random from the cleaned dataset.
# The fixed random_state makes the draw repeatable for the same input.
rels_sample2 = df.sample(n=300_000, random_state=42)

# Save the cleaned sample without the dataframe index
rels_sample2.to_csv(path_or_buf="rels_sample2.csv", index=False)