In [None]:
import pandas as pd
import os

# Set the path to the directory containing the CSV files.
# NOTE(review): this is an absolute, machine-specific path, and the same literal
# is assigned to data_path again in later cells of this notebook — keep the
# copies in sync if the location ever changes.
data_path = "/Users/tommyly/network/data/embeddingdata/opencorporates/us_ny_unzipped"



In [None]:
# Function to load a CSV file
def load_csv(file_name, base_dir=None, **read_csv_kwargs):
    """Load one CSV file into a pandas DataFrame.

    Args:
        file_name: Name of the CSV file, joined onto the base directory.
        base_dir: Directory that contains the file. When omitted, the
            notebook-level ``data_path`` is used, so existing one-argument
            calls keep working unchanged.
        **read_csv_kwargs: Extra keyword arguments forwarded to
            ``pandas.read_csv`` (for example ``dtype=str`` to keep leading
            zeros in identifier columns, or ``usecols`` to limit memory).
            ``low_memory=False`` is applied unless the caller overrides it.

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    # Resolve the directory lazily so the global is only needed for the default.
    directory = data_path if base_dir is None else base_dir
    read_csv_kwargs.setdefault('low_memory', False)
    return pd.read_csv(os.path.join(directory, file_name), **read_csv_kwargs)

# Read every table into its own DataFrame; map() loads the files in the order
# listed, and the unpacking binds each result to its matching name.
(
    additional_identifiers_df,
    alternative_names_df,
    companies_df,
    non_reg_addresses_df,
    officers_df,
) = map(
    load_csv,
    (
        'additional_identifiers.csv',
        'alternative_names.csv',
        'companies.csv',
        'non_reg_addresses.csv',
        'officers.csv',
    ),
)

In [None]:
# Show the column labels of the officers table (loaded from officers.csv).
officers_df.columns

In [None]:
# Show the column labels of the addresses table (loaded from non_reg_addresses.csv).
non_reg_addresses_df.columns

In [None]:
# Show the column labels of the alternative-names table (loaded from alternative_names.csv).
alternative_names_df.columns

In [None]:
# Show the column labels of the identifiers table (loaded from additional_identifiers.csv).
additional_identifiers_df.columns

In [None]:
# Show the column labels of the companies table (loaded from companies.csv).
companies_df.columns

In [None]:
# Per-company tally of every officer position; unstack() moves the position
# level of the index into columns (one row per company_number).
officer_hierarchy = (
    officers_df
    .groupby('company_number')['position']
    .value_counts()
    .unstack()
)

# Overall frequency of each position across the whole officers table
position_distribution = officers_df['position'].value_counts()

print(position_distribution)

In [None]:
# Officer rows per company_number, largest groups first
officers_per_company = (
    officers_df
    .groupby('company_number')
    .size()
    .sort_values(ascending=False)
)

# head() defaults to the first five entries
print(officers_per_company.head())

In [None]:
# Frequency of each value in the 'type' column of the alternative-names table
name_types = alternative_names_df.loc[:, 'type'].value_counts()

print(name_types)

In [None]:
# Tally companies by the country and by the region of their registered address
country_distribution = companies_df['registered_address.country'].value_counts()
region_distribution = companies_df['registered_address.region'].value_counts()

# Each print emits the heading and the ten most frequent values on separate lines
print("Top 10 countries:", country_distribution.head(10), sep="\n")
print("\nTop 10 regions:", region_distribution.head(10), sep="\n")

In [None]:
# When the companies table carries a 'branch' column, report how often each
# of its values occurs (membership on a DataFrame tests its column labels).
if 'branch' in companies_df:
    branch_counts = companies_df['branch'].value_counts()
    print("Branch types:", branch_counts, sep="\n")

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import networkx as nx
import os

# Convert CSV to Parquet
def csv_to_parquet(csv_path, parquet_path):
    """Read the CSV at ``csv_path`` and write its contents to ``parquet_path``.

    The whole file is parsed with pandas in a single pass (``low_memory=False``),
    converted to an Arrow table, and written out with pyarrow.
    """
    frame = pd.read_csv(csv_path, low_memory=False)
    pq.write_table(pa.Table.from_pandas(frame), parquet_path)

# Convert all CSV files to Parquet
data_path = "/Users/tommyly/network/data/embeddingdata/opencorporates/us_ny_unzipped"
# sorted() makes the conversion order (and the printed log) deterministic;
# os.listdir() returns entries in arbitrary order.
for file in sorted(os.listdir(data_path)):
    if file.endswith('.csv'):
        csv_path = os.path.join(data_path, file)
        # Swap only the trailing extension. str.replace('.csv', '.parquet')
        # would also rewrite any '.csv' that occurs earlier in the file name
        # (e.g. 'a.csv.old.csv' -> 'a.parquet.old.parquet').
        parquet_path = os.path.splitext(csv_path)[0] + '.parquet'
        csv_to_parquet(csv_path, parquet_path)
        print(f"Converted {file} to Parquet")


In [None]:
import os
import networkx as nx
import pyarrow.parquet as pq

data_path = "/Users/tommyly/network/data/embeddingdata/opencorporates/us_ny_unzipped"

# Create a graph
G = nx.Graph()

# Add company nodes: one node per row, keyed by company_number
companies = pq.read_table(os.path.join(data_path, 'companies.parquet'))
company_data = companies.to_pydict()
# Null names come back from to_pydict() as None, and networkx's GEXF writer
# raises TypeError on None attribute values, so store '' instead.
# A generator avoids materialising a second full-size list of node tuples.
G.add_nodes_from(
    (company_number, {'type': 'company', 'name': name if name is not None else ''})
    for company_number, name in zip(company_data['company_number'], company_data['name'])
)

# Add officer nodes and edges
officers = pq.read_table(os.path.join(data_path, 'officers.parquet'))
officer_data = officers.to_pydict()
# One node per officer row, keyed "officer_<id>". The display name is built
# only from the name parts that are present: formatting a null part with an
# f-string would embed the literal text "None" in the name.
G.add_nodes_from(
    (
        f"officer_{officer_id}",
        {'type': 'officer', 'name': " ".join(str(part) for part in (first, last) if part)},
    )
    for officer_id, first, last in zip(
        officer_data['id'], officer_data['first_name'], officer_data['last_name']
    )
)
# Link each officer to its company. A null position is stored as '' because
# networkx's GEXF writer raises TypeError on None attribute values.
# NOTE(review): this assumes company_number has the same Python type here as
# in the company nodes; if the two tables were inferred differently (int vs
# str) the edges would silently create new attribute-less nodes — confirm.
G.add_edges_from(
    (company, f"officer_{officer_id}", {'relationship': position if position is not None else ''})
    for company, officer_id, position in zip(
        officer_data['company_number'], officer_data['id'], officer_data['position']
    )
)

# Add address nodes and edges
addresses = pq.read_table(os.path.join(data_path, 'non_reg_addresses.parquet'))
address_data = addresses.to_pydict()
# One node per address row, keyed by row position ("address_<i>"). A null
# in_full value is stored as '' because networkx's GEXF writer raises
# TypeError on None attribute values.
G.add_nodes_from(
    (f"address_{i}", {'type': 'address', 'full_address': addr if addr is not None else ''})
    for i, addr in enumerate(address_data['in_full'])
)
# Row i of company_number pairs with node address_<i> created above.
# NOTE(review): the rows come from non_reg_addresses, yet the edge is labelled
# 'registered_at' — confirm the label is the intended one.
G.add_edges_from(
    (company, f"address_{i}", {'relationship': 'registered_at'})
    for i, company in enumerate(address_data['company_number'])
)

# Print some basic network statistics
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
# Count the components once and derive connectivity from the count:
# nx.is_connected() would traverse the graph a second time and raises
# NetworkXPointlessConcept when the graph has no nodes.
component_count = nx.number_connected_components(G)
print(f"Is connected: {component_count == 1}")
print(f"Number of connected components: {component_count}")

# Write the graph in GEXF format into the data directory
gexf_path = os.path.join(data_path, "opencorporates_graph.gexf")
nx.write_gexf(G, gexf_path)
print("Graph saved successfully.")