In [1]:
from openpyxl import load_workbook

# Open the case-data workbook with openpyxl.
# data_only=True: per the openpyxl docs, formula cells yield their cached value
# rather than the formula text.
workbook_path = '/Users/myself/Desktop/walmartCaseData.xlsx'
workbook = load_workbook(filename=workbook_path, data_only=True)

# Worksheet names in workbook order; echoed as the cell's output
sheet_names = workbook.sheetnames
sheet_names


['Exhibit 6 Probability of Custom',
 'Exhibit 9 Selected Financials 2',
 'Employment and Store Metrics',
 'Exhibit 12 Walmart vs. Amazon P',
 'Exhibit 14 Walmart vs. Amazon C',
 'Exhibit 15 Walmart Allocation o']

In [2]:
import pandas as pd

# Load every sheet into its own DataFrame, keyed by sheet name.
# Passing the whole list to `sheet_name` makes pandas open and parse the
# workbook once, instead of re-reading the file for each sheet.
dfs = pd.read_excel('/Users/myself/Desktop/walmartCaseData.xlsx', sheet_name=sheet_names)

# Map each sheet name to the list of its column headers and display it
column_names = {sheet: list(df.columns) for sheet, df in dfs.items()}
column_names


{'Exhibit 6 Probability of Custom': ['Store Distance (miles)',
  'Population Density 1 (thousands)',
  'Population Density 5 (thousands)',
  'Population Density 10 (thousands)',
  'Population Density 20 (thousands)',
  'Population Density 50 (thousands)',
  'Population Density 100 (thousands)',
  'Population Density 250 (thousands)'],
 'Exhibit 9 Selected Financials 2': ['Metric',
  'Walmart',
  'Target',
  'Dollar General',
  'Kroger'],
 'Employment and Store Metrics': ['Metric',
  'Walmart',
  'Target',
  'Dollar General',
  'Kroger'],
 'Exhibit 12 Walmart vs. Amazon P': ['Metric', 'Walmart.com', 'Amazon'],
 'Exhibit 14 Walmart vs. Amazon C': ['Category',
  'Amazon $',
  'Amazon %',
  'Walmart $',
  'Walmart %'],
 'Exhibit 15 Walmart Allocation o': ['Category',
  '2020 ($ millions)',
  '2019 ($ millions)']}

In [None]:
from transformers import BertModel, BertTokenizer
import torch
from scipy.spatial.distance import cosine

# Load the pretrained BERT tokenizer and encoder from one shared checkpoint name
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Function to generate embeddings for text
def generate_embeddings(text):
    """Embed text by mean-pooling BERT's final hidden states over real tokens.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A string, or a list of strings, to embed. Inputs longer than
            512 tokens are truncated by the tokenizer.

    Returns:
        A 2-D numpy array with one row per input text (a single string
        yields one row) and one column per hidden dimension of the model.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Pure inference: no_grad avoids building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state

    # Weight each position by the attention mask so padding tokens (present
    # when a batch of unequal-length texts is passed) do not dilute the mean.
    # For a single string the mask is all ones, i.e. a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden_states * mask).sum(dim=1) / token_counts

    return pooled.detach().numpy()

# Target field names and their embeddings
target_fields = ['population_density', 'income_per_capita']
target_embeddings = {field: generate_embeddings(field) for field in target_fields}

# Flatten the per-sheet column lists into one list (duplicates kept)
all_column_names = [column for sublist in column_names.values() for column in sublist]

# Several sheets share headers, so embed each distinct name only once.
# dict.fromkeys keeps first-seen order, so the resulting dict has the same
# keys in the same order as embedding every entry would produce, without the
# redundant model forward passes.
column_embeddings = {
    column: generate_embeddings(column)
    for column in dict.fromkeys(all_column_names)
}

# Function to calculate cosine similarity
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity (1 - cosine distance) of two embeddings.

    Args:
        embedding1: Array-like vector; may carry singleton dimensions, such
            as the (1, hidden) arrays returned by ``generate_embeddings``.
        embedding2: Array-like vector with the same number of elements.

    Returns:
        The similarity as a float: 1.0 for identical directions and 0.0 for
        orthogonal vectors.

    Raises:
        ValueError: Propagated from SciPy if an input is still not 1-D after
            singleton dimensions are removed (e.g. a multi-row batch).
    """
    import numpy as np  # local import: numpy is not imported at file level

    # scipy's cosine() requires 1-D vectors and current releases reject 2-D
    # input outright, so drop singleton axes first. atleast_1d keeps a
    # one-element input from collapsing to a 0-d scalar.
    vector1 = np.atleast_1d(np.squeeze(np.asarray(embedding1)))
    vector2 = np.atleast_1d(np.squeeze(np.asarray(embedding2)))
    return 1 - cosine(vector1, vector2)

# Score every (column, target field) pair by the cosine similarity of their embeddings
similarity_scores = {
    (col_name, field_name): calculate_cosine_similarity(col_vec, field_vec)
    for col_name, col_vec in column_embeddings.items()
    for field_name, field_vec in target_embeddings.items()
}

similarity_scores
