In [2]:
from openpyxl import load_workbook

# Open the case-data workbook. data_only=True asks openpyxl for the values
# stored in the cells rather than the formulas behind them.
workbook = load_workbook(
    filename='/Users/myself/Desktop/walmartCaseData.xlsx',
    data_only=True,
)

# Worksheet titles as reported by the workbook; shown as the cell's output.
sheet_names = workbook.sheetnames
sheet_names


['Exhibit 6 Probability of Custom',
 'Exhibit 9 Selected Financials 2',
 'Employment and Store Metrics',
 'Exhibit 12 Walmart vs. Amazon P',
 'Exhibit 14 Walmart vs. Amazon C',
 'Exhibit 15 Walmart Allocation o']

In [3]:
import pandas as pd

# Load every sheet into a DataFrame with a single read_excel call. Passing the
# list of sheet names makes pandas open and parse the file once and return a
# dict keyed by sheet name, instead of re-reading the workbook for each sheet.
dfs = pd.read_excel('/Users/myself/Desktop/walmartCaseData.xlsx', sheet_name=sheet_names)

# Column headers of each sheet, keyed by sheet name; shown as the cell's output.
column_names = {sheet: list(df.columns) for sheet, df in dfs.items()}
column_names


{'Exhibit 6 Probability of Custom': ['Store Distance (miles)',
  'Population Density 1 (thousands)',
  'Population Density 5 (thousands)',
  'Population Density 10 (thousands)',
  'Population Density 20 (thousands)',
  'Population Density 50 (thousands)',
  'Population Density 100 (thousands)',
  'Population Density 250 (thousands)'],
 'Exhibit 9 Selected Financials 2': ['Metric',
  'Walmart',
  'Target',
  'Dollar General',
  'Kroger'],
 'Employment and Store Metrics': ['Metric',
  'Walmart',
  'Target',
  'Dollar General',
  'Kroger'],
 'Exhibit 12 Walmart vs. Amazon P': ['Metric', 'Walmart.com', 'Amazon'],
 'Exhibit 14 Walmart vs. Amazon C': ['Category',
  'Amazon $',
  'Amazon %',
  'Walmart $',
  'Walmart %'],
 'Exhibit 15 Walmart Allocation o': ['Category',
  '2020 ($ millions)',
  '2019 ($ millions)']}

In [8]:
from transformers import BertModel, BertTokenizer
import torch
from scipy.spatial.distance import cosine

import numpy as np  # needed by calculate_cosine_similarity below

# Initialize BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


# The two helpers are defined here, before their first use, so that this cell
# runs on a fresh kernel (Restart & Run All). A later cell defines the same two
# names again; it is only reached after this one.
def generate_embeddings(text):
    """Embed ``text`` with the notebook-level BERT ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model, and ``last_hidden_state`` is averaged over dimension 1.

    Returns the pooled result as a NumPy array.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Inference only: no_grad keeps autograd from recording the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()


def calculate_cosine_similarity(embedding1, embedding2):
    """Return ``1 - cosine distance`` between two embeddings.

    Size-1 axes are squeezed away first because scipy's ``cosine`` is given
    the squeezed arrays and expects 1-D vectors.
    """
    return 1 - cosine(np.squeeze(embedding1), np.squeeze(embedding2))


# Target field names and their embeddings
target_fields = ['population_density', 'income_per_capita']
target_embeddings = {field: generate_embeddings(field) for field in target_fields}

# Flatten the list of column names to generate their embeddings
all_column_names = [column for sublist in column_names.values() for column in sublist]
# Several sheets share header names (e.g. 'Metric', 'Walmart'); embed each
# distinct name once, in first-seen order, rather than recomputing it per sheet.
column_embeddings = {
    column: generate_embeddings(column)
    for column in dict.fromkeys(all_column_names)
}

# Cosine similarity for every (column, target field) pair
similarity_scores = {
    (column, target_field): calculate_cosine_similarity(column_embedding, target_embedding)
    for column, column_embedding in column_embeddings.items()
    for target_field, target_embedding in target_embeddings.items()
}

similarity_scores


{('Store Distance (miles)', 'population_density'): 0.5665388107299805,
 ('Store Distance (miles)', 'income_per_capita'): 0.5849781632423401,
 ('Population Density 1 (thousands)',
  'population_density'): 0.7867864966392517,
 ('Population Density 1 (thousands)', 'income_per_capita'): 0.6690823435783386,
 ('Population Density 5 (thousands)',
  'population_density'): 0.7854726314544678,
 ('Population Density 5 (thousands)', 'income_per_capita'): 0.6524317860603333,
 ('Population Density 10 (thousands)',
  'population_density'): 0.7861742377281189,
 ('Population Density 10 (thousands)',
  'income_per_capita'): 0.6583580374717712,
 ('Population Density 20 (thousands)',
  'population_density'): 0.775789737701416,
 ('Population Density 20 (thousands)',
  'income_per_capita'): 0.6480918526649475,
 ('Population Density 50 (thousands)',
  'population_density'): 0.7724123597145081,
 ('Population Density 50 (thousands)',
  'income_per_capita'): 0.6379535794258118,
 ('Population Density 100 (thousa

In [9]:
import numpy as np

def generate_embeddings(text):
    """Embed ``text`` with the notebook-level BERT ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model, and ``last_hidden_state`` is averaged over dimension 1.

    Parameters
    ----------
    text : str
        Text to embed.
        NOTE(review): the tokenizer is called with ``padding=True``, so a list
        of strings would presumably be accepted too, but the plain mean would
        then include padded positions. Confirm only single strings are passed.

    Returns
    -------
    numpy.ndarray
        ``last_hidden_state.mean(dim=1)`` converted to a NumPy array.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Inference only: no_grad keeps autograd from recording the forward pass,
    # so no graph is built just to be discarded by the detach below.
    with torch.no_grad():
        outputs = model(**inputs)
    # detach() is kept as a cheap guard so .numpy() never sees a tensor that
    # still requires grad.
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity (``1 - cosine distance``) of two embeddings.

    Each input is passed through ``np.squeeze`` first, so an array carrying
    extra size-1 axes is reduced before scipy's ``cosine`` is applied.
    """
    vec_a, vec_b = np.squeeze(embedding1), np.squeeze(embedding2)
    distance = cosine(vec_a, vec_b)
    return 1 - distance

# Example usage with checks: a smoke test that embeds two short strings and
# prints their cosine similarity.
try:
    emb1 = generate_embeddings("example text 1")
    emb2 = generate_embeddings("example text 2")
    similarity = calculate_cosine_similarity(emb1, emb2)
    print("Cosine Similarity:", similarity)
except Exception as e:
    # Any ordinary exception from the steps above is reported as a one-line
    # message instead of a traceback, so it does not stop the notebook.
    print("Error:", e)


Cosine Similarity: 0.972713828086853


In [10]:

# Score every (column, target field) pair: the outer iteration walks the column
# embeddings, the inner one the target-field embeddings, so keys are inserted
# column by column.
similarity_scores = {
    (column, target_field): calculate_cosine_similarity(column_embedding, target_embedding)
    for column, column_embedding in column_embeddings.items()
    for target_field, target_embedding in target_embeddings.items()
}

similarity_scores


{('Store Distance (miles)', 'population_density'): 0.5665388107299805,
 ('Store Distance (miles)', 'income_per_capita'): 0.5849781632423401,
 ('Population Density 1 (thousands)',
  'population_density'): 0.7867864966392517,
 ('Population Density 1 (thousands)', 'income_per_capita'): 0.6690823435783386,
 ('Population Density 5 (thousands)',
  'population_density'): 0.7854726314544678,
 ('Population Density 5 (thousands)', 'income_per_capita'): 0.6524317860603333,
 ('Population Density 10 (thousands)',
  'population_density'): 0.7861742377281189,
 ('Population Density 10 (thousands)',
  'income_per_capita'): 0.6583580374717712,
 ('Population Density 20 (thousands)',
  'population_density'): 0.775789737701416,
 ('Population Density 20 (thousands)',
  'income_per_capita'): 0.6480918526649475,
 ('Population Density 50 (thousands)',
  'population_density'): 0.7724123597145081,
 ('Population Density 50 (thousands)',
  'income_per_capita'): 0.6379535794258118,
 ('Population Density 100 (thousa