In [1]:
# Stem of the dataset file to analyse (the CSV name without its extension)
base_name = "organization_descriptions"

# Full path of the input CSV, derived from the stem above
data_path = "/Users/tom.willcocks/Downloads/" + base_name + ".csv"

In [2]:
# Choose the text column to analyse for the selected dataset.
if base_name == 'organization_descriptions':
    col = "description"
elif base_name == 'organizations':
    col = "short_description"
else:
    # Fail here with a clear message instead of leaving `col` undefined,
    # which would otherwise only surface later as a NameError.
    raise ValueError(
        f"Unsupported base_name {base_name!r}: expected "
        "'organization_descriptions' or 'organizations'"
    )

In [3]:
import pandas as pd

# Read the input CSV located at `data_path` into the working DataFrame `df`
df = pd.read_csv(data_path)

In [4]:
# Display the DataFrame's (rows, columns) dimensions
df.shape

(1399685, 9)

In [5]:
# Build a plain Python list from the selected text column (`col`), with NaN replaced by ''
# and every value cast to str. The result is not written back to `df` here.
# NOTE(review): `sample_descriptions` is not referenced by any other cell shown — confirm it is still needed.
sample_descriptions = df[col].fillna('').astype(str).tolist()

In [6]:
# Flag rows whose text mentions "heat pump" (case-insensitive).
# na=False makes missing descriptions count as "no match", so the new column is
# strictly boolean rather than a mix of True/False/NaN.
df['contains_heat_pump'] = df[col].str.contains("heat pump", case=False, na=False)

# Print how many descriptions contain "heat pump"
print(df['contains_heat_pump'].sum())

714


In [7]:
import spacy
import random

# Filesystem location of the trained spaCy pipeline to load
model_path = "/Users/tom.willcocks/Documents/code/discovery_utils/tests/heat_pump_model_spacy"

# Components to skip when loading the pipeline. The recorded output of this cell shows
# 'tok2vec' and 'textcat' remaining active afterwards.
components_to_disable = ['tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
nlp = spacy.load(model_path, disable=components_to_disable)

# Print the names of the components still in the pipeline
print("Active pipeline components:", nlp.pipe_names)

Active pipeline components: ['tok2vec', 'textcat']


In [8]:
# Category key read from `doc.cats`, and the score above which a description
# counts as relevant. Adjust both to match the loaded model.
relevant_label = "HEAT_PUMP_RELEVANT"
relevance_threshold = 0.5

# Ensure descriptions are strings, replacing NaN values with an empty string
df[col] = df[col].fillna('').astype(str)

# One boolean per description, in DataFrame row order
classification_results = []

# Total number of documents, used for progress reporting
total_docs = len(df[col])
last_reported_percent = 0

print("Processing descriptions...")

# Run the texts through the pipeline with nlp.pipe and track progress
for i, doc in enumerate(nlp.pipe(df[col]), start=1):
    # Report each newly reached whole percent once. Integer arithmetic avoids
    # the float-modulo rounding that can skip or shift a milestone
    # (e.g. 11 % 1.1 evaluates to ~1.0999 rather than 0).
    percent_done = i * 100 // total_docs
    if percent_done > last_reported_percent:
        last_reported_percent = percent_done
        print(f"Processing complete: {i / total_docs * 100:.1f}%")

    # A missing label would otherwise default to a score of 0 for every row
    # and silently classify the whole dataset as not relevant.
    if relevant_label not in doc.cats:
        raise ValueError(
            f"Model output has no {relevant_label!r} category; "
            f"available categories: {sorted(doc.cats)}"
        )

    classification_results.append(doc.cats[relevant_label] > relevance_threshold)

# Add the model's relevance determination back to the DataFrame
df['heat_pump_classifier'] = classification_results

# Count how many companies are identified as being about heat pumps by the model
heat_pump_companies_count = df['heat_pump_classifier'].sum()
print(f"Number of companies about heat pumps: {heat_pump_companies_count}")


Processing descriptions...
Processing complete: 1.0%
Processing complete: 2.0%
Processing complete: 3.0%
Processing complete: 4.0%
Processing complete: 5.0%
Processing complete: 6.0%
Processing complete: 7.0%
Processing complete: 8.0%
Processing complete: 9.0%
Processing complete: 10.0%
Processing complete: 11.0%
Processing complete: 12.0%
Processing complete: 13.0%
Processing complete: 14.0%
Processing complete: 15.0%
Processing complete: 16.0%
Processing complete: 17.0%
Processing complete: 18.0%
Processing complete: 19.0%
Processing complete: 20.0%
Processing complete: 21.0%
Processing complete: 22.0%
Processing complete: 23.0%
Processing complete: 24.0%
Processing complete: 25.0%
Processing complete: 26.0%
Processing complete: 27.0%
Processing complete: 28.0%
Processing complete: 29.0%
Processing complete: 30.0%
Processing complete: 31.0%
Processing complete: 32.0%
Processing complete: 33.0%
Processing complete: 34.0%
Processing complete: 35.0%
Processing complete: 36.0%
Processing

In [9]:
from datetime import datetime

# Timestamp keeps each export's file name unique
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
file_name = f"heatpump_{base_name}_{timestamp}.csv"

# Select the identifying columns, the text column and the model verdict.
# The verdict is stored in `df` as 'heat_pump_classifier' (the previous
# selection of 'heat_pump_spaCy' raised KeyError on a fresh run because no
# cell creates that column). It is written out under the 'heat_pump_spaCy'
# header so the CSV keeps the column name this cell originally referenced.
# A separate frame is used so `df` stays intact and the cell can be re-run.
export_df = df[['id', 'name', col, 'heat_pump_classifier']].rename(
    columns={'heat_pump_classifier': 'heat_pump_spaCy'}
)

# Save the selection to a CSV file with the timestamped file name
export_df.to_csv(file_name, index=False)

print(f"DataFrame saved as {file_name}")

DataFrame saved as heatpump_organization_descriptions_20240321-134557.csv


In [10]:
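# NOTE: the code below is disabled. As written it would not run if uncommented:
# `descriptions_list` is not defined in the cells shown (In [5] builds `sample_descriptions`),
# and it reads doc.cats["RELEVANT"] whereas In [8] reads "HEAT_PUMP_RELEVANT".

# # The desired number of samples for each category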
# desired_samples = 3

# # Initialize lists to store positives and negatives
# positives = []
# negatives = []

# # Dynamically adjust the sample size for efficiency
# sample_size = 1000  # Start with a smaller sample size
# while len(positives) < desired_samples or len(negatives) < desired_samples:
#     random_sample = random.sample(descriptions_list, sample_size)
#     for doc in nlp.pipe(random_sample, batch_size=500):
#         # Check if enough samples have been collected
#         if len(positives) >= desired_samples and len(negatives) >= desired_samples:
#             break
#         # Classify and store the document based on its category
#         if doc.cats["RELEVANT"] >= 0.5:  # Adjust threshold as needed
#             positives.append(doc.text)
#         else:
#             negatives.append(doc.text)
#     # Double the sample size for the next iteration if more samples are needed
#     sample_size = min(sample_size * 2, 50000)  # Cap the sample size at 50,000

# # Randomly sample 3 positives and 3 negatives for review
# sample_positives = random.sample(positives, min(desired_samples, len(positives)))
# sample_negatives = random.sample(negatives, min(desired_samples, len(negatives)))

# # Review your samples by printing them one after the other
# print("Sample RELEVANT:")
# for text in sample_positives:
#     print(text)
#     print("---")  # Separator for readability

# print("Sample NOT_RELEVANT:")
# for text in sample_negatives:
#     print(text)
#     print("---")  # Separator for readability