In [2]:
import pandas as pd

# Input tables: one CSV of company descriptions and one CSV of keyword tags
# per industry segment. Both paths are relative to the working directory.
COMPANY_CSV = 'Company Descriptions.csv'
INDUSTRY_CSV = 'Industry Segments - Top 10 Keywords.csv'

company_df = pd.read_csv(COMPANY_CSV)
industry_df = pd.read_csv(INDUSTRY_CSV)


In [5]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words from the NLTK corpus, held in a set so that the
# membership test done for every token is a hash lookup.
stop_words = {*stopwords.words('english')}

# Preprocess text
def preprocess_text(text):
    """Turn a raw text value into a list of lower-cased, stop-word-free tokens.

    Args:
        text: Value to process. Anything that is not a ``str`` is treated
            as having no content.

    Returns:
        list: Tokens produced by ``word_tokenize`` on the lower-cased,
        punctuation-free text, excluding those found in the module-level
        ``stop_words`` set. ``[]`` when ``text`` is not a string.
    """
    if not isinstance(text, str):  # Non-string values carry no tokens
        return []

    # Replace punctuation with a space rather than deleting it. Deleting
    # fuses words that are separated only by punctuation ("cloud,computing"
    # became "cloudcomputing"), which then can never match the same words
    # written separately on the other side of the comparison.
    cleaned = re.sub(r'[^\w\s]', ' ', text.lower())

    return [token for token in word_tokenize(cleaned) if token not in stop_words]

# Add a token-list column to each frame: (frame, raw text column, new column).
for frame, source_col, target_col in (
    (company_df, 'company_description', 'Processed_Description'),
    (industry_df, 'Tags', 'Processed_Tags'),
):
    frame[target_col] = frame[source_col].apply(preprocess_text)


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(description_tokens, tag_tokens):
    """Cosine similarity between two token collections seen as binary vectors.

    Each collection is reduced to the set of distinct tokens it contains
    (repeats do not add weight). For two 0/1 presence vectors the cosine
    similarity is ``|A & B| / sqrt(|A| * |B|)``, which is computed directly
    from the set sizes.

    Args:
        description_tokens: Iterable of hashable tokens for the description.
        tag_tokens: Iterable of hashable tokens for the tags.

    Returns:
        float: Similarity in ``[0.0, 1.0]``. ``0.0`` when either collection
        is empty, including the case where both are empty.
    """
    description_set = set(description_tokens)
    tag_set = set(tag_tokens)

    # An empty side is a zero vector: define the similarity as 0.0 instead of
    # building zero-width vectors, which a vector-based cosine cannot handle.
    if not description_set or not tag_set:
        return 0.0

    shared = len(description_set & tag_set)
    return shared / (len(description_set) * len(tag_set)) ** 0.5

# Calculate similarity for each company and industry segment.
# The (segment, tags) pairs are identical for every company, so they are
# collected once here instead of re-iterating industry_df per company.
industry_pairs = list(zip(industry_df['Industry segment'], industry_df['Processed_Tags']))

# One (company name, industry segment, similarity) tuple per combination,
# ordered by company row and, within a company, by industry row.
similarities = [
    (company_name, segment, calculate_similarity(description_tokens, tag_tokens))
    for company_name, description_tokens in zip(
        company_df['company_name'], company_df['Processed_Description']
    )
    for segment, tag_tokens in industry_pairs
]


In [9]:
from collections import defaultdict

# Best match seen so far per company: company name -> (segment, similarity).
# A company that has not been seen yet starts at ('', 0).
best_matches = defaultdict(lambda: ('', 0))

# Keep the highest-scoring segment for each company. The comparison is
# strict, so on equal scores the segment encountered first is kept.
for company_name, segment_name, score in similarities:
    _, best_score = best_matches[company_name]
    if score > best_score:
        best_matches[company_name] = (segment_name, score)


In [10]:
# Flatten the best-match mapping into (company, segment) rows, dropping the score.
output_data = [(name, match[0]) for name, match in best_matches.items()]

# Write the rows to Output.csv without the index column.
output_columns = ['Company Name', 'Best Matched Segment']
output_df = pd.DataFrame(output_data, columns=output_columns)
output_df.to_csv('Output.csv', index=False)