In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Purpose: enrich the clustered postings (df_final) with the company name and
# job ID of the most textually similar posting in the original dataset
# (df_original), using TF-IDF vectors of the 'description' column and cosine
# similarity, then write the enriched table to a new CSV file.

# 1. Load your datasets
# Both files are required; nothing below can run without them.
try:
    df_final = pd.read_csv('job_postings_with_subclusters_v4.csv')
    df_original = pd.read_csv('postings_filtered.csv')
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure both CSV files are in the correct directory.")
    # `exit()` is an interactive helper installed by the `site` module and, when
    # called without arguments, reports success. Raise SystemExit explicitly so
    # the failure is signalled with a non-zero status.
    raise SystemExit(1) from e


# Ensure the description columns are clean and free of missing values (NaNs)
df_final['description'] = df_final['description'].fillna('')
df_original['description'] = df_original['description'].fillna('')

# 2. Initialize the TF-IDF Vectorizer
# This tool converts text into a matrix of TF-IDF features, focusing on the most significant words.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# 3. Create TF-IDF vectors for the descriptions
# Fit on the original data and transform both sets for a consistent vocabulary.
tfidf_original = vectorizer.fit_transform(df_original['description'])
tfidf_final = vectorizer.transform(df_final['description'])

# 4. Calculate cosine similarity between the two sets of descriptions
# This creates a similarity matrix: rows are from your final file, columns are from the original.
# NOTE: the result is a dense len(df_final) x len(df_original) array, so memory
# use grows with the product of the two row counts.
print("Calculating similarity between job descriptions...")
cosine_sim_matrix = cosine_similarity(tfidf_final, tfidf_original)

# 5. Find the best match for each job in your final dataframe
# For each job in df_final, this finds the POSITION (0-based row number) of the
# most similar job in df_original, together with the similarity score itself.
best_matches_indices = np.argmax(cosine_sim_matrix, axis=1)
best_match_scores = cosine_sim_matrix.max(axis=1)

# 6. Retrieve the company name and job ID using the matched positions
# argmax yields row positions, not index labels, so use positional indexing
# (.iloc). Label-based .loc only gives the same rows while df_original keeps
# its default 0..n-1 index.
matched_rows = df_original.iloc[best_matches_indices]
matched_company_names = matched_rows['company_name'].values
matched_job_ids = matched_rows['job_id'].values

# 7. Add the matched data as new columns to your final dataframe
df_final['company_name'] = matched_company_names
df_final['job_id'] = matched_job_ids
# Keep the score so weak matches can be filtered out later.
df_final['match_similarity'] = best_match_scores

# A score of 0 means the description shares no vocabulary term with any
# original posting (e.g. it was empty); argmax then falls back to the first
# row of df_original, so that "match" is meaningless.
zero_similarity_count = int((best_match_scores <= 0).sum())
if zero_similarity_count:
    print(
        f"\nWarning: {zero_similarity_count} job(s) had zero similarity to every original "
        "posting; their company_name/job_id values are not real matches "
        "(see the 'match_similarity' column)."
    )

print("\nSuccessfully matched and added company names and job IDs.")
# Display the first few rows of the updated dataframe to verify
print(df_final[['title', 'company_name', 'job_id', 'Final label']].head())

# 8. Save the final, enriched dataframe to a new CSV file
output_filename = 'job_postings_matched_with_companies.csv'
df_final.to_csv(output_filename, index=False)

print(f"\nYour new file has been saved as '{output_filename}'")

Calculating similarity between job descriptions...

Successfully matched and added company names and job IDs.
                                     title            company_name  \
0  Motion Graphic Designer and Film Editor       Elica Electric Co   
1                         Graphic Designer             Puffer Labs   
2                         Graphic Designer  Coalition Technologies   
3                        Graphic Designer              TekWissen ®   
4  Senior Graphic Designer - Presentations    HARMAN International   

       job_id     Final label  
0  3407282046  Graphic Design  
1  3869490625  Graphic Design  
2  3870352558  Graphic Design  
3  3884434792  Graphic Design  
4  3884436300  Graphic Design  

Your new file has been saved as 'job_postings_matched_with_companies.csv'
