In [None]:
import os
import json
import openai
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

from dotenv import load_dotenv
# Load environment variables from a dotenv file, then read the OpenAI key from them.
# NOTE(review): the dotenv path is absolute and machine-specific; consider making it
# configurable or relative to the project so the notebook runs elsewhere.
load_dotenv('/home/pervinco/LLM-tutorials/keys.env')
# os.getenv returns None when the variable is not set.
openai_api_key = os.getenv('GRAVY_LAB_OPENAI')

from src.data_processor import translate_and_convert_to_string, process_vision_result, extract_workstyle_info

In [None]:
# --- Experiment configuration ---
# Input files: the ground-truth reference and the data to be evaluated.
gt_path = "./data/amy_gt.json"
data_path = "./data/amy_culture_fit.json"

# Number of completions requested per prompt, and the sampling temperature.
n_iter = 100
temperature = 0.1
# 1 selects the prompts in src.prompt_processor; any other value selects
# the prompts in src.cot_prompt_processor.
prompt_version = 1
embed_model = "text-embedding-3-large"

output_dir = "./result"
# Derive the file-name prefix from the prompt version so that CoT and non-CoT
# runs are never written to (or read back from) the same CSV file.
prompt_label = "Non-CoT" if prompt_version == 1 else "CoT"
csv_file_name = f"{prompt_label}-N_{n_iter}.csv"

In [None]:
# Pick the prompt set: version 1 imports from src.prompt_processor, any other
# value imports from src.cot_prompt_processor. Both branches bind the same
# three names, so the rest of the notebook is independent of the choice.
if prompt_version == 1:
    from src.prompt_processor import vision_prompt, workstyle_prompt, summary_prompt
else:
    from src.cot_prompt_processor import vision_prompt, workstyle_prompt, summary_prompt

In [None]:
# OpenAI client used for the chat-completion requests and passed to the embedding analysis.
client = openai.OpenAI(api_key=openai_api_key)

In [None]:
# Load the input data to be evaluated.
with open(data_path, 'r', encoding='utf-8') as file:
    hr_data_dict = json.load(file)

# Load the ground-truth reference from its own file (gt_path); reading
# data_path here would make the ground truth a copy of the input.
with open(gt_path, 'r', encoding="utf-8") as file:
    gt_data_dict = json.load(file)

In [None]:
# Convert the raw result sections with the project helpers from src.data_processor.
# Judging by the printed outputs in the following cells, the summary becomes a text
# block and the vision/workstyle results become dicts of keyword/score strings.
# NOTE(review): the helpers' exact contracts are not visible here — confirm in src.data_processor.
processed_data_summary = translate_and_convert_to_string(hr_data_dict['summaryResult'])
vision_data = process_vision_result(hr_data_dict['visionResult'], hr_data_dict['summaryResult'])
workstyle_data = extract_workstyle_info(hr_data_dict['workstyleResult'], hr_data_dict['summaryResult'])

In [None]:
# Inspect the processed summary.
print(processed_data_summary)


채용 권장 수준, 입사 후 적응 기간, 조기 퇴사 가능성 : 
조기 퇴사 가능성: 낮음
입사 후 적응 기간: 보통
채용 권장 수준: 보통

검사 항목별 결과 : 
5) 사고방식이 기업 비전,가치관에 부합하는가?: 매우 그렇다
2) 타 팀, 타 구성원과의 원만한 협업을 기대할 수 있는가?: 그렇다
3) 경영진, 상급자와의 원활한 소통을 기대할 수 있는가?: 그렇다
4) 기업이 추구하는 일하는 방식과 부합하는가?: 매우 그렇다
1) 구성원들과 원활한 소통이 가능한가?: 그렇다

이직 스트레스 요인 : 
공정인사

위험 성향 : 오만형


In [None]:
# Inspect the processed vision data.
print(vision_data)

{'company_top_keywords': '전문성:4.6, 성과:4.2, 사회공헌:4.2', 'company_remaining_keywords': '상생:4.0, 최고지향:3.6, 고객:3.0, 성장:3.0', 'compute_top_keywords': '창조:5.0, 열정:5.0, 혁신:4.58', 'compute_remaining_keywords': '신속성:4.23, 사회공헌:4.17, 고객:4.0, 성장:3.85, 소통:3.75, 성과:3.33, 도전:3.21, 상생:3.13, 최고지향:2.78, 문제해결:2.5, 인재:2.27, 즐거움:1.5, 전문성:0.38', 'compute_vision_total_evalation': '보통'}


In [None]:
# Show the whole workstyle dict first, then each entry on its own line.
print(workstyle_data)

for k in workstyle_data:
    v = workstyle_data[k]
    print(k, v)

{'company_keywords': '스피드형:3.2, 책임형:2.7, 목표지향형:4.0, 끈기형:3.0, 긍정형:3.5, 유니크형:2.9, 혁신형:3.4, 도전형:3.0, 스마트형:3.5, 윤리형:2.2, 성취형:2.2, 열린사고형:3.3, 솔선수범형:3.5, 몰입형:4.3, 신뢰형:1.8, 자기확신형:2.7', 'compute_keywords': '스피드형:4.33, 책임형:0.63, 목표지향형:3.33, 끈기형:0.67, 긍정형:2.5, 유니크형:5.0, 혁신형:5.0, 도전형:3.5, 스마트형:2.75, 윤리형:1.25, 성취형:3.44, 열린사고형:3.75, 솔선수범형:1.88, 몰입형:3.44, 신뢰형:0.65, 자기확신형:3.0', 'workstyle_match_percentage': 18.75, 'workstyle_company_total_score': 49.2, 'workstyle_compute_total_score': 45.12, 'comparison_ratio': 91.70731707317073, 'compute_workstyle_total_evalation': '우수'}
company_keywords 스피드형:3.2, 책임형:2.7, 목표지향형:4.0, 끈기형:3.0, 긍정형:3.5, 유니크형:2.9, 혁신형:3.4, 도전형:3.0, 스마트형:3.5, 윤리형:2.2, 성취형:2.2, 열린사고형:3.3, 솔선수범형:3.5, 몰입형:4.3, 신뢰형:1.8, 자기확신형:2.7
compute_keywords 스피드형:4.33, 책임형:0.63, 목표지향형:3.33, 끈기형:0.67, 긍정형:2.5, 유니크형:5.0, 혁신형:5.0, 도전형:3.5, 스마트형:2.75, 윤리형:1.25, 성취형:3.44, 열린사고형:3.75, 솔선수범형:1.88, 몰입형:3.44, 신뢰형:0.65, 자기확신형:3.0
workstyle_match_percentage 18.75
workstyle_company_total_score 49.2
workstyle_compu

In [None]:
# User message for the vision prompt: the raw company and candidate ("compute")
# vision entries, one labelled line each, wrapped in leading/trailing newlines.
vision_input = (
    "\n"
    f"기업의 비전 데이터: {hr_data_dict['visionResult']['company']}\n"
    f"피검사자의 비전 데이터: {hr_data_dict['visionResult']['compute']}\n"
)

# User message for the workstyle prompt: one "label : value" line per field of
# workstyle_data. Each label must be unique; the match percentage previously
# reused the "workstyle_compute_total_score" label, giving the model two
# different values under the same name.
workstyle_input = f"""
company_keywords : {workstyle_data['company_keywords']}
compute_keywords : {workstyle_data['compute_keywords']}
workstyle_company_total_score : {workstyle_data['workstyle_company_total_score']}
workstyle_compute_total_score : {workstyle_data['workstyle_compute_total_score']}
workstyle_match_percentage : {workstyle_data['workstyle_match_percentage']}
comparison_ratio: {workstyle_data['comparison_ratio']}
compute_workstyle_total_evaluation : {workstyle_data['compute_workstyle_total_evalation']}
"""

# User message for the summary prompt: four fields of summaryResult, one
# labelled line each, wrapped in leading/trailing newlines.
# NOTE(review): the label "turnOVerFactors" differs in case from the key
# 'turnOverFactors'; left as-is because the prompt may refer to it — confirm.
summary_input = (
    "\n"
    f"additionalInformation : {hr_data_dict['summaryResult']['additionalInformation']}\n"
    f"recruitentQuestions : {hr_data_dict['summaryResult']['recruitentQuestions']}\n"
    f"turnOVerFactors : {hr_data_dict['summaryResult']['turnOverFactors']}\n"
    f"fued :{hr_data_dict['summaryResult']['fued']}\n"
)

In [None]:
def run_openai_api(n_iter, prompt, input, temperature, model="gpt-4o", api_client=None):
    """Send the same chat request ``n_iter`` times and collect every reply.

    Args:
        n_iter: Number of completions to request, one request per completion.
        prompt: Content of the system message.
        input: Content of the user message. (The name shadows the builtin; it
            is kept so existing keyword callers continue to work.)
        temperature: Sampling temperature forwarded to the API.
        model: Chat model name. Defaults to "gpt-4o", the value that was
            previously hard-coded.
        api_client: Object exposing ``chat.completions.create``. When omitted,
            the notebook-level ``client`` is used.

    Returns:
        A list with one ``{"iteration": int, "response": ...}`` dict per
        request; ``iteration`` starts at 1 and ``response`` is the message
        content of the first choice. The list is empty when ``n_iter <= 0``.
    """
    # Resolve the client lazily so the global is only required when no client is passed.
    active_client = client if api_client is None else api_client

    # The conversation is identical for every repetition, so build it once.
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": input},
    ]

    results = []
    for iteration in range(1, n_iter + 1):
        completion = active_client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        results.append({
            "iteration": iteration,
            "response": completion.choices[0].message.content,
        })
    return results

In [None]:
# Side-by-side look at the company and candidate ("compute") vision keywords:
# top keywords first, then the remaining ones, then the overall evaluation.
# A single print with sep="\n" emits the same text as five separate prints.
print(
    f"company top3 : {vision_data['company_top_keywords']}\n",
    f"compute top3 : {vision_data['compute_top_keywords']}\n",
    f"compnay remain : {vision_data['company_remaining_keywords']}\n",
    f"compute remain : {vision_data['compute_remaining_keywords']}\n",
    f"fianl eval : {vision_data['compute_vision_total_evalation']}",
    sep="\n",
)

company top3 : 전문성:4.6, 성과:4.2, 사회공헌:4.2

compute top3 : 창조:5.0, 열정:5.0, 혁신:4.58

compnay remain : 상생:4.0, 최고지향:3.6, 고객:3.0, 성장:3.0

compute remain : 신속성:4.23, 사회공헌:4.17, 고객:4.0, 성장:3.85, 소통:3.75, 성과:3.33, 도전:3.21, 상생:3.13, 최고지향:2.78, 문제해결:2.5, 인재:2.27, 즐거움:1.5, 전문성:0.38

fianl eval : 보통


In [None]:
# Collect n_iter completions for each prompt type (vision, workstyle, summary).
# Each call sends n_iter sequential chat-completion requests, i.e. 3 * n_iter requests in total.
# The three assignments are kept separate so that results already collected stay
# available in the kernel if a later call fails.
vision_results = run_openai_api(n_iter, vision_prompt, vision_input, temperature)
workstyle_results = run_openai_api(n_iter, workstyle_prompt, workstyle_input, temperature)
summary_results = run_openai_api(n_iter, summary_prompt, summary_input, temperature)

In [None]:
def create_results_dataframe(vision_results, workstyle_results, summary_results=None):
    """Combine the per-type result lists into one long-format DataFrame.

    Args:
        vision_results: Records (e.g. list of dicts) for the vision prompt.
        workstyle_results: Records for the workstyle prompt.
        summary_results: Records for the summary prompt. When omitted, the
            notebook-level ``summary_results`` variable is used if it exists
            (this keeps the original two-argument call working); if it does
            not exist either, the frame contains no 'summary' rows.

    Returns:
        A single DataFrame holding all records, in the order vision,
        workstyle, summary, with a fresh 0..n-1 index and an added ``type``
        column set to 'vision', 'workstyle' or 'summary'.
    """
    if summary_results is None:
        # Backward compatibility: this function used to read the global directly.
        summary_results = globals().get("summary_results")

    labelled = [("vision", vision_results), ("workstyle", workstyle_results)]
    if summary_results is not None:
        labelled.append(("summary", summary_results))

    # Tag every frame with its origin before stacking them.
    frames = [pd.DataFrame(records).assign(type=label) for label, records in labelled]
    return pd.concat(frames, ignore_index=True)

In [None]:
df = create_results_dataframe(vision_results, workstyle_results)

# Create the output directory if needed; to_csv does not create missing folders.
os.makedirs(output_dir, exist_ok=True)
csv_path = os.path.join(output_dir, csv_file_name)
# 'utf-8-sig' prepends a BOM (presumably so spreadsheet tools detect UTF-8).
df.to_csv(csv_path, index=False, encoding='utf-8-sig')
print(f"Results saved to {csv_path}")

Results saved to ./result/CoT_N-1.csv


In [None]:
# Reload the saved CSV; from here on `df` holds the on-disk version of the results.
df = pd.read_csv("/".join((output_dir, csv_file_name)))
# Flat list of every response text, across all types.
responses = df.loc[:, "response"].tolist()

In [None]:
def calculate_embedding_similarity_and_embeddings(responses, client, model=None):
    """Embed each response and report the mean cosine similarity between them.

    Args:
        responses: Texts to embed. One embeddings request is sent per text.
        client: Object exposing ``embeddings.create``.
        model: Embedding model name. When omitted, the notebook-level
            ``embed_model`` setting is used.

    Returns:
        ``(mean_similarity, embeddings)`` where ``embeddings`` is a NumPy
        array with one row per response and ``mean_similarity`` is the mean
        of the full cosine-similarity matrix. Note that the full matrix
        includes the diagonal, i.e. each response's similarity with itself.
        For an empty ``responses`` the result is ``(nan, empty array)``
        instead of an error from the similarity computation.
    """
    responses = list(responses)
    if not responses:
        # No texts: nothing to embed and no similarity to average.
        return float("nan"), np.array([])

    # Resolve the model lazily so the global is only needed when none is passed.
    model_name = embed_model if model is None else model

    vectors = [
        client.embeddings.create(input=text, model=model_name).data[0].embedding
        for text in responses
    ]

    embeddings = np.array(vectors)
    similarity_matrix = cosine_similarity(embeddings)
    mean_similarity = similarity_matrix.mean()

    return mean_similarity, embeddings

In [None]:
def calculate_lexical_similarity(responses):
    """Mean lexical overlap between all pairs of responses.

    The overlap of two responses is the Jaccard index of their sets of
    whitespace-separated tokens: ``|A & B| / |A | B|``.

    Args:
        responses: Sequence of response strings.

    Returns:
        The mean overlap over every unordered pair of distinct positions.
        Returns ``nan`` when there are fewer than two responses, because no
        pair exists to compare (this used to raise ZeroDivisionError).
    """
    from itertools import combinations

    def lexical_overlap(words1, words2):
        union = words1 | words2
        if not union:
            # Both responses have no tokens: treat them as identical
            # instead of dividing by zero.
            return 1.0
        return len(words1 & words2) / len(union)

    # Tokenise each response once rather than once per pair.
    token_sets = [set(response.split()) for response in responses]

    lexical_similarities = [
        lexical_overlap(first, second)
        for first, second in combinations(token_sets, 2)
    ]
    if not lexical_similarities:
        return float("nan")

    return sum(lexical_similarities) / len(lexical_similarities)

In [None]:
def analyze_responses(df, client):
    """Compute consistency metrics for each response type.

    For each of 'vision', 'workstyle' and 'summary' (in that order), the rows
    of ``df`` whose ``type`` column matches are selected and their
    ``response`` values are scored.

    Args:
        df: DataFrame with at least the columns ``type`` and ``response``.
        client: Client forwarded to the embedding similarity calculation.

    Returns:
        A dict keyed by response type. Each value is a dict with
        'semantic_similarity', 'lexical_similarity' and 'embeddings'.
    """
    def _metrics_for(kind):
        texts = df.loc[df['type'] == kind, 'response'].tolist()

        # Semantic score first (embedding based), then the lexical one.
        semantic, vectors = calculate_embedding_similarity_and_embeddings(texts, client)
        lexical = calculate_lexical_similarity(texts)

        return {
            'semantic_similarity': semantic,
            'lexical_similarity': lexical,
            'embeddings': vectors,
        }

    return {kind: _metrics_for(kind) for kind in ('vision', 'workstyle', 'summary')}

In [None]:
# Score the saved responses, one entry per response type.
analysis_results = analyze_responses(df, client)

# Report both metrics for every type, rounded to two decimals.
# One print with sep="\n" emits the same three lines as three separate prints.
for response_type in analysis_results:
    metrics = analysis_results[response_type]
    print(
        f"\nResults for {response_type.upper()}:",
        f"Semantic Similarity (mean): {metrics['semantic_similarity']:.2f}",
        f"Lexical Overlap (mean): {metrics['lexical_similarity']:.2f}",
        sep="\n",
    )

ZeroDivisionError: division by zero