In [2]:
import pandas as pd
import math
from collections import Counter
import ast

# Read the CSV file
file_path = "./ReportPreprocessingtomcat.csv"  # Replace with the actual path to your CSV file
df = pd.read_csv(file_path)

# Prepare the working records from the `summary` and `fixed_files` columns
def prepare_data(df):
    """Turn every row of *df* into a plain record dictionary.

    Each row must provide the columns ``key``, ``summary`` and
    ``fixed_files``.  The ``summary`` and ``fixed_files`` cells are strings
    holding Python literals and are parsed with ``ast.literal_eval``; the
    parsed summary must be subscriptable by ``'stemmed'``.

    Returns:
        A list, in row order, of dicts with the keys ``"key"``,
        ``"summary_stemmed"`` and ``"fixed_files"``.
    """
    def to_record(row):
        # Parse the summary first so a malformed cell fails before anything
        # else in the row is touched.
        parsed_summary = ast.literal_eval(row['summary'])
        return {
            "key": row['key'],
            "summary_stemmed": parsed_summary['stemmed'],
            "fixed_files": ast.literal_eval(row['fixed_files']),
        }

    return [to_record(row) for _, row in df.iterrows()]

# Build the inverted index
def build_inverted_index(data):
    """Map every stemmed word to the list of documents that contain it.

    Args:
        data: iterable of records, each with a ``"key"`` (document id) and a
            ``"summary_stemmed"`` iterable of words.  Document ids must be
            hashable.

    Returns:
        A dict ``word -> [doc_id, ...]``.  Words appear in first-seen order
        and each posting list holds every document id at most once, in
        first-seen order.
    """
    inverted_index = {}
    # Per-word set of ids already posted: gives O(1) duplicate detection
    # instead of scanning the (possibly long) posting list for every token.
    posted = {}
    for item in data:
        doc_id = item["key"]
        for word in item["summary_stemmed"]:
            seen_ids = posted.setdefault(word, set())
            if doc_id in seen_ids:
                continue
            seen_ids.add(doc_id)
            inverted_index.setdefault(word, []).append(doc_id)
    return inverted_index

# Build the vocabulary
def build_vocabulary(inverted_index):
    """Assign each word of *inverted_index* a consecutive integer index.

    Indices start at 0 and follow the iteration order of *inverted_index*.

    Returns:
        A dict ``word -> index``.
    """
    vocabulary = {}
    for word in inverted_index:
        # Keys are unique, so the current size is the next free index.
        vocabulary[word] = len(vocabulary)
    return vocabulary

# Build the sparse VSM with ltc-weighted TF-IDF
def build_sparse_vsm_ltc(data, inverted_index, vocabulary):
    """Build one sparse, ltc-weighted TF-IDF vector per document.

    Weighting per term: ``(1 + ln(tf)) * ln(N / df)`` where ``N`` is
    ``len(data)`` and ``df`` is the length of the term's posting list in
    *inverted_index* (a term without a posting list gets idf 0).  Each
    document vector is then divided by its Euclidean norm, unless that norm
    is 0, in which case the weights are left as they are.

    Args:
        data: records with ``"key"`` and ``"summary_stemmed"``.
        inverted_index: dict ``word -> list of doc ids``.
        vocabulary: dict ``word -> integer index``; must contain every word
            that occurs in *data* (otherwise ``KeyError``).

    Returns:
        A dict ``doc_id -> {vocabulary index -> weight}``.
    """
    total_docs = len(data)
    vsm = {}
    for record in data:
        doc_id = record["key"]
        weights = {}
        for term, count in Counter(record["summary_stemmed"]).items():
            doc_freq = len(inverted_index.get(term, []))
            if doc_freq > 0:
                idf = math.log(total_docs / doc_freq)  # t: standard IDF
            else:
                idf = 0
            weights[term] = (1 + math.log(count)) * idf  # l: logarithmic TF

        length = math.sqrt(sum(w ** 2 for w in weights.values()))
        if length > 0:
            # c: cosine normalisation
            weights = {term: w / length for term, w in weights.items()}

        vsm[doc_id] = {vocabulary[term]: w for term, w in weights.items()}
    return vsm

# Compute the lnc-weighted vector of a query
def compute_query_lnc(query, vocabulary):
    """Build the sparse lnc vector of *query*.

    Terms that are not in *vocabulary* are ignored.  Every remaining term is
    weighted ``1 + ln(tf)`` (no IDF component) and the vector is divided by
    its Euclidean norm when that norm is positive.

    Args:
        query: iterable of (stemmed) terms.
        vocabulary: dict ``word -> integer index``.

    Returns:
        A dict ``vocabulary index -> weight``; empty when no query term is
        known.
    """
    # l: logarithmic TF, n: no IDF
    log_tf = {
        term: 1 + math.log(count)
        for term, count in Counter(query).items()
        if term in vocabulary
    }
    length = math.sqrt(sum(value ** 2 for value in log_tf.values()))
    if length > 0:
        # c: cosine normalisation
        return {vocabulary[term]: value / length for term, value in log_tf.items()}
    return {vocabulary[term]: value for term, value in log_tf.items()}

# Compute the cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the dot product of two sparse vectors given as dicts.

    Only indices present in both *vec1* and *vec2* contribute.  No
    normalisation happens here, so the result is a cosine similarity only
    when both inputs are already unit-length.  Returns the integer ``0``
    when the vectors share no index.
    """
    total = 0
    for idx, weight in vec1.items():
        if idx not in vec2:
            continue
        total += weight * vec2[idx]
    return total

# Find the top-k most similar summaries and their fixed files
def find_top_k_with_fixed_files(query, sparse_vsm, data, vocabulary, k=5):
    """Rank all documents against *query* and return the best *k*.

    The query is vectorised with ``compute_query_lnc`` and scored against
    every vector in *sparse_vsm* with ``cosine_similarity``.  Documents are
    ordered by descending score; ties keep the iteration order of
    *sparse_vsm*.

    Args:
        query: list of (stemmed) query terms.
        sparse_vsm: dict ``doc_id -> sparse vector``.
        data: records with ``"key"`` and ``"fixed_files"``.
        vocabulary: dict ``word -> integer index``.
        k: number of results to keep (slice ``[:k]`` of the ranking).

    Returns:
        A list of dicts with the keys ``"key"``, ``"similarity"`` and
        ``"fixed_files"``.  A ranked document with no record in *data* is
        skipped; when several records share a key the first one is used.
    """
    query_vector = compute_query_lnc(query, vocabulary)

    ranking = [
        (doc_id, cosine_similarity(query_vector, doc_vector))
        for doc_id, doc_vector in sparse_vsm.items()
    ]
    # Stable sort: highest similarity first.
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    results = []
    for doc_id, score in ranking[:k]:
        # Attach the fixed files of the first record carrying this key.
        record = next((item for item in data if item["key"] == doc_id), None)
        if record is None:
            continue
        results.append({
            "key": doc_id,
            "similarity": score,
            "fixed_files": record["fixed_files"],
        })
    return results

# Prepare the data and build the VSM
data = prepare_data(df)
inverted_index = build_inverted_index(data)
vocabulary = build_vocabulary(inverted_index)
sparse_vsm = build_sparse_vsm_ltc(data, inverted_index, vocabulary)

# Example query
query =  ['bug', 'wrong', 'statement', 'gener', 'use', 'attribut', 'page', 'direct']  # Replace with your own list of stemmed terms
top_k_results = find_top_k_with_fixed_files(query, sparse_vsm, data, vocabulary, k=3)

# Print the results
# NOTE: the "Top 3" heading below is hard-coded; keep it in sync with k=3 above.
print(f"Query: {query}")
print("Top 3 similar summaries and their fixed files:")
for result in top_k_results:
    print(f"Key: {result['key']}, Similarity: {result['similarity']:.4f}")
    print(f"Fixed Files: {result['fixed_files']}")
    print()


Query: ['bug', 'wrong', 'statement', 'gener', 'use', 'attribut', 'page', 'direct']
Top 3 similar summaries and their fixed files:
Key: 56012, Similarity: 0.9100
Fixed Files: ['java\\org\\apache\\jasper\\compiler\\PageInfo.java', ' java\\org\\apache\\jasper\\compiler\\Validator.java', '.']

Key: 54012, Similarity: 0.2779
Fixed Files: ['java\\org\\apache\\jasper\\compiler\\Compiler.java', ' java\\org\\apache\\jasper\\compiler\\PageInfo.java', ' java\\org\\apache\\jasper\\compiler\\TagPluginManager.java', ' java\\org\\apache\\jasper\\compiler\\tagplugin\\TagPluginContext.java', ' java\\org\\apache\\jasper\\tagplugins\\jstl\\core\\Set.java', '.']

Key: 48296, Similarity: 0.2610
Fixed Files: ['java\\org\\apache\\catalina\\ant\\jmx\\JMXAccessorTask.java', ' java\\org\\apache\\catalina\\tribes\\membership\\Membership.java', ' java\\org\\apache\\naming\\resources\\DirContextURLConnection.java', '.']

