# PA2

In [16]:
import pandas as pd
import os
import math
from nltk.stem import PorterStemmer

In [17]:
folder_path = "data"
docs = []
for root, dirs, files in os.walk(folder_path):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        with open(file_path, "r") as file:
            doc = file.read()
            docs.append(doc)

## q1
Construct a dictionary based on the terms extracted from the given documents.
- Record the document frequency of each term.
- Save your dictionary as a txt file (dictionary.txt).

In [18]:
docs_txt = " ".join(docs)
print(len(docs_txt))

3293992


### tokenize (pa1)

In [19]:
def tokenize(doc: str, stopword: str):
    # lowercase
    doc = doc.lower()
    # 去除特殊符號
    # 以空格取代特殊符號
    new_doc = ""
    for i in doc:
        if (i >= 'a' and i <= 'z') or (i == ' '):
            new_doc += i
        else:
            new_doc += ' '

    # 斷詞
    tokens = [i for i in new_doc.split(" ")]

    # Stopword Removal
    stop_tokens = [i for i in tokens if i not in stopword]

    # Stemming
    porter = PorterStemmer()
    stemming_tokens = [porter.stem(i) for i in stop_tokens]

    return stemming_tokens

In [None]:
# 每份文件詞頻
def calcu_freq(tokenized_words: list):
    word_dict = {}
    for i in tokenized_words:
        if i in word_dict:
            word_dict[i] += 1
        else:
            word_dict[i] = 1

    return word_dict

In [None]:
# 計算每個 term 的 df
def calcu_df(all_terms: list, all_tokenized_doc: list):
    df_list = []
    for term in all_terms:
        df = 0
        for temp_doc in all_tokenized_doc:
            if term in temp_doc:
                df += 1
        df_list.append([term, df])
    
    return df_list

In [22]:
with open("stopwords.txt", "r") as file:
    stopword = file.read()

# 每篇文章 tokenize
# [[doc1], [doc2], ...]
all_tokenized_doc = [tokenize(i, stopword) for i in docs]

# 統計總共有哪些 term
all_tokenized_words = [i for k in all_tokenized_doc for i in k]
all_terms = list(set(all_tokenized_words))

In [None]:
df_list = calcu_df(all_terms, all_tokenized_doc)
print(df_list)

In [None]:
# 將 dict 存成 txt 並 sort
df = pd.DataFrame(df_list, columns=['term', 'df'])
# sort by term
df_sorted = df.sort_values(by='term', ascending=True).reset_index(drop=True)
# 製作 index
df_sorted['t_index'] = df_sorted.index + 1
# reorder columns
df_final = df_sorted[['t_index', 'term', 'df']]

df_final.to_csv("output/dictionary.txt", index=False)

## q2
Transfer each document into a tf-idf unit vector
- Save it as a txt file (DocID.txt).

- tf = (該 term 在該文件出現次數) / (該文件 term 總數)
- idf = log(N / df)
  - N = 文件總數
  - df = 包含該 term 的文件數
- Unit Vector
  - normalization
  - tfidf_i / |V|
  - |V| = sqrt(sum(tfidf_i ^ 2))

In [40]:
term_df = pd.read_csv("output/dictionary.txt")
term_df = term_df.set_index("term")

with open("stopwords.txt", "r") as file:
    stopword = file.read()

# 文件總數
N = len(all_tokenized_doc)
print(N)

# traverse each document
for idx, one_tokenized_doc in enumerate(all_tokenized_doc):
    
    # 計算該文件 term freq
    word_dict = calcu_freq(one_tokenized_doc)
    # 該文件長度
    term_len = len(one_tokenized_doc)
    
    tfidf_list = []
    visited = []
    # traverse each term in a document
    for term in one_tokenized_doc:
        # 確保不要有重複的 term 
        if term in visited:
            continue
        visited.append(term)
        # 計算 tf
        tf = word_dict[term] / term_len

        # 抓 dictionary 內的 df
        row = term_df.loc[term]
        t_index = row['t_index']
        t_df = row['df']

        idf = math.log10(N / t_df)

        tfidf = tf * idf

        
        tfidf_list.append([t_index, tfidf])
    
    
    tfidf_list = pd.DataFrame(tfidf_list, columns=['t_index', 'tf-idf'])
    # Unit Vector normalization
    tfidf_val = tfidf_list.copy()
    tfidf_val['v_length'] = tfidf_val['tf-idf'] ** 2
    v_length = math.sqrt(tfidf_val['v_length'].sum())
    tfidf_list['tf-idf'] = round(tfidf_list['tf-idf'] / v_length, 3)


    tfidf_list = tfidf_list.sort_values(by='t_index', ascending=True).reset_index(drop=True)


    save_path = "output/" + str(idx + 1) + ".txt"
    tfidf_list.to_csv(save_path, index=False)

1095


## q3
Write a function cosine(Docx, Docy) which loads the tf-idf
vectors of documents x and y and returns their cosine 
similarity

cosine similarity = (A · B) / (|A| * |B|)
- A · B -> 交集向量相乘後加總
- |A| -> sqrt(sum(tfidf ^ 2))

In [42]:
def cosine(DocX, DocY):
    DocX_df = pd.read_csv(DocX)
    DocY_df = pd.read_csv(DocY)

    # 找出交集的 term
    merged_df = pd.merge(DocX_df, DocY_df, on='t_index', how='inner', suffixes=('_x', '_y'))
    # 計算 X·Y
    merged_df['dot'] = merged_df['tf-idf_x'] * merged_df['tf-idf_y']
    inner_product = merged_df['dot'].sum()

    # 計算 |X|
    DocX_df['X_length'] = DocX_df['tf-idf'] ** 2
    X_length = math.sqrt(DocX_df['X_length'].sum())
    # 計算 |Y|
    DocY_df['Y_length'] = DocY_df['tf-idf'] ** 2
    Y_length = math.sqrt(DocY_df['Y_length'].sum())

    cosine_similarity = inner_product / (X_length * Y_length)
    return cosine_similarity

In [43]:
DocX = "output/1.txt"
DocY = "output/2.txt"
cosine_similarity = cosine(DocX, DocY)
print(cosine_similarity)

0.27714960226192065
