In [40]:
import pandas as pd
import numpy as np

In [7]:
# Read the .txt file of each wiki article into a string.
def _read_article(path):
    """Return the full contents of the text file at ``path``.

    The encoding is given explicitly so decoding does not depend on the
    platform's default locale.
    NOTE(review): assumes the article files are saved as UTF-8 - confirm.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()


phen = _read_article('../text/phenomenology.txt')
epis = _read_article('../text/epistemology.txt')
dzog = _read_article('../text/dzogchen.txt')

In [23]:
# Build a word-by-document count table.
def text_to_word_count_df(texts, names):
    """Count whitespace-separated, lower-cased tokens in each text.

    Parameters
    ----------
    texts : sequence of str
        The raw documents.
    names : sequence of str
        Column label for each document; ``names[i]`` labels ``texts[i]``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token, one column per name, integer counts.
        A token that does not occur in a document gets 0.

    Tokens are split on whitespace only, so punctuation attached to a word
    stays part of the token.
    """
    per_text_counts = {}

    for position, text in enumerate(texts):
        # Texts sharing a name accumulate into the same tally.
        tally = per_text_counts.setdefault(names[position], {})
        for token in text.lower().split():
            if token in tally:
                tally[token] += 1
            else:
                tally[token] = 1

    counts_frame = pd.DataFrame(per_text_counts)

    # Tokens missing from a document come out as NaN; turn them into 0 counts.
    return counts_frame.fillna(0).astype(int)

# Count words in the three articles; one column per article.
df = text_to_word_count_df(
    texts=[phen, epis, dzog],
    names=['Phenomenology', 'Epistemology', 'Dzogchen'],
)
df.head(50)

Unnamed: 0,Phenomenology,Epistemology,Dzogchen
phenomenology,41,0,0
is,135,228,102
a,76,215,59
philosophical,4,7,1
study,4,10,1
and,99,257,144
movement,1,0,0
largely,2,0,0
associated,1,6,5
with,35,32,29


In [36]:
# Drop the rows of very common function words so they do not dominate the counts.
common_words = [
    'the', 'of', 'and', 'is', 'to', 'in', 'a', 'as', 'it', 'that', 'this', 'they',
    'while', 'about', 'with', 'has', 'or', 'are', 'which', 'on', 'by', 'these',
    'from', 'for',
]
# errors='ignore' skips any listed word that is not in the index.
df = df.drop(index=common_words, errors='ignore')
df.head(50)

Unnamed: 0,Phenomenology,Epistemology,Dzogchen
phenomenology,41,0,0
philosophical,4,7,1
study,4,10,1
movement,1,0,0
largely,2,0,0
associated,1,6,5
early,4,3,8
20th,2,3,0
century,2,8,3
seeks,2,1,0


In [37]:
# Print the top n words per column (10 by default) along with the column name.
def top_words(df, n=10):
    """Print the ``n`` highest-count rows of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Word-count table; each column is printed separately.
    n : int, optional
        Number of rows to show per column. Defaults to 10.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    for col in df.columns:
        print(f'\n{col}:')
        print(df[col].sort_values(ascending=False).head(n))

# Show the highest-count words for each column of df.
top_words(df)


Phenomenology:
phenomenology    41
consciousness    31
not              27
be               26
what             26
an               23
experience       23
object           22
husserl          20
one              17
Name: Phenomenology, dtype: int64

Epistemology:
knowledge        154
epistemology      63
belief            58
beliefs           44
how               43
not               40
between           35
their             35
justification     31
like              31
Name: Epistemology, dtype: int64

Dzogchen:
dzogchen     72
all          20
three        19
practice     17
practices    15
series       15
according    14
path         14
rigpa        13
be           13
Name: Dzogchen, dtype: int64


In [43]:
# Column totals of the count table. If the cells ran in order, df no longer
# contains the common-word rows, so these totals exclude those words.
print(f'Total words in Dzogchen: {df["Dzogchen"].sum()}')
print(f'Total words in Phenomenology: {df["Phenomenology"].sum()}')
print(f'Total words in Epistemology: {df["Epistemology"].sum()}')
print('\n')
# Raw dot product of the count vectors as an unnormalized similarity score.
# Double quotes inside the single-quoted f-strings keep these lines valid on
# Python versions before 3.12.
print(f'Similarity between Phenomenology and Epistemology: {np.dot(df["Phenomenology"], df["Epistemology"])}')
print(f'Similarity between Phenomenology and Dzogchen: {np.dot(df["Phenomenology"], df["Dzogchen"])}')
print(f'Similarity between Epistemology and Dzogchen: {np.dot(df["Epistemology"], df["Dzogchen"])}')

Total words in Dzogchen: 3063
Total words in Phenomenology: 2877
Total words in Epistemology: 5460


Similarity between Phenomenology and Epistemology: 11541
Similarity between Phenomenology and Dzogchen: 4395
Similarity between Epistemology and Dzogchen: 8348


In [44]:
# Problem: epistemology has more words in total, so the similarity score is higher
# Solution: normalize by the product of the magnitudes of the vectors => cosine similarity
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric vectors of equal length.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either vector has zero magnitude.
        Without that guard the division would be 0/0 and produce nan.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    if magnitude_product == 0:
        return 0.0
    return np.dot(a, b) / magnitude_product

In [45]:
# Cosine similarity for every pair of articles.
for label, left, right in [
    ('phen-epis', 'Phenomenology', 'Epistemology'),
    ('phen-dzog', 'Phenomenology', 'Dzogchen'),
    ('epis-dzog', 'Epistemology', 'Dzogchen'),
]:
    print(f'cosine similarity {label}: {cosine_similarity(df[left], df[right])}')

cosine similarity phen-epis: 0.331805125086239
cosine similarity phen-dzog: 0.26557055972949184
cosine similarity epis-dzog: 0.2377686323758884


## tf-idf
Find out which words are most distinctive of each text.

In [67]:
def tf_idf(df, term, document):
    """Return the tf-idf weight of ``term`` in ``document``.

    ``tf`` is ``log(1 + count / total count in the document)`` and ``idf`` is
    ``log(number of documents / number of documents containing the term)``.
    A term that occurs in every document therefore has ``idf = log(1) = 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table with one row per term and one column per document.
    term : str
        Row label to score.
    document : str
        Column label to score against. An unknown column raises ``KeyError``.

    Returns
    -------
    float
        The tf-idf weight. Returns 0.0 when the term is not in the index,
        when the document column sums to zero, or when no document contains
        the term; these cases would otherwise raise or evaluate to nan.
    """
    doc_counts = df[document]
    if term not in df.index:
        return 0.0

    total_terms = doc_counts.sum()
    docs_with_term = (df.loc[term] > 0).sum()
    if total_terms == 0 or docs_with_term == 0:
        return 0.0

    tf = np.log(1 + df.at[term, document] / total_terms)
    idf = np.log(len(df.columns) / docs_with_term)
    return tf * idf

In [75]:
# Spot-check tf-idf for a few (term, document) pairs.
for term, document in [
    ('mind', 'Phenomenology'),
    ('phenomenology', 'Phenomenology'),
    ('embodied', 'Epistemology'),
]:
    print(tf_idf(df, term, document))

0.0
0.015545765461703437
0.0


In [97]:
# Counts of the word 'mind' in each document.
df.loc['mind']

Phenomenology     1
Epistemology     10
Dzogchen          9
Name: mind, dtype: int64

In [102]:
def query_tf_idf(df, query):
    """Return the document whose summed tf-idf over the query terms is highest.

    The query is split on whitespace and the ``tf_idf`` score of each term is
    summed per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table with one row per term and one column per document.
    query : str
        Whitespace-separated query terms. Terms are matched against the index
        exactly as written; terms not in the index are ignored rather than
        raising ``KeyError``.

    Returns
    -------
    str
        The best-scoring column label, or ``'No match found'`` when every
        document scores 0 (including an empty query or a frame with no
        columns).
    """
    print('querying:', query)
    # Skip out-of-vocabulary terms; they cannot contribute to any score.
    known_terms = [term for term in query.split() if term in df.index]
    score = {
        col: sum(tf_idf(df, term, col) for term in known_terms)
        for col in df.columns
    }
    if not score or max(score.values()) == 0:
        return 'No match found'
    return max(score, key=score.get)

In [105]:
# Try a few single-word queries against the three articles.
for query in ('philosophy', 'relax', 'knowledge', 'belief'):
    print(query_tf_idf(df, query))


querying: philosophy
Phenomenology
querying: relax
Dzogchen
querying: knowledge
No match found
querying: belief
Epistemology


## PPMI
