# CSC461 – Assignment4 – NLP
## 11-12-2023
## **Muhammad Umar Asif**
## **FA21-BSE-075**

The assignment is about finding
*   BOW
*   Term Frequency
*   Inverse Document Frequency
*   TF-IDF

And calculate cosine, manhattan, and euclidean distances.




In [78]:
#import important libraries

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cityblock

import pandas as pd
import math

In [79]:
data = (
    "data science is one of the most important courses in computer science",
    "this is one of the best data science courses",
    "the data scientists perform data analysis"
)

data

('data science is one of the most important courses in computer science',
 'this is one of the best data science courses',
 'the data scientists perform data analysis')

In [80]:
# find bag of words
count_vectorizer = CountVectorizer()
c_vector_matrix= count_vectorizer.fit_transform(data)

c_tokens = count_vectorizer.get_feature_names_out()

bow_matrix = c_vector_matrix.toarray()
df_bow = pd.DataFrame(data = bow_matrix, columns = c_tokens)
df_bow

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0,0,1,1,1,1,1,1,1,1,1,0,2,0,1,0
1,0,1,0,1,1,0,0,1,0,1,1,0,1,0,1,1
2,1,0,0,0,2,0,0,0,0,0,0,1,0,1,1,0


In [81]:
# find term frequency

def tf(corpus):
    dic_list = []
    for document in corpus:
        dic = {}
        total_words = len(document.split())
        for word in document.split():
            if word in dic:
                dic[word] += 1
            else:
                dic[word] = 1
        for word, freq in dic.items():
            dic[word] = round(freq / total_words, 2)
        dic_list.append(dic)
    return dic_list

result = tf(data)

df_tf = pd.DataFrame(result).fillna(0)
df_tf


Unnamed: 0,data,science,is,one,of,the,most,important,courses,in,computer,this,best,scientists,perform,analysis
0,0.08,0.17,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.0,0.0,0.0,0.0,0.0
1,0.11,0.11,0.11,0.11,0.11,0.11,0.0,0.0,0.11,0.0,0.0,0.11,0.11,0.0,0.0,0.0
2,0.33,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.17,0.17


In [83]:
# find inverse document frequency

def idf(corpus):
    N = len(corpus)
    term_doc_count = {}
    for document in corpus:
        unique_terms = set(document.split())
        for term in unique_terms:
            term_doc_count[term] = term_doc_count.get(term, 0) + 1

    idf_dict = {}
    for term, doc_count in term_doc_count.items():
        idf_dict[term] = round(math.log(N / (1+doc_count)), 2)

    return idf_dict


idf_dict = idf(data)
df_idf = pd.DataFrame(list(idf_dict.items()), columns=['Term', 'IDF'])
print(df_idf)


          Term   IDF
0    important  0.41
1      science  0.00
2         data -0.29
3           in  0.41
4      courses  0.00
5     computer  0.41
6          the -0.29
7           is  0.00
8         most  0.41
9          one  0.00
10          of  0.00
11        this  0.41
12        best  0.41
13     perform  0.41
14    analysis  0.41
15  scientists  0.41


In [89]:
# find tf-idf

tfidf_vect = TfidfVectorizer()
tfidf_matrix = round(tfidf_vect.fit_transform(data),2)

t_tokens = tfidf_vect.get_feature_names_out()
df_tfidf = pd.DataFrame(data = tfidf_matrix.toarray(), columns = t_tokens)
df_tfidf

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0.0,0.0,0.33,0.25,0.19,0.33,0.33,0.25,0.33,0.25,0.25,0.0,0.5,0.0,0.19,0.0
1,0.0,0.42,0.0,0.32,0.25,0.0,0.0,0.32,0.0,0.32,0.32,0.0,0.32,0.0,0.25,0.42
2,0.46,0.0,0.0,0.0,0.54,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.46,0.27,0.0


In [86]:
#generate cosine similarity matrix

t_cosine_similarity_matrix = cosine_similarity(tfidf_matrix)
t_cosine_similarity_matrix

#convert cosine similarity matrix to Pandas dataframe

df_t_similarity = pd.DataFrame(data = t_cosine_similarity_matrix)
print("cosine similarity in tf-idf:\n")
df_t_similarity

cosine similarity in tf-idf:



Unnamed: 0,0,1,2
0,1.0,0.577324,0.157338
1,0.577324,1.0,0.203217
2,0.157338,0.203217,1.0


In [93]:
#calculate scityblock (manhattan distance)

print("manhanttan distance between s1 and s2:")
print(round(1/cityblock(df_tfidf.iloc[0], df_tfidf.iloc[1]), 2))

print("manhanttan distance between s1 and s3:")
print(round(1/cityblock(df_tfidf.iloc[0], df_tfidf.iloc[2]), 2))

print("manhanttan distance between s2 and s3:")
print(round(1/cityblock(df_tfidf.iloc[1], df_tfidf.iloc[2]), 2))

manhanttan distance between s1 and s2:
0.36
manhanttan distance between s1 and s3:
0.22
manhanttan distance between s2 and s3:
0.24


In [88]:
#calculate euclidean distances

print("euclidean distance between s1 and s2:")
print(round(math.dist(df_tfidf.iloc[0], df_tfidf.iloc[1]),2))

print("euclidean distance between s2 and s3:")
print(round(math.dist(df_tfidf.iloc[1], df_tfidf.iloc[2]),2))

print("euclidean distance between s1 and s3:")
print(round(math.dist(df_tfidf.iloc[0], df_tfidf.iloc[2]),2))

euclidean distance between s1 and s2:
0.92
euclidean distance between s2 and s3:
1.26
euclidean distance between s1 and s3:
1.3
