# Corpus

In [1]:
doc1 = "about_coep.txt"
doc2 = "coep_cutoff.txt"
doc3 = "placements.txt"
corpus = [doc1, doc2, doc3]

# Preprocessing

In [3]:
from string import punctuation as punctuation
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tag import pos_tag

#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer() 

stopwords = stopwords.words('english')

punctuation = "["+punctuation+"]"

#Remove punctuation 
def remove_punctuation(string) :
    return(re.sub(punctuation, "", string))

#Remove stopwords
def remove_stopwords(tokens) :
    without_stopwords = []
    for token in tokens :
        if token not in stopwords :
            without_stopwords.append(token.lower())
    return without_stopwords

#Lemmatize
def lemmatize(tokens) :
    lemmas = []
    pos = pos_tag(tokens)
    for word, tag in pos :
        tag_starting = tag[0].lower()
        tag_starting = tag_starting if tag_starting in ['a', 'r', 'n', 'v'] else None
        if not tag_starting :
            lemma = word
        else :
            lemma = lemmatizer.lemmatize(word, tag_starting)
        lemmas.append(lemma)
    return lemmas

In [4]:
#Dictionary of document:[words]
doc_terms = {}
#Getting terms in each document(Removing spacing)
for doc in corpus :
    doc_terms[doc] = []
    with open("corpus/"+doc, 'r') as file :
        for line in file :
            line = remove_punctuation(line.lower())
            doc_terms[doc].extend(line.split())
    print(doc_terms[doc])

['college', 'of', 'engineering', 'pune', 'coep', 'is', 'an', 'autonomous', 'institute', 'located', 'in', 'pune', 'maharashtra', 'india', 'and', 'was', 'established', 'in', 'the', 'year', '1854', 'established', 'in', '1854', 'it', 'is', 'one', 'of', 'the', 'oldest', 'engineering', 'colleges', 'in', 'india']
['while', 'coep', 'cutoff', '2020', 'depends', 'on', 'entrance', 'exam', 'mht', 'cet', 'this', 'exam', 'is', 'conducted', 'by', 'dte', 'maharashtra', 'and', 'not', 'by', 'coep', 'for', 'ug', 'civil', 'engineering', 'coep', 'admission', 'cut', 'off', 'for', 'mhtcet', 'reservation', 'maharashtra', 'cutoff', 'by', 'score', 'obc', '163', 'st', '133', 'nt1', '87', 'sc', '147']
['placements', 'from', '2020', 'batch', 'about', '50', 'of', 'the', 'students', 'got', 'placements', 'maximum', 'placements', 'are', 'conducted', 'for', 'a', 'computer', 'course', 'and', 'along', 'with', 'that', 'students', 'who', 'opt', 'for', 'computer', 'minor', 'have', 'great', 'opportunities', 'for', 'placement

In [7]:
indexes = []
for doc, terms in doc_terms.items() :
    terms = remove_stopwords(terms)
    #print(doc_terms[doc])
    terms = lemmatize(terms)
    doc_terms[doc] = terms
    print(doc_terms[doc])
    indexes.extend(terms) 
indexes = list(set(indexes))

indexes_dict = {}
for i in range(len(indexes)) :
    indexes_dict[indexes[i]] = i
    
#print(len(indexes))

['college', 'engineering', 'pune', 'coep', 'autonomous', 'institute', 'locate', 'pune', 'maharashtra', 'india', 'establish', 'year', '1854', 'establish', '1854', 'one', 'oldest', 'engineering', 'college', 'india']
['coep', 'cutoff', '2020', 'depend', 'entrance', 'exam', 'mht', 'cet', 'exam', 'conduct', 'dte', 'maharashtra', 'coep', 'ug', 'civil', 'engineering', 'coep', 'admission', 'cut', 'mhtcet', 'reservation', 'maharashtra', 'cutoff', 'score', 'obc', '163', 'st', '133', 'nt1', '87', 'sc', '147']
['placement', '2020', 'batch', '50', 'student', 'get', 'placement', 'maximum', 'placement', 'conduct', 'computer', 'course', 'along', 'student', 'opt', 'computer', 'minor', 'great', 'opportunity', 'placement']


# Term-Document Matrix

In [8]:
import numpy as np
import pandas as pd

term_document_matrix = np.zeros((len(indexes), len(doc_terms)))
indexes = list(indexes)

#Getting term frequencies
for row in range(len(indexes)) :
    for col in range(len(corpus)) :
        term_document_matrix[row][col] = doc_terms[corpus[col]].count(indexes[row])

cols = [corpus[i] for i in range(len(corpus))]
term_doc_table = pd.DataFrame(term_document_matrix, index=indexes, columns=cols)

print(term_doc_table)

             about_coep.txt  coep_cutoff.txt  placements.txt
great                   0.0              0.0             1.0
obc                     0.0              1.0             0.0
147                     0.0              1.0             0.0
2020                    0.0              1.0             1.0
year                    1.0              0.0             0.0
one                     1.0              0.0             0.0
oldest                  1.0              0.0             0.0
entrance                0.0              1.0             0.0
exam                    0.0              2.0             0.0
133                     0.0              1.0             0.0
87                      0.0              1.0             0.0
minor                   0.0              0.0             1.0
college                 2.0              0.0             0.0
opt                     0.0              0.0             1.0
admission               0.0              1.0             0.0
mht                     

# Tf-Idf Matrix

In [9]:
from sklearn.preprocessing import normalize

N = len(corpus)

log_tf = np.zeros((len(indexes), len(corpus)))
for i in range(len(indexes)) :
    for j in range(len(corpus)) :
        log_tf[i] = np.log10(1+ term_document_matrix[i])

log_df = np.zeros((len(indexes), len(corpus)))
df_dict = {}
for i in range(len(indexes)) :
    df = np.count_nonzero(term_document_matrix[i])
    #print(indexes[i], term_document_matrix[i], df)
    df = np.log10(N/df)
    df_dict[indexes[i]] = df
    log_df[i] = [df]*N


tf_idf = np.multiply(log_tf, log_df)
tf_idf = normalize(tf_idf, axis=0, norm='l2')

table = pd.DataFrame(tf_idf, index=indexes, columns=cols)
print(table)

             about_coep.txt  coep_cutoff.txt  placements.txt
great              0.000000         0.000000        0.219857
obc                0.000000         0.194921        0.000000
147                0.000000         0.194921        0.000000
2020               0.000000         0.071939        0.081143
year               0.228366         0.000000        0.000000
one                0.228366         0.000000        0.000000
oldest             0.228366         0.000000        0.000000
entrance           0.000000         0.194921        0.000000
exam               0.000000         0.308942        0.000000
133                0.000000         0.194921        0.000000
87                 0.000000         0.194921        0.000000
minor              0.000000         0.000000        0.219857
college            0.361951         0.000000        0.000000
opt                0.000000         0.000000        0.219857
admission          0.000000         0.194921        0.000000
mht                0.000

# Query Preprocessing

In [13]:
query = input()
query = remove_punctuation(query).lower()
query = remove_stopwords(query.split())
query = lemmatize(query)

query_tf = {}
for term in query :
    if term not in query_tf :
        query_tf[term] = 1
    else :
        query_tf[term] += 1
        
query = set(query)

bihar


# Making query vector

In [14]:
tf = np.zeros((len(indexes), 1))
df = np.zeros((len(indexes), 1))
common_terms = query.intersection(indexes)
print(common_terms)
if(common_terms) :
    for term in common_terms :
        tf[indexes_dict[term]][0] = query_tf[term]
        df[indexes_dict[term]][0] = df_dict[term]
log_tf = np.log10(1+tf)
log_df = df
tf_idf_query = np.multiply(log_tf, log_df)
#print(tf_idf_query)
tf_idf_query = normalize(tf_idf_query, axis=0, norm='l2')

print(tf_idf_query.T)

set()
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]]


# Cosine similarity to find relevance

In [15]:
scores = np.array([0]*len(corpus))

scores = np.sum(tf_idf * tf_idf_query, axis = 0)

ranks = np.argsort(scores)[::-1]
print("Scores:",scores)
print("Ranks:",ranks)

print("Order of relevance :")

for i in ranks :
    print(corpus[i], end = " ")


Scores: [0. 0. 0.]
Ranks: [2 1 0]
Order of relevance :
placements.txt coep_cutoff.txt about_coep.txt 