# Information retrieval - Project 2 

# <font color=green>Part1</font>

## Cleaning Questions

### Function to remove:
- Stop words
- Punctuatuions
- Digits
- Removing strings like  __''__ and __``__

In [2]:
import string, nltk, re
from nltk.corpus import stopwords

In [3]:
english_stopwords = stopwords.words('english')
punc = string.punctuation


def myCleaner(list):

    clean_text = []
    for word in list:
        word = word.lower()
        if word not in (english_stopwords) and (word not in punc):
            word = re.sub('[0-9]+','',word).strip()
            word = re.sub("[``|'']+",'',word).strip()            

            if len(word) > 0:
                clean_text.append(word)        
    return clean_text

## STEM
- Function to do stemming process

In [4]:
from nltk.stem import PorterStemmer 
stemmer = PorterStemmer()

In [5]:
def myStemmer(list):
    stemmed_list = []
    for word in list:
        stemmed_list.append(stemmer.stem(word))
    return stemmed_list

## preprocess

- Function to preprocess dataset Using __myCleaner__ and __myStemmer__ and __Regular expression__

In [6]:
import re

In [7]:
def preprocess(file):
    coarse = list()
    fine = list()
    preprocessed = list()
    with open(file) as fh:
        for line in fh:
            question = re.findall('.+:[\S]+ (.+)',line)[0]
            
            question = question.split() #Convert each question into List
            stop = myCleaner(question)
            stem = myStemmer(stop)
            
            
            if len(stem) == 0:
                continue
            
            CleanedQuestion = ' '.join(stem)
            
            if CleanedQuestion in preprocessed: #Skip duplicate questions
                continue
            
            coarse.append(re.findall('([\S]+):',line)[0])
            fine.append(re.findall('.+:([\S]+) ',line)[0])
            preprocessed.append(CleanedQuestion)

    return preprocessed, coarse, fine


## Encoding

- Function to assign each class a number  

#### *Function to get list of classes used in dataset*

In [8]:
def get_fine_dict():
    fine = {}
    j = 0
    with open('train_5500.label.txt') as fh:
        for line in fh:
            c = re.findall('.+:([\S]+) ',line)[0] 
            if c not in fine:
                fine[c] = j + 1
                j += 1
    return fine   

def get_coarse_dict():
    coarse = {}
    i = 0
    with open('train_5500.label.txt') as fh:
        for line in fh:
                c = re.findall('([\S]+):',line)[0]
                if c not in coarse:
                        coarse[c] = i + 1
                        i += 1
    return coarse

#### *assign number to each class and vice-versa*

In [9]:
def reverse(dictionary):
    r = {}
    for i,j in dictionary.items():
        r[j] = i
    return r

In [10]:
coarse = get_coarse_dict()
fine = get_fine_dict()

rev_coarse = reverse(coarse)
rev_fine = reverse(fine)

              
def encode(list_,coarseORfine):
    encList = []
    if coarseORfine == 'coarse' :
        for item in list_:
            encList.append(coarse[item.upper()])
        return encList
        
    elif coarseORfine == 'fine':
        for item in list_:
            encList.append(fine[item.lower()])
        return encList
    
def decode(list_, coarseORfine):
    decList = []
    if coarseORfine == 'coarse' :
        for item in list_:
            decList.append(rev_coarse[item])
        return decList
        
    elif coarseORfine == 'fine':
        for item in list_:
            decList.append(rev_fine[item])
        return decList

## TF-IDF
- Function to calculate tf-idf

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

countvec = CountVectorizer()
tfidf = TfidfTransformer()

def vectorize(list,trte):
    if trte == 'tr':
        count_vector = countvec.fit_transform(list)
        tfidf_vector = tfidf.fit_transform(count_vector).toarray()
        # print(countvec.get_feature_names_out())
        # print(count_vector.toarray())  
        return tfidf_vector
    
    elif trte == 'te':
        count_vector = countvec.transform(list)
        tfidf_vector = tfidf.transform(count_vector).toarray()
        # print(countvec.get_feature_names_out())
        # print(count_vector.toarray())
        return tfidf_vector

## Train
- function to make models learn __coarse__ class

In [12]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn import metrics
import pickle
import time
from vectorize import vectorize

#### *Making data ready for training*

In [13]:
x_train = preprocess('train_5500.label.txt')[0]
y_train_coarse = preprocess('train_5500.label.txt')[1]
x_train_tfidf = vectorize(x_train,'tr')

x_test = preprocess('TREC_10.label.txt')[0]
y_test_coarse = preprocess('TREC_10.label.txt')[1]
x_test_tfidf = vectorize(x_test,'te')

#### *Training*
- This function after training, saves each model for future predictions.

In [14]:
def train_part1():
    model_prediction_part1 = dict()
    times = dict()
    
    # Multinomial Naive Bayes
    s11 = time.time() 
    model = MultinomialNB().fit(x_train_tfidf, encode(y_train_coarse,'coarse'))
    e11 = time.time()
    filename = 'Multinomial.sav'
    pickle.dump(model, open(filename, 'wb'))
    s12 = time.time()
    model_prediction_part1['Multinomial'] = model.predict(x_test_tfidf)
    e12 = time.time()
    times['Multinomial'] = ((e11-s11) * 10**3,(e12-s12) * 10**3)

    # Bernoulli Naive Bayes
    s21 = time.time()
    model = BernoulliNB().fit(x_train_tfidf, encode(y_train_coarse,'coarse'))
    e21 = time.time()
    filename = 'Bernoulli_Naive_Bayes.sav'
    pickle.dump(model, open(filename, 'wb'))
    s22 = time.time()
    model_prediction_part1['Bernoulli'] = model.predict(x_test_tfidf) 
    e22 = time.time()
    times['Bernoulli'] = ((e21-s21) * 10**3,(e22-s22) * 10**3)


    # SVM - Linear Kernel
    s31 = time.time()
    model = SVC(kernel='linear').fit(x_train_tfidf, encode(y_train_coarse,'coarse'))
    e31 = time.time()
    filename = 'SVM_LinearKernel.sav'
    pickle.dump(model, open(filename, 'wb'))
    s32 = time.time()
    model_prediction_part1['SVM|Linear Kernel'] = model.predict(x_test_tfidf)
    e32 = time.time()
    times['SVM|Linear Kernel'] = ((e31-s31) * 10**3,(e32-s32) * 10**3)

    # SVM - rbf
    s41 = time.time()
    model = SVC().fit(x_train_tfidf, encode(y_train_coarse,'coarse'))
    e41 = time.time()
    filename = 'SVM_rbf.sav'
    pickle.dump(model, open(filename, 'wb'))
    s42 = time.time()
    model_prediction_part1['SVM|rbf'] = model.predict(x_test_tfidf)
    e42 = time.time()
    times['SVM|rbf'] = ((e41-s41) * 10**3,(e42-s42) * 10**3)

    #KNN
    Ks = [3,5,6]
    for k in Ks:
        s51 = time.time()
        model = KNeighborsClassifier(n_neighbors = k).fit(x_train_tfidf, encode(y_train_coarse,'coarse'))
        e51 = time.time()
        filename = f'knn{k}.sav'
        pickle.dump(model, open(filename, 'wb'))
        s52 = time.time()
        model_prediction_part1[f'knn|k={k}'] = model.predict(x_test_tfidf)
        e52 = time.time()
        times[f'knn|k={k}'] = ((e51-s51) * 10**3,(e52-s52) * 10**3)
        
    print("\n                accuracy")
    for model_name in model_prediction_part1.keys():
        print(f"{model_name} :{' '*(25 - len(model_name))}",round(accuracy_score(encode(y_test_coarse,'coarse'), model_prediction_part1[model_name])*100, 2))
        # print(model_name, ': ', round(accuracy_score(encode(y_test_coarse,'coarse'), model_prediction_part1[model_name])*100, 2))


    print(f"\n\n{'.'*66}")
    for model_name in model_prediction_part1.keys():
        macro = precision_recall_fscore_support(encode(y_test_coarse,'coarse'),model_prediction_part1[model_name],average='macro',zero_division=0)
        micro = precision_recall_fscore_support(encode(y_test_coarse,'coarse'),model_prediction_part1[model_name],average='micro',zero_division=0)

        print(f"\n{model_name}|{' '*(25 - len(model_name))}   precision         recall          Fscore")
        print('   Macro:                     ',round(macro[0]*100,2), end= '             ')
        print(round(macro[1]*100,2), end= '           ')
        print(round(macro[2]*100,2))
        print('   Micro:                     ',round(micro[0]*100,2), end= '              ')
        print(round(micro[1]*100,2),end= '            ')
        print(round(micro[2]*100,2)) 


    print(f"\n\n{'.'*66}")
    print("\nRun time(train,test)[ms]\n")
    for t in times.items():
        print(t[0],end=f"|{' '*(25 - len(t[0]))}")
        for i in t[1]:
            print(round(i,2), end='    ')
        print('\n')

#### *Loads models to predict test dataset*

In [15]:
def load_train_part1():
    model_prediction_part1 = dict()
    times = dict()
    # Multinomial Naive Bayes
    s11 = time.time() 
    filename = 'Multinomial.sav'
    model = pickle.load(open(filename, 'rb'))
    e11 = time.time()
    s12 = time.time()
    model_prediction_part1['Multinomial'] = model.predict(x_test_tfidf)
    e12 = time.time()
    times['Multinomial'] = ((e11-s11) * 10**3,(e12-s12) * 10**3)

    # Bernoulli Naive Bayes
    s21 = time.time()
    filename = 'Bernoulli_Naive_Bayes.sav'
    model = pickle.load(open(filename, 'rb'))
    e21 = time.time()
    s22 = time.time()
    model_prediction_part1['Bernoulli'] = model.predict(x_test_tfidf) 
    e22 = time.time()
    times['Bernoulli'] = ((e21-s21) * 10**3,(e22-s22) * 10**3)


    # SVM - Linear Kernel
    s31 = time.time()
    filename = 'SVM_LinearKernel.sav'
    model = pickle.load(open(filename, 'rb'))
    e31 = time.time()
    s32 = time.time()
    model_prediction_part1['SVM|Linear Kernel'] = model.predict(x_test_tfidf)
    e32 = time.time()
    times['SVM|Linear Kernel'] = ((e31-s31) * 10**3,(e32-s32) * 10**3)

    # SVM - rbf
    s41 = time.time()
    filename = 'SVM_rbf.sav'
    model = pickle.load(open(filename, 'rb'))
    e41 = time.time()
    s42 = time.time()
    model_prediction_part1['SVM|rbf'] = model.predict(x_test_tfidf)
    e42 = time.time()
    times['SVM|rbf'] = ((e41-s41) * 10**3,(e42-s42) * 10**3)

    #KNN
    Ks = [3,5,6]
    for k in Ks:
        s51 = time.time()
        filename = f'knn{k}.sav'
        model = pickle.load(open(filename, 'rb'))
        e51 = time.time()
        s52 = time.time()
        model_prediction_part1[f'knn|k={k}'] = model.predict(x_test_tfidf)
        e52 = time.time()
        times[f'knn|k={k}'] = ((e51-s51) * 10**3,(e52-s52) * 10**3)





    print("\n                accuracy")
    for model_name in model_prediction_part1.keys():
        print(f"{model_name} :{' '*(25 - len(model_name))}",round(accuracy_score(encode(y_test_coarse,'coarse'), model_prediction_part1[model_name])*100, 2))
        # print(model_name, ': ', round(accuracy_score(encode(y_test_coarse), model_prediction_part1[model_name])*100, 2))


    print(f"\n\n{'.'*66}")
    for model_name in model_prediction_part1.keys():
        macro = precision_recall_fscore_support(encode(y_test_coarse,'coarse'),model_prediction_part1[model_name],average='macro',zero_division=0)
        micro = precision_recall_fscore_support(encode(y_test_coarse,'coarse'),model_prediction_part1[model_name],average='micro',zero_division=0)

        print(f"\n{model_name}|{' '*(25 - len(model_name))}   precision         recall          Fscore")
        print('   Macro:                     ',round(macro[0]*100,2), end= '             ')
        print(round(macro[1]*100,2), end= '           ')
        print(round(macro[2]*100,2))
        print('   Micro:                     ',round(micro[0]*100,2), end= '              ')
        print(round(micro[1]*100,2),end= '            ')
        print(round(micro[2]*100,2)) 


    print(f"\n\n{'.'*66}")
    print("\nRun time(train,test)[ms]\n")
    for t in times.items():
        print(t[0],end=f"|{' '*(25 - len(t[0]))}")
        for i in t[1]:
            print(round(i,2), end='    ')
        print('\n')

In [None]:
# train_part1()

load_train_part1()

# <font color=red>End of Part1</font>

# <font color=green>Part2</font>

# <font color=green>2.1</font>

In [16]:
def sub(list_, target):
    j = 0
    for i in list_:
        if i != target:
            list_[j] = 0
            j += 1
        else:
            list_[j] = 1
            j += 1
    return list_

In [17]:
def accuracy_per_label(y_true, y_pred, labels):
    i = 0
    accs = dict()
    for label in labels:
        copy_y_true = y_true.copy()
        copy_y_pred = y_pred.copy()
        bin_y_true = sub(copy_y_true, label)
        bin_y_pred = sub(copy_y_pred, label)
        accs[decode(labels, 'coarse')[i]] = round(accuracy_score(bin_y_true,bin_y_pred),2)
        i += 1
    return accs

In [18]:
def train_part2():
    
    x_test = preprocess('TREC_10.label.txt')[0]
    y_test_coarse = preprocess('TREC_10.label.txt')[1]
    x_test_tfidf = vectorize(x_test,'te')
    
    labels = ['abbr', 'desc', 'enty', 'hum', 'num', 'loc']
    elabels = encode(labels,'coarse')
    filename = 'SVM_LinearKernel.sav'
    model = pickle.load(open(filename, 'rb'))
    y_pred = model.predict(x_test_tfidf)
    
    acc = accuracy_per_label(encode(y_test_coarse,'coarse'), y_pred, encode(labels,'coarse'))
    print('ACCURACY')
    for i in acc:
        print(f"\n{i}|{' '*(10 - len(i))}", acc[i] * 100)
    
    
    
    print(f"\n\n{'.'*66}")
    
    metrics = precision_recall_fscore_support(encode(y_test_coarse,'coarse'), y_pred, average=None, labels=elabels)
    i = 0
    for label in labels:
        print(f"\n{label}|{' '*(25 - len(label))}   precision         recall")
        print('                              ',round(metrics[0][i],2) * 100, end= '             ')
        print(round(metrics[1][i],2)*100)
        i += 1
                

In [None]:
# train_part2()

# <font color=green>2.2</font>

In [None]:
# y_train_fine = preprocess('train_5500.label.txt')[2]
# model = SVC(kernel='linear').fit(x_train_tfidf, encode(y_train_fine,'fine'))

# filename = 'SVM_LinearKernel_fine.sav'
# pickle.dump(model, open(filename, 'wb'))

In [19]:
x_test_tfidf = vectorize(x_test,'te')
filename = 'SVM_LinearKernel.sav'
model = pickle.load(open(filename, 'rb'))
coarse_pred= model.predict(x_test_tfidf)

In [25]:
x = 0
desc=[]
enty=[]
abbr=[]
hum=[]
num=[]
loc=[]
for label in coarse_pred:
    if label == 1:
        desc.append(x_test[x])
        x += 1
        continue
    if label == 2:
        enty.append(x_test[x])
        x += 1
        continue
    if label == 3:
        abbr.append(x_test[x])
        x += 1
        continue
    if label == 4:
        hum.append(x_test[x])
        x += 1
        continue
    if label == 5:
        desc.append(x_test[x])
        x += 1
        continue
    if label == 6:
        loc.append(x_test[x])
        x += 1
        continue

In [32]:
class_dataset = [desc,enty,abbr,hum,num,loc]
filename = 'SVM_LinearKernel_fine.sav'
fine_model = pickle.load(open(filename, 'rb'))
result = dict()
# ['desc','enty','abbr',"hum",'num','loc']
p = 1
for data in class_dataset:
    if len(data) == 0:
        print(p)
        p +=1
        continue
    data = vectorize(data,'te')
    result[p] = fine_model.predict(data)
    p += 1
    


5


# <font color=green>Part3</font>

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, rand_score

In [None]:
x_train = preprocess('train_5500.label.txt')[0]
y_train_coarse = preprocess('train_5500.label.txt')[1]
y_train_fine = preprocess('train_5500.label.txt')[2]

location = list()
y_loc = list()
for i in range(len(y_train_coarse)):
    if y_train_coarse[i] == 'LOC':
        location.append(x_train[i])
        y_loc.append(fine[y_train_fine[i]])
location_tfidf = vectorize(location,'tr')


In [None]:
ks = [3, 4, 5, 6, 7]
k_preds = dict()
for k in ks:
    kmeans = KMeans(n_clusters=k,n_init="auto", init='k-means++').fit(location_tfidf)
    k_preds[k] = kmeans.labels_

In [None]:
silhouette = dict()
for k in ks:
    silhouette[k] = silhouette_score(location_tfidf,k_preds[k])

randindex = dict()
for k in ks:
    randindex[k] = rand_score(y_loc,k_preds[k])
    
#precesion and recal
pr = dict()
for k in ks:
    pr[k] =  precision_recall_fscore_support(y_loc, k_preds[k], zero_division = 1,average = 'macro')


In [None]:
print("   Silhouette    Rand Index    precision     recall")
for k in ks:
    print(f"{k} |    {round(silhouette[k],4)}         {round(randindex[k],3)}          {round(pr[k][0],2)*100}         {round(pr[k][1],2)*100}")

# <font color=red>End of Part3</font>