# making data ready

In [1]:
# Read the training file; each non-empty line is expected to look like "<LABEL>: <question text>".
with open('train_data.txt',encoding='utf-8') as f:            # context manager closes the file even if reading fails
    train_data=f.read().split('\n')

X=[]                                                          # question texts
Y=[]                                                          # class labels, index-aligned with X
for text in train_data:
    if not text.strip():
        # Blank lines (e.g. a trailing newline) carry no sample; keeping them would
        # add an empty-label class that breaks or skews the stratified split.
        continue
    # Split only on the FIRST ': ' so a question that itself contains ': ' is kept
    # verbatim (the earlier re-join with ':' silently dropped the space).
    parts=text.split(': ',1)
    Y.append(parts[0])
    # A line without any ': ' separator keeps its whole text as the label and an empty question.
    X.append(parts[1] if len(parts)==2 else '')
        

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Hold out 20% of the samples for testing; stratify on the labels and fix the seed
# so the split is the same on every run.
X_train,X_test,Y_train,Y_test=train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)

# TF-IDF vocabulary is learned from the training questions only, then applied to both splits.
vectorizer=TfidfVectorizer()
X_train_vec=vectorizer.fit_transform(X_train).toarray()
X_test_vec=vectorizer.transform(X_test).toarray()

# Label strings -> integer ids, fitted on the training labels only.
encoder=LabelEncoder()
Y_train_vec=encoder.fit_transform(Y_train)
Y_test_vec=encoder.transform(Y_test)

# SVM

In [2]:
from sklearn.model_selection import GridSearchCV                      # meant for tuning C (=1/lambda); NOTE(review): no grid search is actually run in this cell
from sklearn.svm import LinearSVC
# Linear multi-class SVM using the Crammer-Singer formulation; C is not passed, so the library default is used.
model = LinearSVC(loss="squared_hinge",multi_class="crammer_singer")
# Train on the TF-IDF training matrix and the encoded training labels built in the data-preparation cell.
# As the last expression of the cell, this also displays the fitted estimator.
model.fit(X_train_vec, Y_train_vec)




LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='crammer_singer', penalty='l2', random_state=None,
          tol=0.0001, verbose=0)

In [3]:
# Read the evaluation file; each non-empty line is expected to look like "<answer>:::<question>".
with open('test_data.txt',encoding='utf-8') as f:                # context manager closes the file even if reading fails
    test_data=f.read().split('\n')

questions=[]                                                     # question part of every line
answers=[]                                                       # gold answer part, index-aligned with `questions`
for text in test_data:
    if not text.strip():
        # A blank line (typically the trailing newline at end of file) has no ':::'
        # separator and previously crashed the loop with an IndexError.
        continue
    parts=text.split(':::')
    answers.append(parts[0])
    # A non-blank line without ':::' still raises IndexError here, as before,
    # so a malformed data file is not silently accepted.
    questions.append(parts[1])

In [4]:
# NOTE: to avoid errors during translation, all Telugu questions were translated to
#       English beforehand and stored in this file (one question per line).
with open('english_questions.txt',encoding='utf-8') as f:        # context manager closes the file even if reading fails
    eng_data=f.read().split('\n')

# Lower-case every line. Every line (blank ones included) is kept so that the
# position of each entry in the list still matches its line in the file.
eng_questions=[text.lower() for text in eng_data]
    

In [5]:
import re
from collections import Counter
import math
import pandas as pd

In [6]:
def from_text_to_vec(text):
    r"""Turn ``text`` into a bag-of-words ``Counter``.

    Tokens are the maximal runs matched by the regex ``\w+``; matching is
    case-sensitive and no other normalisation is applied.

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    return Counter(re.findall(r"\w+", text))

In [7]:
def get_cosine(text1, text2):
    """Cosine similarity between the word-count vectors of two texts.

    Both texts are converted with ``from_text_to_vec``; the score is the dot
    product over the shared tokens divided by the product of the two vector
    norms.

    Returns:
        The similarity as a float, or 0.0 when either text yields no tokens
        (the product of the norms is zero).
    """
    counts_a = from_text_to_vec(text1)
    counts_b = from_text_to_vec(text2)

    # Only tokens present in both texts contribute to the dot product.
    shared_tokens = counts_a.keys() & counts_b.keys()
    dot_product = sum(counts_a[token] * counts_b[token] for token in shared_tokens)

    norm_a = math.sqrt(sum(count ** 2 for count in counts_a.values()))
    norm_b = math.sqrt(sum(count ** 2 for count in counts_b.values()))
    scale = norm_a * norm_b

    return dot_product / scale if scale else 0.0

In [8]:
def cosine_charcterwise(word1,word2):
    """Position-wise character similarity between two words.

    Despite the name this is not a cosine: the score is the number of positions
    at which both words hold the same character, divided by the length of the
    longer word.

    Equal-length words are now compared character by character as well; the
    earlier version returned 1 for ANY two words of the same length
    (e.g. "abc" vs "xyz").

    Args:
        word1: first string.
        word2: second string.

    Returns:
        A value in [0, 1]. Two empty strings are treated as identical and
        return 1.
    """
    longest = max(len(word1), len(word2))
    if longest == 0:
        # Nothing to compare; also avoids dividing by zero.
        return 1

    # zip stops at the shorter word, so the extra characters of the longer
    # word count as mismatches through the denominator.
    matches = sum(1 for char1, char2 in zip(word1, word2) if char1 == char2)
    return matches / longest

In [9]:
# Sizes of the nested "top-k most similar sentences" groups; one answer is produced per size.
_TOP_K_SIZES = (10, 20, 30, 40)

# Supported search engines and the URL prefix the (encoded) query is appended to.
_SEARCH_URLS = {
    'google': 'https://www.google.dz/search?q=',
    'bing': 'https://www.bing.com/search?q=',
}

_REQUEST_HEADERS = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}

# Predicted question class -> spaCy entity labels accepted as an answer, in priority order.
_CLASS_TO_ENTITY_LABELS = {
    "LOCA": ("GPE", "LOC"),
    "PERS": ("PERSON",),
    "DATE": ("DATE",),
    "ORGA": ("ORG",),
    "PERC": ("PERCENT",),
    "TIME": ("TIME",),
    "NUMB": ("CARDINAL", "QUANTITY"),
}

_NUMBER_WORDS = {
    'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
    'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
}
_NUMBER_WORD_PATTERN = r"\b(one|two|three|four|five|six|seven|eight|nine|ten)\b"

# Candidates that are too generic to be a useful answer in certain situations.
_GENERIC_INDIA_ANSWERS = frozenset(['india','indian',"india's"])
_GENERIC_DATE_ANSWERS = frozenset(['day','a day','the day','month','a month','the month','year','a year','the year'])


def _scrape_google_results(search_url, out_path):
    """Fetch a Google result page and write the <p> text of every result link to ``out_path``."""
    from bs4 import BeautifulSoup
    import requests
    from tqdm import tqdm

    # A timeout keeps a stalled search request from hanging the notebook.
    page = requests.get(search_url, headers=_REQUEST_HEADERS, timeout=20)
    # An explicit parser makes the parse independent of which optional parsers are installed.
    soup = BeautifulSoup(page.content, 'html.parser')

    result_links = []
    for result in soup.find_all("div","r"):
        anchor = result.find("a")
        # A result block without a usable link is skipped instead of raising.
        if anchor is not None and anchor.get("href"):
            result_links.append(anchor["href"])

    with open(out_path, 'w', encoding='utf-8') as sink:
        for url in tqdm(result_links):
            try:
                page = requests.get(url, headers=_REQUEST_HEADERS, timeout=20)
                if page.status_code != 200:
                    continue
                for paragraph in BeautifulSoup(page.content, 'html.parser').find_all('p'):
                    sink.write(paragraph.get_text())
                    sink.write(' ')
            except Exception:
                # Best effort: one unreachable or unparsable page must not abort the run.
                # (Unlike a bare except, this lets KeyboardInterrupt through.)
                sink.write(' ')


def _binary_cosine(tokens_a, tokens_b):
    """Cosine similarity of two token lists under a binary (presence/absence) bag-of-words model."""
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a or not set_b:
        # No tokens on one side: define the similarity as 0 rather than dividing by zero.
        return 0.0
    return len(set_a & set_b) / (len(set_a) * len(set_b)) ** 0.5


def _normalize_number_words(answer):
    """Lower-case ``answer`` and replace the whole words 'one'..'ten' with digits."""
    # Word boundaries keep names such as "indonesia" or "tennessee" intact.
    return re.sub(_NUMBER_WORD_PATTERN, lambda m: _NUMBER_WORDS[m.group(1)], answer.lower())


def _keep_candidate(candidate, query_lower, query_words, quest_class):
    """Return True when ``candidate`` is still a plausible answer for the query."""
    cand = candidate.lower()
    # An answer that already appears in the question is not an answer.
    # Literal substring test: candidates may contain regex metacharacters.
    if cand in query_lower:
        return False
    if cand in _GENERIC_INDIA_ANSWERS:
        asks_state_or_where = 'state' in query_lower or 'where' in query_words
        asks_country = 'country' in query_lower
        # "India" is only informative when the question is about a country
        # and is not asking for a state / a "where".
        if asks_state_or_where or not asks_country:
            return False
    if quest_class == "DATE" and cand in _GENERIC_DATE_ANSWERS:
        return False
    return True


def _pick_best_answer(candidates):
    """Most frequent candidate (first seen wins ties), or '***' when there is none."""
    if not candidates:
        return '***'
    counts = Counter(candidates)
    # max() returns the first maximal key, and Counter preserves first-seen order.
    return max(counts, key=counts.get)


def webscraping(search_engine,query_telugu,query_english,sentences):
    """Answer a question from web search results.

    Pipeline:
      1. For 'google', fetch the result page for ``query_english``, download
         every result link and store the text of its <p> tags in
         'scrapped_google.txt'. For 'bing' nothing is downloaded here; an
         already existing 'scrapped_bing.txt' is read instead.
      2. Drop non-ASCII characters from the stored text, split it into
         sentences and rank them by binary bag-of-words cosine similarity to
         ``query_english``.
      3. Predict the question class of ``query_telugu`` with the notebook-level
         ``vectorizer``, ``model`` and ``encoder``; the class is printed and
         appended to the notebook-level list ``pred_class_list``.
      4. For the top 10/20/30/40 sentences, run spaCy NER, keep the entities
         whose label fits the question class, filter implausible candidates
         and pick the most frequent one.

    Changes compared with the previous version:
      * the query is URL-encoded and the search request has a timeout;
      * sentences are joined with a space before NER (they used to be glued
        together with an empty string);
      * number words are replaced only as whole words;
      * "answer already in the question" is a literal, case-insensitive
        substring test (candidates used to be interpreted as regexes, and a
        candidate that was not a valid regex was silently dropped);
      * the undefined name ``query_words`` is now derived from the query;
        previously the resulting NameError was swallowed, which disabled the
        'where' rule;
      * the three copy-pasted filter passes are one predicate, and the spaCy
        model is loaded once per call instead of four times;
      * the unused inner helper ``cosine_sim_answers`` was removed.

    Args:
        search_engine: 'google' or 'bing' (case-insensitive).
        query_telugu: original question; input to the question classifier.
        query_english: English translation; used for search and ranking.
        sentences: only printed; the sentences actually used are re-derived
            from the scraped text.

    Returns:
        A list with one best answer per top-k group ('***' when no candidate
        was found), or None when ``search_engine`` is not supported.

    Raises:
        FileNotFoundError: when 'scrapped_<engine>.txt' does not exist
            (relevant for 'bing', which is never scraped here).
    """
    from urllib.parse import quote_plus
    from nltk.tokenize import sent_tokenize, word_tokenize
    import pandas as pd
    import spacy

    print('entered search query: ',query_telugu)
    print('english query: ',query_english)
    print(sentences)

    search_engine=search_engine.lower()
    if search_engine not in _SEARCH_URLS:
        print('chose correct search engine')
        return

    scraped_path = 'scrapped_'+search_engine+'.txt'
    if search_engine=='google':
        # quote_plus keeps characters such as '&', '#' or '?' inside the query
        # from being interpreted as URL syntax.
        _scrape_google_results(_SEARCH_URLS['google'] + quote_plus(query_english), scraped_path)

    ###################### TEXT PROCESSING #########################

    with open(scraped_path,'r',encoding='utf-8') as scraped_file:
        raw_text = scraped_file.read()
    ascii_text = ''.join(ch for ch in raw_text if ord(ch)<128)   ### keep ASCII characters only (code points 0-127)
    sentences = list(sent_tokenize(ascii_text))

    ##################### COMPUTING COS-SIM & RANKING SENTENCES #########################

    query_tokens = word_tokenize(query_english)                  # tokenised once, reused for every sentence
    sim_values = [_binary_cosine(query_tokens, word_tokenize(sent)) for sent in sentences]
    ranked = pd.DataFrame({'sentence':sentences,'cos-sim':sim_values}).sort_values(by=['cos-sim'],ascending=False)

    ###################### Question Classification #########################

    questions_vec=vectorizer.transform([query_telugu]).toarray()          # question -> TF-IDF vector
    predicted_qc_svm=model.predict(questions_vec)
    quest_class=encoder.inverse_transform(predicted_qc_svm)[0]
    pred_class_list.append(quest_class)
    print(quest_class)

    ###################### NER and Best-answer extraction #########################

    nlp = spacy.load('en_core_web_sm')                           # loaded once per call
    entity_labels = _CLASS_TO_ENTITY_LABELS.get(quest_class, ()) # unknown class -> no candidates
    query_lower = query_english.lower()
    query_words = re.findall(r"\w+", query_lower)

    possible_answers_list=[]
    for k in _TOP_K_SIZES:
        # head(k) returns all rows when fewer than k sentences are available.
        doc = nlp(' '.join(ranked.sentence.head(k).values))
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        # Same ordering as before: all entities of the first accepted label
        # (in text order), then all entities of the next label.
        candidates = [
            _normalize_number_words(text)
            for label in entity_labels
            for text, ent_label in entities
            if ent_label == label
        ]
        possible_answers_list.append(
            [cand for cand in candidates if _keep_candidate(cand, query_lower, query_words, quest_class)]
        )

    print(possible_answers_list)
    print('\n')

    best_ans = [_pick_best_answer(group) for group in possible_answers_list]
    print('best answer')
    print(best_ans)

    return best_ans
    
    