In [None]:
!pip install sentence-transformers
!pip install unidecode
!pip install word2number
!pip install gensim

In [None]:
! sudo apt install openjdk-8-jdk
! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
! pip install language-check
! pip install pycontractions

In [None]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
from pycontractions import Contractions
import gensim.downloader as api
import en_core_web_sm
import re
import json
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sentence_transformers import models, losses, util
from sentence_transformers.cross_encoder import CrossEncoder
import random

In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened
# by path in the cells below (google.colab is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the questions table (JSON written with orient='table') into a DataFrame.
with open(
    '/content/drive/MyDrive/Data/android/questions_related.json'
) as f:
    questions = pd.read_json(f, orient='table')

In [None]:
# spaCy English pipeline; text_preprocessing() uses it for tokenisation,
# part-of-speech tags, stop-word flags and lemmas.
nlp = en_core_web_sm.load()

# Word-vector model handed to pycontractions. Exactly one of the lines below
# should be active; the commented-out alternatives are other gensim models.
model = api.load("glove-twitter-25")
# model = api.load("glove-twitter-100")
# model = api.load("word2vec-google-news-300")

# Contraction expander used by expand_contractions(); load_models() is called
# once here so the function itself does not have to.
cont = Contractions(kv_model=model)
cont.load_models()

# Keep the negations 'no' and 'not': clear their stop-word flag in the spaCy
# vocabulary so stop-word removal does not drop them.
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


def strip_html_tags(text):
    """Return the text content of ``text`` with all HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    # split() with no separator already discards leading/trailing whitespace
    # and treats consecutive whitespace characters as a single delimiter.
    words = text.split()
    return " ".join(words)


def remove_accented_chars(text):
    """Transliterate ``text`` with unidecode, e.g. turning "café" into "cafe"."""
    return unidecode.unidecode(text)


def expand_contractions(text):
    """Expand contractions in ``text``, e.g. "don't" becomes "do not".

    Uses the module-level ``cont`` expander in precise mode. The expander works
    on batches, so the text is wrapped in a one-element list and the single
    result is unwrapped again.
    """
    expanded_batch = list(cont.expand_texts([text], precise=True))
    return expanded_batch[0]


def text_preprocessing(text, accented_chars=True, contractions=True,
                       convert_num=True, extra_whitespace=True,
                       lemmatization=True, lowercase=True, punctuations=False,
                       remove_html=True, remove_num=False, special_chars=False,
                       stop_words=True):
    """Clean ``text`` and return it as a list of tokens.

    String-level steps run first, in this order: HTML stripping, whitespace
    normalisation, accent removal, contraction expansion, lowercasing. The
    result is tokenised with the module-level spaCy pipeline ``nlp`` and each
    token is then filtered or rewritten.

    Args:
        text: Raw input string.
        accented_chars: Transliterate accented characters (default True).
        contractions: Expand contractions such as "don't" (default True).
        convert_num: Convert number words to numeric values (default True).
        extra_whitespace: Collapse repeated whitespace (default True).
        lemmatization: Replace tokens by their lemma (default True).
        lowercase: Lowercase the text before tokenising (default True).
        punctuations: Drop tokens tagged PUNCT (default False).
        remove_html: Strip HTML tags (default True).
        remove_num: Drop tokens tagged NUM or that are numeric (default False).
        special_chars: Drop tokens tagged SYM (default False).
        stop_words: Drop stop words, except tokens tagged NUM (default True).

    Returns:
        A list of the kept tokens. Items are strings, except number words that
        were successfully converted, which are the numeric values returned by
        ``w2n.word_to_num``.
    """
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    doc = nlp(text)  # tokenise text

    clean_text = []

    for token in doc:
        is_number = token.pos_ == 'NUM'

        # Filtering: the first rule that matches drops the token.
        if stop_words and token.is_stop and not is_number:
            continue
        if punctuations and token.pos_ == 'PUNCT':
            continue
        if special_chars and token.pos_ == 'SYM':
            continue
        if remove_num and (is_number or token.text.isnumeric()):
            continue

        edit = token.text
        if convert_num and is_number:
            # Best effort: a NUM token that word2number cannot parse is kept
            # unchanged. Catching Exception instead of using a bare except
            # lets KeyboardInterrupt still stop a long preprocessing run.
            try:
                edit = w2n.word_to_num(token.text)
            except Exception:
                pass
        # Lemmatise only tokens that did not take the number-conversion branch;
        # "-PRON-" is a placeholder lemma, so the surface form is kept instead.
        elif lemmatization and token.lemma_ != "-PRON-":
            edit = token.lemma_

        if edit != "":
            clean_text.append(edit)
    return clean_text



In [None]:
def listToString(s):
    """Join the items of ``s`` into one string, each item followed by a space.

    Every item is converted with ``str()``. The result therefore ends with a
    trailing space, and an empty iterable yields the empty string.
    """
    return "".join(str(item) + ' ' for item in s)

In [None]:
# Pre-trained cross-encoder used below to score pairs of question texts.
# NOTE(review): judging by its name the checkpoint was trained on Quora
# duplicate-question pairs - confirm on the model card.
# max_length=512 bounds the length of each encoded input pair.
quora_model = CrossEncoder('sentence-transformers/ce-distilroberta-base-quora', max_length=512)

In [None]:
# Sanity check: score one hand-written pair of near-paraphrases; the saved
# output below this cell shows a score close to 1.
quora_model.predict([' I study in Class 8 and find math easy ','Math for class 8 is very easy'])

0.9924936

In [None]:
# Print a summary of the loaded questions DataFrame (columns, dtypes, non-null counts).
questions.info()

In [None]:
# Baseline: mean cross-encoder score over `num` randomly drawn pairs of
# distinct questions. Each question is represented by its tags, preprocessed
# title and preprocessed body concatenated into one string.
score = 0
num = 100
question_ids = questions.index.tolist()  # hoisted: identical on every iteration
# Iterate exactly `num` times so the divisor below matches the number of
# scored pairs (range(1, num) would score only num - 1 pairs).
for i in range(num):
    # sample() returns two different entries of the index and raises
    # ValueError (instead of looping forever) when there are fewer than two.
    q1, q2 = random.sample(question_ids, 2)
    pair_texts = [
        listToString(questions.loc[q, 'Tags'])
        + listToString(text_preprocessing(questions.loc[q, 'Title'], stop_words=False))
        + listToString(text_preprocessing(questions.loc[q, 'Text'], stop_words=False))
        for q in (q1, q2)
    ]
    score = score + quora_model.predict(pair_texts)
    # Light progress report instead of one line per pair.
    if (i + 1) % 10 == 0:
        print(i + 1)

score = score / num
print(score)

In [None]:
# Display the mean score over random question pairs computed in the previous cell.
score

0.01220326471142471

In [None]:
# Mean cross-encoder score between a question and one randomly chosen entry
# of its 'Related' list, over the first `n` questions that have such entries.
# Each question is represented by its tags, preprocessed title and
# preprocessed body concatenated into one string.
n = 100
count = 0
rel_score = 0
for i in questions.index.values:
    if count == n:
        break
    q1 = i
    related_ids = questions.loc[i, 'Related']
    if len(related_ids) == 0:
        continue  # nothing to compare this question against
    # NOTE(review): assumes every id in 'Related' is also a row label of
    # `questions`; otherwise the .loc lookups below raise KeyError - confirm.
    q2 = random.choice(related_ids)
    pair_texts = [
        listToString(questions.loc[q, 'Tags'])
        + listToString(text_preprocessing(questions.loc[q, 'Title'], stop_words=False))
        + listToString(text_preprocessing(questions.loc[q, 'Text'], stop_words=False))
        for q in (q1, q2)
    ]
    rel_score = rel_score + quora_model.predict(pair_texts)
    count += 1
    print(count)
# Average over the pairs actually scored: fewer than `n` questions may have
# related entries, and dividing by `n` would then understate the mean.
if count:
    rel_score = rel_score / count
print(rel_score)