# StackOverflow Search Optimization

#### Importing the required libraries

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import nltk
import requests
from itertools import combinations

W0606 17:56:17.851990 57928 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


#### Processing the input query to get tags
This function uses NLTK (Natural Language Toolkit) to tokenize the query and remove stopwords. A maximum of 5 tags is returned, as per the StackExchange API.

In [2]:
def get_tags(input, max_tags=5):
    """Extract keyword tags from a free-text search query.

    The query is tokenized with NLTK; English stopwords and tokens that
    contain no letter or digit (pure punctuation such as "?") are dropped.
    The surviving tokens are returned in their original order and casing.

    Args:
        input: The search query string. The name shadows the built-in
            ``input`` but is kept so existing keyword callers still work.
        max_tags: Maximum number of tags to return (default 5).

    Returns:
        A list of at most ``max_tags`` token strings.
    """
    # Fetch the tokenizer model and stopword corpus; when they are already
    # installed NLTK just reports that the package is up to date.
    nltk.download('punkt')
    nltk.download('stopwords')
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words("english"))
    keywords = [
        token
        for token in word_tokenize(input)
        # The stopword list is lowercase, so compare case-insensitively;
        # otherwise "How" or "The" at the start of a query would slip through.
        if token.lower() not in stop_words
        # Punctuation-only tokens can never be meaningful tags.
        and any(ch.isalnum() for ch in token)
    ]
    return keywords[:max_tags]

#### Requesting the StackExchange API for questions using the tags obtained
A list of all combinations of the tags is created to query the API. This is done to maximize the chance of getting questions that include at least one of the tags.

In [3]:
def get_questions(tags):
    """Fetch StackOverflow questions for every non-empty combination of tags.

    One request is made per combination (2**len(tags) - 1 requests in
    total), starting from the largest combination and working down to the
    single tags, and all returned questions are accumulated in that order.
    Questions matching several combinations appear more than once.

    Args:
        tags: A sequence of tag strings.

    Returns:
        A two-element list ``[titles, items]``: ``titles`` holds each
        question's ``'title'`` exactly as the API returned it, and ``items``
        holds the corresponding full question dicts, index-aligned.

    Raises:
        KeyError: If a response body has no ``'items'`` key (for example an
            API error payload).
        requests.exceptions.RequestException: On network failure or timeout.
    """
    from urllib.parse import quote

    # All non-empty subsets of the tags, ordered by size then by position.
    tag_sets = [
        combo
        for size in range(1, len(tags) + 1)
        for combo in combinations(tags, size)
    ]

    titles = []
    items = []
    # Reverse so the most specific (largest) tag sets are queried first.
    for combo in reversed(tag_sets):
        # Percent-encode each tag so characters such as '#' or '+' cannot
        # corrupt the query string; '%3B' is the encoded ';' separator.
        tagged = '%3B'.join(quote(tag, safe='') for tag in combo)
        URL = f'https://api.stackexchange.com/2.2/questions?order=asc&sort=activity&tagged={tagged}&site=stackoverflow'
        # A timeout keeps one stalled connection from hanging the search.
        response = requests.get(url=URL, timeout=10)
        data = response.json()
        for item in data['items']:
            items.append(item)
            titles.append(item['title'])
    return [titles, items]

#### Calculating the similarities
Using TensorFlow Hub's Universal Sentence Encoder to calculate embeddings for all the questions. Then the inner product of the embeddings gives the similarities between them.

In [4]:
#Converting sentences to embeddings and computing the inner product to calculate similarity
def get_similarity(questions):
    """Score every sentence in ``questions`` against the last one.

    All sentences are embedded with the Universal Sentence Encoder module
    from TensorFlow Hub, and the inner product of each embedding with the
    embedding of the final sentence is taken as its similarity score. The
    final sentence acts as the reference and is left out of the result.

    Args:
        questions: A list of sentence strings; the last entry is the
            reference that the others are compared with.

    Returns:
        A list of dicts, one per sentence except the last, each with the
        keys ``"probability"`` (the raw inner-product score as a Python
        float) and ``"title"`` (the sentence itself), in input order.
    """
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
    encoder = hub.Module(module_url)
    sentences_in = tf.placeholder(tf.string, shape=(None))
    encode_op = encoder(sentences_in)

    with tf.Session() as sess:
        # Both initializers have to run before the hub module can be evaluated.
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        vectors = sess.run(encode_op, feed_dict={sentences_in: questions})

    # Column vector: inner product of each embedding with the last embedding.
    scores = np.inner(vectors, vectors[-1:])

    # Drop the final row (the reference compared with itself) and pair the
    # remaining scores with their sentences.
    return [
        {"probability": score, "title": title}
        for score, title in zip(scores[:-1].ravel().tolist(), questions)
    ]

#### Reduce logging

In [5]:
# Set TensorFlow's logging threshold to ERROR to cut down the log output.
tf.logging.set_verbosity(tf.logging.ERROR)


#### Execution:
1. `input` is defined.
2. tags are obtained by calling `get_tags()`
3. questions are obtained by calling `get_questions()`
4. Finally `get_similarity()` is called to calculate similarities.

#### Note:
* Sorting is taken care of in the frontend.
* Error handling is also handled by the frontend.

In [6]:
# Example search query. NOTE: this name shadows Python's built-in input()
# for the rest of the session.
input = "define numpy array"

In [7]:
# Turn the query into tags; the bare `tags` expression displays the result.
tags = get_tags(input)
tags

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nipun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nipun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['define', 'numpy', 'array']

In [8]:
# get_questions returns [titles, full API items]; index 0 shows only the titles.
questions = get_questions(tags)
questions[0]

['Adding a dimension to every element of a numpy.array',
 'Is &#39;for x in array&#39; always result in sorted x? [Python/NumPy]',
 'Definition of mathematical operations (sin…) on NumPy arrays containing objects',
 'Removing Array Elements in Python while keeping track of their position',
 'Returning array of data mapping values to parameters in python',
 'Calculating conditional probabilities from joint pmfs in numpy, too slow. Ideas? (python-numpy)',
 'Simple question: In numpy how do you make a multidimensional array of arrays?',
 'Better use a tuple or numpy array for storing coordinates',
 'Convert a python numpy array to c++ stl vector',
 'Extending a series of nonuniform netcdf data in a numpy array',
 'Objects array with numpy',
 'removing pairs of elements from numpy arrays that are NaN (or another value) in Python',
 'merging indexed array in Python',
 'How does one wrap numpy array types?',
 'linear combinations in python/numpy',
 'Python: shape of a matrix and imshow()',
 

In [9]:
# Score the fetched titles.
# NOTE(review): get_similarity compares every entry with the LAST element of
# the list it receives and leaves that element out of its result. The query
# string is not appended to the titles here, so these scores are relative to
# the last fetched title rather than to `input` - confirm this is intended.
similarity = get_similarity(questions[0])
similarity

[{'probability': 0.6837819814682007,
  'title': 'Adding a dimension to every element of a numpy.array'},
 {'probability': 0.4473450779914856,
  'title': 'Is &#39;for x in array&#39; always result in sorted x? [Python/NumPy]'},
 {'probability': 0.7187051773071289,
  'title': 'Definition of mathematical operations (sin…) on NumPy arrays containing objects'},
 {'probability': 0.7189610004425049,
  'title': 'Removing Array Elements in Python while keeping track of their position'},
 {'probability': 0.7198627591133118,
  'title': 'Returning array of data mapping values to parameters in python'},
 {'probability': 0.7035109996795654,
  'title': 'Calculating conditional probabilities from joint pmfs in numpy, too slow. Ideas? (python-numpy)'},
 {'probability': 0.7655227184295654,
  'title': 'Simple question: In numpy how do you make a multidimensional array of arrays?'},
 {'probability': 0.7191239595413208,
  'title': 'Better use a tuple or numpy array for storing coordinates'},
 {'probability