# Pre-trained GloVe Model on the Title Feature

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, and print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%%time
# Download Final_df.csv into the current working directory.
# -c resumes a partially downloaded file; -O sets the local file name.
# NOTE(review): the file is fetched over plain HTTP from a bare IP address -- confirm the host is still reachable and trusted.
!wget --header="Host: 34.125.84.120:5000" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9" --header="Referer: http://34.125.84.120:5000/edit/Final_df.csv" "http://34.125.84.120:5000/files/Final_df.csv?download=1" -c -O 'Final_df.csv'

In [3]:
!pip install swifter

### Importing libraries


In [4]:
import pandas as pd
import bs4
import numpy as np
import swifter
import re
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
import pickle
import numpy as np
from nltk.corpus import stopwords

### Loading Data

In [5]:
%%time

df = pd.read_csv('./Final_df.csv')

In [6]:
df.shape, df.columns

In [7]:
df.dtypes

## Data Preprocessing

In [8]:
# # https://stackoverflow.com/a/47091490/4084039
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rules are applied one after another in a fixed order: the two
    irregular forms ("won't", "can't") first, then the generic suffixes.
    Matching is case-sensitive. The ``'s`` rule is disabled, so text such as
    "it's" is returned unchanged.

    Function takes one parameter - phrase (text input)
    returns - phrase with the contractions expanded
    """
    # (pattern, replacement) pairs. Order matters: the generic "n't" rule
    # would otherwise rewrite "won't"/"can't" before the specific rules run.
    substitutions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        # (r"\'s", " is") is disabled
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase


def text_preprocessing(text):
    '''Clean a raw title / query string before embedding lookup.

       Steps, in order:
       1. drop every '<' and pass the text through BeautifulSoup,
       2. convert to lowercase,
       3. expand contractions with decontracted,
       4. replace every run of characters other than ASCII letters, "'", '.',
          '+', '#' and space with a single space (digits are removed too),
       5. lemmatize each whitespace-separated word.

       Function takes one parameter - text
       returns - preprocessed text; '' when text is not a string (NaN / None)
    '''

    # A missing title is not a str (pandas hands over NaN, a float, or None);
    # return an empty text instead of failing on text.replace.
    if not isinstance(text, str):
        return ""

    # Some titles (~42) start with '<' but have no closing '>',
    # e.g. '<asp: RegularExpressionValidator and RegexOptions.IgnorePatternWhitespace'.
    # BeautifulSoup gives an empty string for such text, so '<' is removed first.
    # NOTE(review): with every '<' gone, no complete tag is left for the parser
    # to strip -- confirm whether real tag removal is still wanted here.
    text = text.replace("<", "")
    text = bs4.BeautifulSoup(text, 'lxml').get_text()
    # Convert each word to lowercase
    text = text.lower()
    # Text decontraction, e.g. "won't" -> "will not", "can't" -> "can not"
    text = decontracted(text)
    # Keep '+' for c++, '.' for .net / vb.net, '#' for c#.
    # Everything else becomes a space (not ''), so words separated only by
    # punctuation ("sort-dictionary", "a/b") stay separate tokens instead of
    # being glued into one word.
    text = re.sub("[^a-zA-Z'.+# ]+", ' ', text)

    # Why lemmatization is chosen over stemming:
    # https://stackoverflow.com/questions/1787110/what-is-the-difference-between-lemmatization-vs-stemming
    lemmatizer = WordNetLemmatizer()

    # split() + join also collapses the extra spaces introduced above and
    # leaves no leading/trailing whitespace.
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())

In [9]:
%%time
df['Cleaned_Titles'] = df['Title'].swifter.apply(lambda x: text_preprocessing(x))

In [10]:
# Show the title before and after cleaning for a few row positions,
# with a separator line between consecutive examples.
separator = "_____________________________________________________________"
for position, row_idx in enumerate((2, 3, 1000)):
    if position > 0:
        print(separator)
    print("Original: ", df['Title'].iloc[row_idx])
    print("Cleaned: ", df['Cleaned_Titles'].iloc[row_idx])

## Pre-trained Glove Embeddings
- Stanford's GloVe embeddings

In [11]:

# Please use the code below to load the GloVe vectors.
# NOTE: pickle.load can execute arbitrary code while loading -- only use a
# glove_vectors file that comes from a trusted source.
with open('../input/pretrainedglove/glove_vectors', 'rb') as f:
    model = pickle.load(f)
glove_words = set(model.keys())

print("No. of unique words:",len(glove_words))

# Import the corpus under an alias: the name `stopwords` is reused below for
# the word list, and reading the words through the alias keeps this cell
# re-runnable after that name no longer refers to the nltk module.
from nltk.corpus import stopwords as nltk_stopwords
# A set gives constant-time membership tests in get_embedding.
stopwords = set(nltk_stopwords.words("english"))

In [12]:
def get_embedding(sentence):
    '''Average the pre-trained GloVe vectors of the words in a sentence.

       Words come from sentence.split(); a word contributes only if it is in
       glove_words and is not in stopwords.

       Function accepts only one parameter - sentence (text input)
       returns - the element-wise mean of the selected word vectors
                 (300 dim for the GloVe file used here -- TODO confirm),
                 or np.nan when no word of the sentence qualifies'''

    word_vectors = [
        model[word]
        for word in sentence.split()
        if (word in glove_words) and (word not in stopwords)
    ]
    if not word_vectors:
        # Averaging an empty array also produces nan, but numpy emits a
        # "Mean of empty slice" RuntimeWarning each time. Return the same
        # missing-value marker explicitly so such rows can still be dropped
        # with dropna().
        return np.nan
    return np.array(word_vectors).mean(axis=0)
    

### Computing Sentence Embedding

In [13]:
# One averaged GloVe vector per cleaned title (NaN when no word is usable).
df['Sentence_Embedding'] = df['Cleaned_Titles'].swifter.apply(get_embedding)

In [14]:
df['Sentence_Embedding'].head()

In [15]:
df['Sentence_Embedding'].iloc[0].shape

In [16]:
df.isna().sum()

In [17]:
df.dropna(inplace=True)

In [18]:
df.columns


In [19]:
df.shape

In [None]:
# %%time
# df[['Cleaned_Titles','Sentence_Embedding','Question_Id','Title']].to_csv('cleaned_titles.csv', index=False)
# # %%time
# # df[['Cleaned_question_corpus','Sentence_Embedding','Title']].to_csv('qcorpus_embedding.csv', index=False)

### Get top 5 similar questions given a user query

In [20]:
def get_similar_questions(query, top_n=5):
    ''' Function to accept a user query and show the most similar questions along with their cosine similarity score.
        Function accepts: query (text input) and top_n (number of results to show, default 5).
        Processing: Text preprocessing of query, compute sentence embedding using avg pre-trained glove embeddings,
                    compare it with every row of df['Sentence_Embedding'].
        Side effect: stores the scores of the latest query in df['Cosine_sim']; the row order of df is left untouched.
        Returns: None. Prints the [Title, Cosine_sim] values of the top_n rows, or a short message when the
                 query contains no word with a known embedding.
    '''
    preprocessed_query = text_preprocessing(query)
    query_embedding = np.asarray(get_embedding(preprocessed_query))
    # get_embedding yields NaN when no query word is in the vocabulary;
    # cosine_similarity rejects NaN input, so stop here with a clear message.
    if np.isnan(query_embedding).any():
        print("No similar questions found: the query has no words known to the embedding model.")
        return
    title_embeddings = np.array(df['Sentence_Embedding'].tolist())
    # cosine_similarity returns shape (1, n_rows); row 0 holds one score per title.
    df['Cosine_sim'] = cosine_similarity(query_embedding.reshape(1, -1), title_embeddings)[0]
    # Sort only a two-column copy so the global frame is not re-ordered on every query.
    top_matches = df[['Title', 'Cosine_sim']].sort_values(by='Cosine_sim', ascending=False).head(top_n)
    print(top_matches.values)


In [21]:
%%time
query = 'python sort dictionary'
get_similar_questions(query)

In [22]:
%%time
query = 'CSS Performance'
get_similar_questions(query)

In [23]:
%%time
query = 'python convert date to datetime'
get_similar_questions(query)

In [24]:
%%time
query = 'how to create list of lists in python'
get_similar_questions(query)

In [25]:
%%time
get_similar_questions('pd.melt() not working python')

In [26]:
%%time
query = 'try: 22/0 except Exception as e:print("Error! Code: {c}, Message, {m}".format(c = e.code, m = str(e))'
get_similar_questions(query)

In [27]:
%%time
get_similar_questions('def main(): return {a:1, b:2}')

In [28]:
%%time
get_similar_questions('import KNN \
                       knn= KNN(n=4) \
                       knn.fit(Xtrain, ytrain)')

## Inference:-

#### For query - 'python sort dictionary'
- The first question in the result set matches the query exactly, so the model performed very well.
- The other questions are also related.

#### For query - 'CSS Performance'
- The results are very similar to the query.

#### For query -  'python convert date to datetime'
- The result set contains a similar question.

#### For query - 'how to create list of lists in python'
- The result set includes the question "list of lists in python".
- The ranking of the results could be better.
