In [1]:
import csv
import string
from collections import Counter
from html.parser import HTMLParser

import numpy as np
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Locations of the two input datasets (CSV files of posts)
data_file1 = "./data/D1.csv"
data_file2 = "./data/D2.csv"

# Shared NLP helpers: English stop-word lookup set and a Porter stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

#### Read the csv file and retrieve all posts:

In [4]:
def read_data(file):
  '''
  Read a CSV file of posts and return its data rows.

  The first CSV record is treated as a header and skipped.

  Parameters:
    file: path of the CSV file to read (decoded as UTF-8).

  Returns:
    A list with one entry per remaining CSV record; each entry is the list
    of column strings produced by csv.reader. An empty file yields [].
  '''
  # newline="" lets the csv module do its own newline handling, which is
  # needed for quoted fields that contain embedded line breaks.
  with open(file, "rt", encoding="utf8", newline="") as datafile:
    csvreader = csv.reader(datafile)
    # Skip the header as a CSV record (not as a raw text line) so a header
    # with a quoted newline is consumed whole; the default value avoids
    # StopIteration on an empty file.
    next(csvreader, None)
    return list(csvreader)

#### Read the text and code tags separately with html parser:

In [5]:
class HTMLDataParser(HTMLParser):
  '''
  Split an HTML fragment into two plain-text streams:
  1) code        - character data found inside <code> ... </code>
  2) normal text - all other character data, plus the attribute values of
                   every start tag other than <code>

  After feeding markup, read the results from `code_content` and
  `text_content`.
  '''
  def __init__(self):
    # initialize the base class
    HTMLParser.__init__(self)

    self.code_content = ""     # text collected while inside a <code> element
    self.text_content = ""     # text collected everywhere else
    self.is_code_tag = False   # True between a <code> start tag and its end tag

  def handle_starttag(self, tag, attrs):
    '''Enter code mode on <code>; otherwise collect the tag's attribute values.'''
    if tag == "code":
      self.is_code_tag = True
      return
    for _name, value in attrs:
      # attributes written without a value are reported with value None
      if value is not None:
        # Pad with spaces so the value is not glued onto neighbouring text
        # (e.g. href="http://x.y" followed by "link" must stay two tokens).
        self.text_content += " " + value + " "

  def handle_endtag(self, tag):
    '''Leave code mode when a </code> end tag is seen.'''
    if tag == "code":
      self.is_code_tag = False

  def handle_data(self, data):
    '''Append character data to the code or text stream, depending on mode.'''
    if self.is_code_tag:
      self.code_content += data
    else:
      self.text_content += data

<h4> Step1 : Data Clean </h4> 
<p>
   We clean the data by doing
    <ol>
        <li> split text and code into separate parts </li>
        <li> trim and remove punctuation in text </li>
        <li> lowercase all the text </li>
        </ol>
</p>

In [6]:
def data_clean(title, body):
  '''
  Parse the body as html and split it into code and text parts,
  then strip punctuation and tokenize both parts.

  Parameters:
    title: post title (plain text); None is treated as an empty title.
    body:  post body as an HTML string; None means "no body".

  Returns:
    (text_tokens, code_tokens) where text_tokens come from the lower-cased
    title plus the non-code text of the body, and code_tokens come from the
    content of <code> elements (case preserved).
  '''
  html_parser = HTMLDataParser()
  if body is not None:
    # No explicit unescape step: the old html_parser.unescape(body) call
    # discarded its result and the method no longer exists on Python 3.9+.
    html_parser.feed(body)
    # close() flushes any text the parser is still buffering at end of input
    html_parser.close()

  # Join with a space so the last title word and the first body word do not
  # merge into a single token once punctuation is stripped.
  text = " ".join((title or "", html_parser.text_content))
  code = html_parser.code_content

  # translate() with a deletion table removes every punctuation character
  strip_punctuation = str.maketrans('', '', string.punctuation)

  text_tokens = word_tokenize(text.lower().translate(strip_punctuation))
  code_tokens = word_tokenize(code.translate(strip_punctuation))

  return (text_tokens, code_tokens)

<h4> Step 2: Remove stop words </h4>
    <p> We can remove stop words only for text but not for code as tokens in code are very syntactic. </p>
    
<h4> Step 3. Stemming </h4>
<p> Apply stemming to tokens in text but not for source code since it has a predefined structure. </p>

In [7]:
def remove_stopwords_and_do_stemming(text_tokens, code_tokens):
  '''
  Drop stop words from the text tokens and stem whatever remains.
  Code tokens are handed back untouched.
  Returns (stemmed_text_tokens, code_tokens).
  '''
  kept_stems = [stemmer.stem(word) for word in text_tokens if word not in stop_words]
  return (kept_stems, code_tokens)

#### Preprocess the data:

<p> Preprocess the data by applying the above <b>three steps</b> one by one, and then filter out rare words (tokens outside the top-k most frequent in the entire corpus) as they don't add any value for document similarity. </p>
<p> Based on observations in Task-1, we can set filter size to 15000 for dataset-1 and 60000 for dataset-2 </p>

In [8]:
def preprocess_doc(title="", body=""):
  '''
  Run the per-document pipeline (clean, stop-word removal, stemming) and
  return the text tokens followed by the code tokens as one list.
  '''
  cleaned_text, cleaned_code = data_clean(title, body)
  stemmed_text, code_part = remove_stopwords_and_do_stemming(cleaned_text, cleaned_code)
  # text tokens first, code tokens appended after them
  return stemmed_text + code_part

def filter_unique_words(corpus, k=10000):
  '''
  Keep only the k most frequent tokens of the whole corpus in each document.

  Parameters:
    corpus: iterable of documents, each a list of tokens.
    k:      number of most frequent tokens to keep.

  Returns:
    A new list of token lists, one per input document and in the same
    order, with every token outside the top-k vocabulary removed.
    The input documents are not modified. An empty corpus yields [].
  '''
  # Frequency of every token across the whole corpus
  token_counts = Counter(token for doc in corpus for token in doc)

  # Vocabulary to keep; most_common() returns [] when nothing was counted,
  # so an empty corpus needs no special case
  top_k_tokens = {token for token, _count in token_counts.most_common(k)}

  # Build the filtered documents explicitly: rebinding the loop variable
  # (as the previous version did) leaves the corpus unchanged
  return [[token for token in doc if token in top_k_tokens] for doc in corpus]
  
def preprocess_corpus(corpus, k):
  '''
  Tokenize every document of the corpus with preprocess_doc, then pass the
  tokenized corpus through filter_unique_words with the given k.
  Each document is read as doc[0] (title) and doc[1] (body).
  '''
  tokenized_docs = [preprocess_doc(doc[0], doc[1]) for doc in corpus]
  return filter_unique_words(tokenized_docs, k)

#### Build TF-IDF matrix for both datasets:

In [33]:
def build_tf_idf(corpus, k):
  '''
  Preprocess the corpus and fit a TF-IDF vectorizer on the result.
  Returns (vectorizer, TF-IDF matrix of the corpus).
  '''
  preprocessed_docs = preprocess_corpus(corpus, k)
  # Documents are already token lists, so the tokenizer is the identity and
  # lower-casing is switched off; terms are 1-grams up to 3-grams
  vectorizer = TfidfVectorizer(tokenizer=lambda x:x, lowercase=False, ngram_range=(1,3))
  doc_term_matrix = vectorizer.fit_transform(preprocessed_docs)
  return (vectorizer, doc_term_matrix)

In [34]:
# Load the rows of each dataset file into memory
corpus1, corpus2 = read_data(data_file1), read_data(data_file2)

In [35]:
# Fit one TF-IDF model per corpus (this takes a while)
tf_idf1, data1_vec = build_tf_idf(corpus1, k=15000)
tf_idf2, data2_vec = build_tf_idf(corpus2, k=60000)

  after removing the cwd from sys.path.


In [36]:
def cosine_similarity(query_tf_idf, corpus_tf_idf, k=5):
  '''
  Score the corpus against the query with linear_kernel, flatten the scores
  and return the indices of the k largest ones, best match first.
  '''
  scores = linear_kernel(query_tf_idf, corpus_tf_idf).flatten()
  # argsort is ascending, so reverse it to rank the best score first
  ranked_best_first = scores.argsort()[::-1]
  return ranked_best_first[:k]

<h4> Query match: </h4>
<p>
Given a query post with title and body, calculate its TF-IDF vector in the vector space of the already prepared corpus.
Then take the cosine similarity between the query post vector and the corpus to return the top k matches.
</p>

In [40]:
# Interactive query: ask for a dataset, a post title/body and display options,
# then print the corpus posts most similar to the entered post.
print("To match with existing questions already in corpus, enter dataset no.,  title and body . \n\n")
inp_dataset = input("Dataset 1 (or) 2 : \n").strip()
inp_title = input("\nEnter the title to match :\n")
inp_body = input("\nEnter the body to match :\n")

show_tb = input("\nTo show only matched posts titles, press 1 \n or to see both title & body press 2: \n").strip()

show_topk =  input("Enter the number of top posts that matched to show :\n")

# Pick the vectorizer, TF-IDF matrix and raw posts of the chosen dataset
if inp_dataset == "1":
  (tf_idf, data_vec, corpus) = (tf_idf1, data1_vec, corpus1)
elif inp_dataset == "2":
  (tf_idf, data_vec, corpus) = (tf_idf2, data2_vec, corpus2)
else:
  raise Exception("Input data set option is 1 or 2")

query_tokens = preprocess_doc(inp_title, inp_body) # Preprocess the input query title and body
# Project the query into the vector space fitted on the chosen corpus
query_vec = tf_idf.transform([query_tokens])
indices = cosine_similarity(query_vec, data_vec, int(show_topk)) # Take cosine similarity between query and posts

print("Similar posts matching the query (best match at top) : \n\n")
for i in indices:
  print("Title : " + corpus[i][0])
  # input() returns a string, so compare with "2"; comparing with the
  # integer 2 never matched and the body was never shown
  if show_tb == "2":
    print("Body : " + corpus[i][1])
  print("---------------------------")

To match with existing questions already in corpus, enter dataset no.,  title and body . 


Dataset 1 (or) 2 : 
2

Enter the title to match :
What can be saved before factory reset?

Enter the body to match :
"<p>My phone has a GPS problem and service says the need to wipe it completely.<br> They are probably just lazy and would rather do it the easy way.<br> C'est la vie.</p>  <p>The phone is SGS unrooted 2.3.4 - Touch Wiz. I don't want to root it.</p>  <p>Now, what I am going to do is this:<br> - Save contacts with Kies<br> - Save files</p>  <p>Is there anything else I can do or that I should know?</p>  <p>For example, I believe that apps associated with the Google Account will be reinstalled after I re-enter my account into the newly formatted phone, correct? However, app data like savegames won't be ported, I have to search for them in the phone memory?</p>  <p>Can I backup SMS?</p>  <p>Can I back-up settings?</p>  <p>Will imported contacts keep all fields like I have them now, i.e

  after removing the cwd from sys.path.


Similar posts matching the query (best match at top) : 


Title : What can be saved before factory reset?
---------------------------
Title : How to add new contacts to an outlook.com account in an Android device?
---------------------------
Title : Saved my contacts in google account but they don't appear on my new phone
---------------------------
Title : Android 7 - How to save new contacts to the phone
---------------------------
Title : Is it possible to backup settings and apps for SGS with Samsung Kies?
---------------------------


#### Topic modeling with LDA:

<h5>Observations : </h5>
<p style="text-decoration: underline">
As each corpus is taken from a single subject area (dataset1 is from AI/machine learning and dataset2 is about Android), all the documents are mapped to a single topic. For this reason, filtering queries in this corpus using topic modeling is not effective.
</p>

In [31]:
# Fit a 10-topic LDA model on the TF-IDF matrix selected in the query cell
# (uses data_vec and query_vec, so that cell must have been run first).
lda = LatentDirichletAllocation(n_components=10, n_jobs=-1, random_state=0)
doc_topic_prob = lda.fit_transform(data_vec)

# Most probable topic of the query post
query_topic = lda.transform(query_vec).argmax()

# Most probable topic of a few randomly picked corpus documents.
# The sample size is named once so the printed label cannot drift from the
# number actually sampled, and the generator is seeded so reruns agree.
n_random_docs = 4
rng = np.random.RandomState(0)
rand_doc_topics = [
  int(doc_topic_prob[i].argmax())
  for i in rng.randint(0, data_vec.shape[0], n_random_docs)
]

print("query topic: " + str(query_topic))
print(str(n_random_docs) + " random documents topics: " + str(rand_doc_topics))



query topic: 0
10 random documents topics: [0, 0, 0, 0]
