In [20]:
"""
In this notebook I play with stackoverflow questions dataset.
The data set is so big,so i get a subset of dataset.
Logic is so simple,
Encode questions with a defined embedding, of dimension 100
Compare all questions with cosine with a search query.
starspace_embedding.tsv must be downloaded 
"""

import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances_argmin
import os
import nltk
import pickle
import re
import numpy as np
import csv

In [2]:
with open("D:/nndata2/tagged_posts.tsv","r", encoding='cp932', errors='ignore' ) as fp:
    lines = fp.readlines()

In [3]:
#some lines have length 4 so only get ones with length 3
splitted = [line.strip().split('\t')  for line in lines  if len(line.split('\t')) == 3 ]

In [6]:
posts_df = pd.DataFrame(splitted,columns=["post_id","title","tag"])
print("len posts_df",len(posts_df))
posts_df.head(5)

len posts_df 2171574


Unnamed: 0,post_id,title,tag
0,post_id,title,tag
1,9,Calculate age in C#,c#
2,16,Filling a DataSet or DataTable from a LINQ que...,c#
3,39,Reliable timer in a console application,c#
4,42,Best way to allow plugins for a PHP application,php


In [7]:
counts_by_tag = posts_df.groupby(['tag']).count()
counts_by_tag

Unnamed: 0_level_0,post_id,title
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
c#,394450,394450
c_cpp,281300,281300
java,383456,383456
javascript,375866,375866
php,321752,321752
python,208607,208607
r,36359,36359
ruby,99930,99930
swift,34809,34809
tag,1,1


In [11]:
filtered = posts_df[posts_df.tag.isin(["c#","java","python"])]

In [12]:
filtered.head(4)

Unnamed: 0,post_id,title,tag
1,9,Calculate age in C#,c#
2,16,Filling a DataSet or DataTable from a LINQ que...,c#
3,39,Reliable timer in a console application,c#
5,59,"How do I get a distinct, ordered list of names...",c#


In [13]:
counts_by_tag = filtered.groupby(['tag']).count()
counts_by_tag

Unnamed: 0_level_0,post_id,title
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
c#,394450,394450
java,383456,383456
python,208607,208607


In [14]:
filtered2 = filtered[0:200000]

In [15]:
counts_by_tag = filtered2.groupby(['tag']).count()
counts_by_tag

Unnamed: 0_level_0,post_id,title
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
c#,98358,98358
java,66754,66754
python,34888,34888


In [16]:
posts_df = filtered2

In [80]:
def question_to_vec(question, embeddings, dim):
    vec = np.zeros((dim,), dtype=np.float32)
    count = 0
    for w in question.split():
        if w in embeddings:
            count += 1
            vec += embeddings[w]
    if count == 0:
        return vec
    return vec/count

def load_embeddings(embeddings_path):
    embeddings = {}
    with open(embeddings_path, newline='') as embedding_file:
        reader = csv.reader(embedding_file, delimiter='\t')
        for line in reader:
            word = line[0]
            embedding = np.array(line[1:]).astype(np.float32)
            embeddings[word] = embedding
        dim = len(line) - 1
    return embeddings, dim


def text_prepare(text):
    """Performs tokenization and simple preprocessing."""
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    stopwords_set = set(stopwords.words('english'))

    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])

    return text.strip()

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cmustafa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
starspace_embeddings , embeddings_dim = load_embeddings('D:/nndata2/starspace_embedding.tsv')


In [24]:
tag = "c#" 
count = 98358
tag_posts = posts_df[posts_df['tag'] == tag]
tag_post_ids = tag_posts['post_id'].values
print("len(tag_post_ids)",len(tag_post_ids))
tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
for i, title in enumerate(tag_posts['title']):
    v = question_to_vec(title, starspace_embeddings, embeddings_dim)         
    tag_vectors[i, :] = v

len(tag_post_ids) 98358


In [26]:
tag_vectors[0]

array([-0.121911  , -0.0148207 ,  0.0685611 , -0.0391922 ,  0.0207903 ,
       -0.0263119 , -0.0491494 ,  0.00619848,  0.0694078 , -0.160971  ,
       -0.0996862 , -0.00131709,  0.02722   , -0.0355591 ,  0.00401683,
        0.142795  , -0.0803947 ,  0.0476321 ,  0.0112376 ,  0.0156932 ,
       -0.00610658, -0.0823146 ,  0.0976203 , -0.0572465 ,  0.0509001 ,
        0.123337  ,  0.0711459 ,  0.106239  , -0.0286553 ,  0.00737424,
        0.149057  ,  0.0738868 ,  0.0742279 ,  0.0444604 ,  0.0210809 ,
       -0.034772  , -0.202033  , -0.150109  , -0.0620737 , -0.0197366 ,
        0.0943944 , -0.119607  ,  0.0213234 ,  0.0242412 , -0.0424364 ,
       -0.181983  ,  0.10005   , -0.0628281 ,  0.0928079 ,  0.108563  ,
       -0.187201  , -0.123544  , -0.0813036 ,  0.156296  ,  0.0786888 ,
        0.0934746 ,  0.0757428 ,  0.126971  ,  0.129636  , -0.117174  ,
       -0.101729  ,  0.0552807 ,  0.0580586 ,  0.0523162 , -0.11047   ,
       -0.095839  , -0.147583  ,  0.0798012 , -0.0705922 , -0.08

In [27]:
type(starspace_embeddings)

dict

In [28]:
list(starspace_embeddings.keys())[0:10]

['using',
 'android',
 'file',
 'java',
 'get',
 'error',
 'php',
 'use',
 'javascript',
 'python']

In [31]:
print(starspace_embeddings['python'])
counts_by_tag

[-0.0824964   0.0174975  -0.0582816  -0.00762401 -0.0347168  -0.0284079
  0.00320952 -0.0248444   0.00890259  0.0155737   0.0156591  -0.0763307
 -0.0525748   0.0251436  -0.0601807   0.0453156  -0.0508035  -0.0970938
 -0.00843726 -0.0113547   0.0104599   0.0206409  -0.0479884   0.0261706
  0.00947456 -0.0144043  -0.00072671  0.0178546   0.0276209   0.0333171
  0.0231061  -0.025076    0.0025155   0.0430403  -0.0074121   0.0810426
  0.0899528   0.019445   -0.103867    0.00639882  0.114456    0.00398528
 -0.037371    0.00700451  0.0457162   0.00477849 -0.00839937  0.0513829
  0.023151    0.0215419  -0.0224248   0.0768661   0.0155577  -0.0351739
  0.0935909   0.0317492   0.104912   -0.00737383  0.00053634 -0.0351506
  0.102329    0.0230716   0.0412404   0.107931   -0.0479629  -0.0722398
  0.0218439   0.0808713  -0.0346209   0.162488    0.0456327   0.126931
 -0.00448303  0.0335758  -0.058264    0.113977   -0.0568185  -0.0493557
 -0.0004806   0.0143738  -0.0232347   0.0633993   0.0554496  -0.

Unnamed: 0_level_0,post_id,title
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
c#,98358,98358
java,66754,66754
python,34888,34888


In [None]:
all_tags = []
for tag, count in counts_by_tag.items():
    print("tag",tag)
    tag_posts = posts_df[posts_df['tag'] == tag]
    
    tag_post_ids = tag_posts['post_id'].values
    
    #tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title'].values):        
        #tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim) 
        all_tags.append(question_to_vec(title, starspace_embeddings, embeddings_dim) )

In [66]:
counts_by_tag = posts_df['tag'].value_counts().to_dict()
counts_by_tag

all_tags = []
for tag, count in counts_by_tag.items():
    print("tag",tag)
    tag_posts = posts_df[posts_df['tag'] == tag]
    
    tag_post_ids = tag_posts['post_id'].values
    
    for i, title in enumerate(tag_posts['title'].values):        

        all_tags.append(question_to_vec(title, starspace_embeddings, embeddings_dim) )


tag c#
tag java
tag python


In [137]:
all_tags = []
for item in posts_df.values:
    #print(item[0],item[1],item[2])
    item_id = item[0]
    item_title = item[1]
    item_tag = item[2]
    all_tags.append(question_to_vec(item_title, starspace_embeddings, embeddings_dim) )

'How to download attachment from gmail using python?'

In [158]:
question = "Reliable timer in a console application"
#question = "List object returning null"
question = text_prepare(question)
question_vec = question_to_vec(question, starspace_embeddings, embeddings_dim)
best_thread = pairwise_distances_argmin(
            X=question_vec.reshape(1, embeddings_dim),
            Y=all_tags,
            metric='cosine'
        )
best_thread
posts_df.iloc[best_thread[0]]["title"]

'Reliable timer in a console application'

In [163]:
posts_df[0:10]

Unnamed: 0,post_id,title,tag
1,9,Calculate age in C#,c#
2,16,Filling a DataSet or DataTable from a LINQ que...,c#
3,39,Reliable timer in a console application,c#
5,59,"How do I get a distinct, ordered list of names...",c#
6,109,Decoding T-SQL CAST in C#/VB.NET,c#
8,174,How do I print an HTML document from a web ser...,c#
9,260,Adding scripting functionality to .NET applica...,c#
11,289,How do you sort a dictionary by value?,c#
13,337,XML Processing in Python,python
16,482,WinForms ComboBox data binding gotcha,c#


In [156]:
def get_best_answer(question):
    question = text_prepare(question)
    question_vec = question_to_vec(question, starspace_embeddings, embeddings_dim)
    best_thread = pairwise_distances_argmin(
            X=question_vec.reshape(1, embeddings_dim),
            Y=all_tags,
            metric='cosine'
        )
    return posts_df.iloc[best_thread[0]]["title"]

In [157]:
get_best_answer("How do I print an HTML document from a web service")

'How do I print an HTML document from a web service?'

In [160]:
get_best_answer("web service HTML document ")

'Passing XML document to a web service using WCF'

In [161]:
get_best_answer("from a web service  an HTML document How do I print")

'How do I print an HTML document from a web service?'

In [162]:
get_best_answer(" web service   HTML document   I print")

'How do I print an HTML document from a web service?'

In [164]:
get_best_answer("How heap memory management works")

'Java JDBC clearBatch() and heap memory'

In [165]:
get_best_answer("How immutable string works")

'Make Hashtable immutable'

In [166]:
get_best_answer("Perfomance of Map vs List")

'Python PIL library perfomance'

In [167]:
get_best_answer("Comparision of Map vs List")

'How to map one list to another in python?'

In [168]:
get_best_answer("Null pointer exception")

'null pointer exception at org.hibernate.tuple.AbstractEntityTuplizer.createProxy'

In [169]:
get_best_answer("Access db from console application")

'Connecting console application to access database problems'