In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import csv


DATA_PATH = Path('data/')
MODEL_PATH = Path('models/')

In [2]:
questions = pd.read_csv('data/Questions.csv', encoding='latin1')
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [3]:
tags = pd.read_csv('data/Tags.csv', encoding='latin1')
tags['Tag'] = tags['Tag'].astype(str)
tag_series = tags.groupby('Id', as_index=True)['Tag'].apply(lambda x: ' '.join(x))
tag_series = pd.DataFrame(tag_series).reset_index()
q_shape = len(questions)
questions = pd.merge(questions, tag_series, how='left',  on='Id')
assert len(questions) == q_shape
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body,Tag
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,python osx fonts photoshop
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,python windows image pdf
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,python continuous-integration extreme-programming
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,python sql database oracle cx-oracle
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,python arrays iteration


In [4]:
answers = pd.read_csv('data/Answers.csv', encoding='latin1')
answers.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."


In [15]:
name = 'StackOverflow_python'

connection = sqlite3.connect('{}.db'.format(name))
c = connection.cursor()

c.execute("SELECT COUNT(*) FROM posts;")
print(c.fetchone())
c.close()
connection.close()
# df = pd.read_sql("SELECT * FROM posts LIMIT 100", connection)
# df.head()

(2976535,)


In [7]:
all_not_included = pd.read_csv(DATA_PATH/'all_not_included.csv')
all_not_included.head()

Unnamed: 0,comment,comment_id,parent_id,score,tags,title
0,<p>I've recently starting playing around with ...,287312,,10,python documentation tkinter tix,Tix documentation for Python
1,"<p>If I do a google search with the string ""p...",354124,,44,python productivity,Are there statistical studies that indicates t...
2,<p>I'm trying to write my first little plugin ...,370733,,12,python documentation trac,Trac documentation?
3,<p>I am looking for best practices for functio...,405582,,68,python documentation coding-style,Function and class documentation best practice...
4,<p>Of the two which one would exposed someone ...,527134,,6,java python programming-languages,Python or java which language will exposed a s...


In [21]:
name = 'StackOverflow_python'
connection = sqlite3.connect('{}.db'.format(name))


def get_comment(ids_):
    all_comment_ids = []
    for data in ids_:
#         print(data)
        all_comment_ids.extend([x for x in data['comment_id']])
    return all_comment_ids

df = pd.read_sql("SELECT comment_id FROM posts", connection, chunksize=1000)
all_comment_ids = get_comment(df)

In [22]:
len(all_comment_ids)

2976535

In [23]:
q_ids = list(questions.Id.values)
diff = list(set(q_ids).difference(set(all_comment_ids)))

In [26]:
not_included = questions[questions.Id.isin(diff)].rename(
                    columns={'Id': 'comment_id', 'Body': 'comment', 'Title': 'title', 'Score': 'score', 'Tag': 'tags'})

not_included.drop(['OwnerUserId', 'CreationDate'], axis=1, inplace=True)
not_included['parent_id'] = np.nan
not_included.head()

Unnamed: 0,comment_id,score,title,comment,tags,parent_id
1256,287312,10,Tix documentation for Python,<p>I've recently starting playing around with ...,python documentation tkinter tix,
1638,354124,44,Are there statistical studies that indicates t...,"<p>If I do a google search with the string ""p...",python productivity,
1738,370733,12,Trac documentation?,<p>I'm trying to write my first little plugin ...,python documentation trac,
1941,405582,68,Function and class documentation best practice...,<p>I am looking for best practices for functio...,python documentation coding-style,
2699,527134,6,Python or java which language will exposed a s...,<p>Of the two which one would exposed someone ...,java python programming-languages,


In [27]:
not_included_answers = answers[answers.ParentId.isin(not_included.comment_id.tolist())]
not_included_answers.rename(columns={'Id': 'comment_id', 'ParentId': 'parent_id', 'Score': 'score', 'Body': 'comment'},
                           inplace=True)
not_included_answers.head()
all_not_included = pd.concat((not_included, not_included_answers[['comment_id', 'parent_id', 'score', 'comment']]), axis=0)
all_not_included.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,comment,comment_id,parent_id,score,tags,title
1256,<p>I've recently starting playing around with ...,287312,,10,python documentation tkinter tix,Tix documentation for Python
1638,"<p>If I do a google search with the string ""p...",354124,,44,python productivity,Are there statistical studies that indicates t...
1738,<p>I'm trying to write my first little plugin ...,370733,,12,python documentation trac,Trac documentation?
1941,<p>I am looking for best practices for functio...,405582,,68,python documentation coding-style,Function and class documentation best practice...
2699,<p>Of the two which one would exposed someone ...,527134,,6,java python programming-languages,Python or java which language will exposed a s...


In [38]:
all_not_included = all_not_included[all_not_included.comment_id.isin(list(set(all_not_included.comment_id).difference(set(all_comment_ids))))]

In [None]:
name = 'StackOverflow_python'
connection = sqlite3.connect('{}.db'.format(name))

all_not_included.to_sql(name='posts', con=connection, if_exists='append', index=False, chunksize=1000)

In [5]:
%%time
import nltk
import pickle
import re
import numpy as np
import html
import sqlite3

# nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))



name = 'StackOverflow_python'
sql_transactions = []
start_row = 0
cleanup = 1000000

connection = sqlite3.connect('{}.db'.format(name))
c = connection.cursor()

def create_table():
    c.execute("""
              CREATE TABLE IF NOT EXISTS posts
              (comment_id INT PRIMARY KEY, parent_id INT, 
              comment TEXT, title TEXT, score INT, tags TEXT)
              """) 


        
def transaction_bldr(sql, size=1000):
    global sql_transactions
    sql_transactions.append(sql)
    if len(sql_transactions) > size:
        c.execute('BEGIN TRANSACTION')
        for s in sql_transactions:
            try:
                c.execute(s)
            except:
                pass
        connection.commit()
        sql_transactions = []
        
        
def sql_insert_has_parent(commentid, parentid, comment, score):
    try:
        sql = """
              INSERT INTO posts (comment_id, parent_id, comment, score) 
              VALUES ({},{},"{}",{});
              """.format(int(commentid), int(parentid), comment, int(score))
        transaction_bldr(sql)
    except Exception as e:
        print('s-PARENT insertion', str(e))

        
def sql_insert_no_parent(commentid, comment, title, score, tags):
    try:
        sql = """
              INSERT INTO posts (comment_id, comment, title, score, tags) 
              VALUES ({},"{}","{}",{},"{}");
              """.format(int(commentid), comment, title, int(score), tags)
        transaction_bldr(sql)
    except Exception as e:
        print('s-NO-PARENT insertion', str(e))
        
        
def find_parent(pid):
    try:
        sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
        c.execute(sql)
        result = c.fetchone()
        if result != None:
            return result[0]
        else: return False
    except Exception as e:
        #print(str(e))
        return False

    

def populate(column):
    old_name = 'StackOverflow'
    old_connection = sqlite3.connect('{}.db'.format(old_name))
    
    df = pd.read_sql("SELECT {} FROM posts".format(column), 
                 old_connection, chunksize=10000)
    preprocess(df)
    return None
    
all_good_tags_reduced = ['python', 'django', 'python-3.x', 'python3x', 'pythonx', 'python-2.7', 'python27', 
                   'apache-spark', 'apachespark', 'pandas', 'numpy', 'matplotlib', 'flask', 'tensorflow', 
                   'machine-learning', 'machinelearning', 'nlp', 'cuda', 'neural-network', 'neuralnetwork', 
                   'artificial-intelligence', 'artificialintelligence', 'deep-learning', 'deeplearning']


def binarySearch(alist, item):
    first = 0
    last = len(alist) - 1
    found = False

    while first <= last and not found:
        midpoint = (first + last) // 2
        if alist[midpoint] == item:
            found = True
        else:
            if item < alist[midpoint]:
                last = midpoint - 1
            else:
                first = midpoint + 1

    return found

parent_comment_ids = []
def preprocess(df):
    
    for i, data in enumerate(df):  
        for tag_, comment_id_, title_, comment_, parent_id_, score_ in zip(
                                                data['tags'], data['comment_id'], data['title'], 
                                                data['comment'], data['parent_id'], data['score']):
            for g_t in all_good_tags_reduced:
                if tag_:
                    # this is a parent comment with title and tag (no parent_id) 
                    if g_t in tag_.split():
                        parent_comment_ids.append(comment_id_)
                        sql_insert_no_parent(comment_id_, comment_, title_, score_, tag_)     
#             binary search is much faster (16.9 s on 1 mill vs 1min 49s)
            if binarySearch(parent_comment_ids, parent_id_):
                # this is a answer comment with parent_id (no title and no tags)
                sql_insert_has_parent(comment_id_, parent_id_, comment_, score_)


create_table()
populate('comment_id, parent_id, comment, title, score, tags')

CPU times: user 5min 56s, sys: 19.6 s, total: 6min 16s
Wall time: 7min 12s


In [None]:
# include rows that are not in my dataset from the Question.csv Kaggle one. Only 15566 rows

In [18]:
diff = list(set(q_ids).difference(set(df_ids)))

In [17]:
q_ids = list(questions.Id.values)
df_ids = list(df.comment_id.values)

In [20]:
diff = list(set(q_ids).difference(set(df_ids)))
not_included = questions[questions.Id.isin(diff)].rename(
                    columns={'Id': 'comment_id', 'Body': 'comment', 'Title': 'title', 'Score': 'score', 'Tag': 'tags'})

not_included.drop(['OwnerUserId', 'CreationDate'], axis=1, inplace=True)
not_included['parent_id'] = np.nan
not_included.head()
# df = pd.concat((df[['comment_id', 'title', 'comment']], not_included), axis=0)
# df.reset_index(inplace=True)

Unnamed: 0,comment_id,score,title,comment,tags
1256,287312,10,Tix documentation for Python,<p>I've recently starting playing around with ...,python documentation tkinter tix
1638,354124,44,Are there statistical studies that indicates t...,"<p>If I do a google search with the string ""p...",python productivity
1738,370733,12,Trac documentation?,<p>I'm trying to write my first little plugin ...,python documentation trac
1941,405582,68,Function and class documentation best practice...,<p>I am looking for best practices for functio...,python documentation coding-style
2699,527134,6,Python or java which language will exposed a s...,<p>Of the two which one would exposed someone ...,java python programming-languages


In [43]:
not_included_answers = answers[answers.ParentId.isin(not_included.comment_id.tolist())]
not_included_answers.rename(columns={'Id': 'comment_id', 'ParentId': 'parent_id', 'Score': 'score', 'Body': 'comment'},
                           inplace=True)
not_included_answers.head()
all_not_included = pd.concat((not_included, not_included_answers[['comment_id', 'parent_id', 'score', 'comment']]), axis=0)
all_not_included.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,comment,comment_id,parent_id,score,tags,title
1256,<p>I've recently starting playing around with ...,287312,,10,python documentation tkinter tix,Tix documentation for Python
1638,"<p>If I do a google search with the string ""p...",354124,,44,python productivity,Are there statistical studies that indicates t...
1738,<p>I'm trying to write my first little plugin ...,370733,,12,python documentation trac,Trac documentation?
1941,<p>I am looking for best practices for functio...,405582,,68,python documentation coding-style,Function and class documentation best practice...
2699,<p>Of the two which one would exposed someone ...,527134,,6,java python programming-languages,Python or java which language will exposed a s...


In [55]:
all_not_included.to_csv(DATA_PATH/'all_not_included.csv', index=False)

In [21]:
save_df = False
if save_df:    
    df.to_csv(DATA_PATH/'python_df.csv', index=False)

In [22]:

def compute_tfidf(X, X_test=None, save_path=MODEL_PATH/'tf_idf_python_title_stopwords.pkl', load=True, save=False):
    
    if load:
        with open(save_path, mode='rb') as f:
            vect = pickle.load(f) 
    else:
        vect = TfidfVectorizer(token_pattern='(\S+)', min_df=5, max_df=0.9, ngram_range=(1,1))
        vect.fit(X)
        if save:
            # save vect
            with open(save_path, mode='wb') as f:
                pickle.dump(vect, f)
            print('SAVED')

    X = vect.transform(X)
    if X_test: 
        X_test = vect.transform(X_test)
        return X, X_test, vect   
    return X, vect


X, vect = compute_tfidf(df.title, load=True, save=False)
idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))
print(len(idf_scores))

SAVED
33791


In [23]:
def get_embeddings(filename):
    embeddings = {}
    with open(MODEL_PATH/filename, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        embed_list = list(reader)
    for line in embed_list:
        embeddings[line[0]] = np.asarray(line[1:], dtype=np.float32)
    return embeddings

# embeddings = get_embeddings('starspace_embedding300_ngram2.tsv')
embeddings = get_embeddings('starspace_embedding100_ngram2.tsv')

In [24]:
def avg_word_vectors(question, embeddings, dim):
    words_embedding = [embeddings[word] for word in question.lower().split() if word in embeddings]
    if not words_embedding:
        return np.zeros(dim)
    words_embedding = np.array(words_embedding).astype(np.float32)
    return words_embedding.mean(axis=0)

def average_tfidf_vectors(question, embeddings, dim, vect):
    # get idf weights
    split_question = [word for word in question.lower().split() if word in embeddings]
    if not split_question:
        return np.zeros(dim).astype(np.float32)
    words_embedding = np.zeros((dim, len(split_question))).astype(np.float32)
    for i, token in enumerate(split_question):
        if token in embeddings:
            embed_score = embeddings[token]
        else: embed_score = 0
        idf_score = idf_scores[token]
        # word vectors multiply by their TF-IDF scores
        words_embedding[:, i] = embed_score * idf_score    
    return words_embedding.mean(axis=1)

In [26]:
from torch.nn import CosineSimilarity
import torch

def rank_candidates(question, candidates, embeddings, dim=300, question_to_vec=avg_word_vectors, topk=5, 
                    return_score=False, save=False, *args, **kwargs):
    cos = CosineSimilarity(dim=1)
    question2vec = question_to_vec(question, embeddings, dim, *args, **kwargs)
    candidate2vecs = np.array([question_to_vec(cand, embeddings, dim, *args, **kwargs) for cand in candidates])
    if save:
        np.save(MODEL_PATH/'candidate2vecs_python_title.npy', candidate2vecs)
    candidate2vecs = np.load(MODEL_PATH/'candidate2vecs_python_title.npy')

    output = cos(torch.Tensor(question2vec.reshape(1, -1)).cuda(), torch.Tensor(candidate2vecs).cuda())
    output = output.cpu().numpy()
    print(output.shape)
    data = [(i, candidates[i]) for i in range(len(output))]   
    if return_score:
        output = [(x, score) for score, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
    else:
        output = [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
    if topk: return output[:topk]
    else: return output

In [27]:
%%time
# ex = questions.iloc[105].Title
# print(ex)
ex = 'How do I sort a list of strings in Python?'
clean_example = clean_title(ex)
print(rank_candidates(clean_example, df.title, embeddings, 100, average_tfidf_vectors, return_score=True, vect=vect))

None
CPU times: user 37.8 s, sys: 881 ms, total: 38.7 s
Wall time: 38.9 s


In [28]:
import nmslib

M = 1
efC = 1000
num_threads = 4
index_time_params = {'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}

# def create_index(a):
#     index = nmslib.init(space='cosinesimil')
#     index.addDataPointBatch(a)
#     index.createIndex()
#     return index

def get_knns(index, vecs):
    return zip(*index.knnQueryBatch(vecs, k=10, num_threads=4))

def get_knn(index, vec):
    return index.knnQuery(vec, k=10)

def create_index(a, space='cosinesimil', load=True, save=False, 
                 filepath=str(MODEL_PATH/'knn_embeddings_path/python_only.bin')):
    index = nmslib.init(space=space)
    if load:
        index.loadIndex(filepath)
        return index
    else:
        index.addDataPointBatch(a)
        index.createIndex()
        if save: index.saveIndex(filepath)
        return index

In [29]:
candidate2vecs = np.load(MODEL_PATH/'candidate2vecs_python_title.npy')
nms_index = create_index(candidate2vecs, space='cosinesimil', save=False, load=True)

In [40]:
len(df)

1644219

In [41]:
import os
embeddings_dim = 100


tag_posts = df.title.values
tag_post_ids = df.comment_id.astype(np.int32)
    
tag_vectors = np.zeros((len(df), embeddings_dim), dtype=np.float32)
    
for i, question in enumerate(tag_posts):
    tag_vectors[i, :] = average_tfidf_vectors(question, embeddings, embeddings_dim, vect)

pickle.dump((tag_post_ids, tag_vectors), (MODEL_PATH/'thread_embeddings_by_tags/python_only.pkl').open('wb'))

In [37]:
question = 'How do I sort a list of strings in Python?'
clean_question = clean_title(question)
question2vec = average_tfidf_vectors(clean_question, embeddings, 100, vect)

idxs, distances = get_knns(nms_index, [question2vec])
print(idxs)
print([(df.ix[i].title.values, d) for i, d in zip(idxs, distances)])

(array([ 994024, 1375075, 1643938, 1643446, 1643617, 1644155, 1643176,
       1643447, 1644112, 1640210], dtype=int32),)
[(array(['python sort list list', 'sort list list python',
       'one time pad keygen using randomshuffle',
       'typeerror remains str object callable',
       'generate tree structure websites catogary like dmoz',
       'aiohttp realize pagination',
       'python memory error using infinitely long browser session',
       'get values python file form using flask',
       'pyplot contour plot function coordinate vector',
       'python pybluez send data l2cap sockets without pairing devices'],
      dtype=object), array([0.15749317, 0.15749317, 0.7806451 , 0.82051265, 0.8970093 ,
       0.90215355, 0.9258046 , 0.95270646, 0.954098  , 0.9832047 ],
      dtype=float32))]


In [42]:
not_included.head()

Unnamed: 0,comment_id,title,comment
1256,287312,tix documentation python,<p>I've recently starting playing around with ...
1638,354124,statistical studies indicates python productive,"<p>If I do a google search with the string ""p..."
1738,370733,trac documentation,<p>I'm trying to write my first little plugin ...
1941,405582,function class documentation best practices py...,<p>I am looking for best practices for functio...
2699,527134,python java language exposed self taught progr...,<p>Of the two which one would exposed someone ...


Question-Answer ideas

- vector similarities based on average word vector
- number of matching words 
- cosine distance between TF-IDF vectors 
- Levenshtein distance

- symbolic n-grams (1 - 5)
 - SVD and take first 300 components 

In [46]:
from pathlib import Path
import sqlite3
import re 
import html
from datetime import datetime

In [47]:

name = 'StackOverflow_newline_score'
sql_transactions = []
start_row = 0
cleanup = 1000000

connection = sqlite3.connect('{}.db'.format(name))
c = connection.cursor()


re1 = re.compile(r'  +')

def clean_tags(x):
    x = x.replace('<', '').replace('>', ' ').replace('"',"'").strip()
    return re1.sub(' ', html.unescape(x))


def clean_text(x, remove_html=False):
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', x)
        x = re.sub(r'<[^>]*>', '', x)
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").strip()
    return re1.sub(' ', html.unescape(x))


def sql_insert_has_parent(commentid, parentid, comment, date, score):
    try:
        sql = """
              INSERT INTO posts (comment_id, parent_id, comment, date, score) 
              VALUES ({},{},"{}","{}",{});
              """.format(int(commentid), int(parentid), comment, date, int(score))
        transaction_bldr(sql)
    except Exception as e:
        print('s-PARENT insertion', str(e))

        
def sql_insert_no_parent(commentid, comment, title, date, score, tags):
    try:
        sql = """
              INSERT INTO posts (comment_id, comment, title, date, score, tags) 
              VALUES ({},"{}","{}","{}",{},"{}");
              """.format(int(commentid), comment, title, date, int(score), tags)
        transaction_bldr(sql)
    except Exception as e:
        print('s-NO-PARENT insertion', str(e))
        


In [44]:
subset_question = questions[questions.Id.isin(diff)]
subset_question['Body'] = subset_question['Body'].map(lambda x: clean_text(x))
subset_question = subset_question[['Id', '']].rename(
    columns={'Id': 'comment_id', 'Body': 'comment', 'Title': 'title', 'CreationDate': 'Date'})
subset_question.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
1256,287312,1694.0,2008-11-13T15:52:03Z,10,Tix documentation for Python,<p>I've recently starting playing around with ...
1638,354124,36131.0,2008-12-09T20:29:40Z,44,Are there statistical studies that indicates t...,"<p>If I do a google search with the string ""p..."
1738,370733,6583.0,2008-12-16T08:50:10Z,12,Trac documentation?,<p>I'm trying to write my first little plugin ...
1941,405582,32558.0,2009-01-01T22:30:39Z,68,Function and class documentation best practice...,<p>I am looking for best practices for functio...
2699,527134,62617.0,2009-02-09T04:48:10Z,6,Python or java which language will exposed a s...,<p>Of the two which one would exposed someone ...


In [None]:
subset_question.to_sql('posts', connection, if_exists='append')

Unnamed: 0,index,comment_id,title,comment
0,0,337,xml processing python,<p>I am about to build a piece of a project th...
1,1,469,find full path font display name mac,<p>I am using the Photoshop's javascript API t...
2,2,502,get preview jpeg pdf windows,<p>I have a cross-platform (Python) applicatio...
3,3,535,continuous integration system python codebase,<p>I am starting to work on a hobby project wi...
4,4,594,cx_oracle iterate result set,<p>There are several ways to iterate over a re...


In [49]:
tags = pd.read_csv('data/Tags.csv')
tags.head()

Unnamed: 0,Id,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


In [None]:
questions = pd.read_csv('data/Questions.csv', encoding='latin1')