In [23]:
# Understanding tf-idf implementation from the blog of Kavita Ganesan
import pandas as pd

# read json into a dataframe
df_idf=pd.read_json("data/stackoverflow-data-idf.json",lines=True)


In [3]:
#schema
df_idf.dtypes

id                            int64
title                        object
body                         object
answer_count                  int64
comment_count                 int64
creation_date                object
last_activity_date           object
last_editor_display_name     object
owner_display_name           object
owner_user_id               float64
post_type_id                  int64
score                         int64
tags                         object
view_count                    int64
accepted_answer_id          float64
favorite_count              float64
last_edit_date               object
last_editor_user_id         float64
community_owned_date         object
dtype: object

In [4]:
#Preprocess

import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("","",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

df_idf['text'] = df_idf['title'] + df_idf['body']
df_idf['text'] = df_idf['text'].apply(lambda x:pre_process(x))
df_idf['text'][2]

'gradle command line p i m trying to run a shell script with gradle i currently have something like this p pre code def test project tasks create test exec commandline bash c bash c my file dir script sh code pre p the problem is that i cannot run this script because i have spaces in my dir name i have tried everything e g p pre code commandline bash c bash c my file dir script sh tokenize commandline bash c bash c my file dir script sh commandline bash c new stringbuilder append bash append c my file dir script sh commandline bash c bash c my file dir script sh file dir file c my file dir script sh commandline bash c bash dir getabsolutepath code pre p im using windows bit and if i use a path without spaces the script runs perfectly therefore the only issue as i can see is how gradle handles spaces p '

In [7]:
#Build vocab
from sklearn.feature_extraction.text import CountVectorizer
import re
posts = df_idf['text'].tolist()
cv=CountVectorizer(max_df=0.85,stop_words=None, max_features=1000)
word_count_vector=cv.fit_transform(posts)

In [8]:
list(cv.vocabulary_.keys())[:10]

['private',
 'struct',
 'can',
 'it',
 'be',
 'done',
 'have',
 'public',
 'class',
 'that']

In [9]:
#Compute tf-idf
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [19]:
#Create test set

test_idf=pd.read_json("data/stackoverflow-data-idf.json",lines=True)
test_idf['text'] = test_idf['title'] + test_idf['body']
test_idf['text'] =test_idf['text'].apply(lambda x:pre_process(x))
docs_test=test_idf['text'].tolist()

In [21]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [22]:
#Extract top 10 keywords using tf-idf
feature_names=cv.get_feature_names()
doc=docs_test[0]

tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
sorted_items=sort_coo(tf_idf_vector.tocoo())
keywords=extract_topn_from_vector(feature_names,sorted_items,10)
for k in keywords:
    print (k, keywords[k])

struct 0.735
private 0.308
public 0.246
contains 0.219
don 0.166
be 0.158
it 0.154
can 0.141
stream 0.14
types 0.132
