In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [2]:
def load_query_batch(cursor,query,options,batch_size):
    """Execute ``query`` on ``cursor`` and lazily yield its rows in chunks.

    This is a generator: nothing is executed until the first item is
    requested.

    Parameters
    ----------
    cursor : object exposing ``execute`` and ``fetchmany`` (DB-API style).
    query : SQL text handed to ``cursor.execute``.
    options : query parameters. When falsy (``None``, empty tuple/dict)
        the query is executed without a parameter argument.
    batch_size : value forwarded to every ``cursor.fetchmany`` call.

    Yields
    ------
    Each truthy result of ``cursor.fetchmany(batch_size)``, in order.
    Iteration stops at the first falsy (empty) result.
    """
    # Only pass a parameter argument when there is something to pass.
    execute_args = (query, options) if options else (query,)
    cursor.execute(*execute_args)

    while True:
        rows = cursor.fetchmany(batch_size)
        if not rows:
            return
        yield rows

In [3]:
def load_data_for_text_processing(cursor,query,options,batch_size):
    """Yield one pandas Series of text per batch of query results.

    Batches come from ``load_query_batch`` (same arguments). The width of
    the first row of each batch decides how the batch is converted:

    * more than one column: missing values become ``''`` and the columns
      of every row are joined with ``'. '`` into a single string;
    * otherwise: the first element of every row is taken and missing
      values become ``''``.

    Yields
    ------
    pandas.Series with one entry per row of the batch.
    """
    def _join_columns(row):
        # Values must already be strings here; NaN/None were replaced by ''.
        return '. '.join(row)

    for rows in load_query_batch(cursor, query, options, batch_size):
        has_several_columns = len(rows[0]) > 1
        if not has_several_columns:
            first_values = [row[0] for row in rows]
            yield pd.Series(first_values).fillna('')
            continue

        frame = pd.DataFrame(rows).fillna('')
        yield frame.apply(_join_columns, axis=1)

In [4]:
import os

import psycopg2

# Connection settings. Each value can be overridden through the matching
# libpq environment variable, so real credentials never have to be typed
# into (and committed with) the notebook. The fallbacks are the values this
# notebook originally used for the test database.
database = os.environ.get('PGDATABASE', 'crawler_service_test')
user = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD', 'postgres')
host = os.environ.get('PGHOST', '192.168.3.56')

# A single connection and cursor are reused by the query cells that follow.
con = psycopg2.connect(database=database, user=user, password=password, host=host)
cursor = con.cursor()

#### Building the vectorizer from the whole data

In [5]:
# Two text columns per row; the query is capped at 200 rows.
query = 'select all_page_text,home_page_text from crawler.webpage_texts limit 200'

# Lazy generator: the query is only executed once iteration starts.
data_yielder = load_data_for_text_processing(
    cursor,
    query,
    options=None,
    batch_size=50,
)

In [7]:
from analytics_workbench.process_text import ProcessText

text_processor = ProcessText()

# Feed the batch generator to the text processor to build the vectorizer.
# NOTE(review): presumably this is what populates text_processor.vocabulary,
# which later cells read -- confirm against ProcessText.
text_processor.generate_vectorizer_iter_list(
    data_yielder,
    vectorizer_type='Count',
    synonym_loc=None,
    stem_type=False,
    phrase_generation=False,
    stop_words_loc=None,
    lower=True,
    n_gram_range=(1,2),
    max_df=0.9,
    min_df=0.1
)

In [8]:
# Display the number of entries in the vocabulary held by text_processor.
len(text_processor.vocabulary)

2079

In [9]:
# Run the same query again on the shared cursor and fetch every row at once
# (no batching), keeping the raw rows in `res`.
cursor.execute(query)
res = cursor.fetchall()

In [12]:
# Turn the fetched rows into one string per row (missing values -> '',
# columns joined with '. ') and pass them through the text processor.
out_test, vocabulary = text_processor.get_matrix_test(
    pd.DataFrame(res)
    .fillna('')
    .apply(lambda row: '. '.join(row), axis=1)
)

In [13]:
# Display the shape of the matrix returned by get_matrix_test.
out_test.shape

(200, 2079)

#### Generating topics

The function below yields data matrices that can be fed into the model iteratively.

In [20]:
def get_data_matrix_iter(text_processor,cursor,query,options,batch_size):
    """Lazily yield one transformed batch per batch of query results.

    Each text Series produced by ``load_data_for_text_processing`` (called
    with ``cursor``, ``query``, ``options`` and ``batch_size``) is passed to
    ``text_processor.get_matrix_test``; only element ``0`` of that call's
    result is yielded.
    """
    text_batches = load_data_for_text_processing(cursor, query, options, batch_size)
    for text_series in text_batches:
        transformed = text_processor.get_matrix_test(text_series)
        # Keep only the first element of the returned pair.
        yield transformed[0]

In [15]:
# Instantiate the project's Unsupervised helper with its default arguments;
# it is used below for the incremental LDA fit.
from analytics_workbench.unsupervised_learning import Unsupervised
unsup_obj = Unsupervised()

In [23]:
# Fresh generator of per-batch matrices (50 rows per fetch, no query params).
matrix_iter = get_data_matrix_iter(
    text_processor,
    cursor,
    query,
    options=None,
    batch_size=50,
)

# Fit the topic model batch by batch, asking for 10 topics.
lda_obj = unsup_obj.lda_partial_fit(
    matrix_iter,
    text_processor.vocabulary,
    n_topics=10
)

Topic #0:
site, solutions, cr, r, web, commerce, internet, solution, erp, f
Topic #1:
r, v, one, us, p, size, style, amp, contact, g
Topic #2:
web, development, design, services, website, us, contact, site, marketing, web design
Topic #3:
website, design, marketing, us, services, business, web, contact, digital, elegant
Topic #4:
site, web, commerce, r, cr, solution, internet, solutions, services, plus
Topic #5:
us, software, services, data, marketing, contact, development, business, web, online
Topic #6:
development, services, us, payment, application, software, business, web, mobile, ecommerce
Topic #7:
check, page, make, sure, make sure, website, try, network, computer, web
Topic #8:
b, us, solutions, contact, management, global, group, commerce, business, contact us
Topic #9:
r, amp, site, plus, cr, solutions, web, commerce, f, sp
()


In [25]:
# Close the database connection opened earlier in the notebook.
con.close()