In [93]:
import os
import json
import csv
import time
import datetime
import gzip
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
import sqlite3
import MeCab
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from bs4 import BeautifulSoup

from IPython.display import clear_output

In [131]:
# Open the SQLite database file and create a cursor for the cells below.
conn = sqlite3.connect('./db/test.sqlite3')
c = conn.cursor()

# インデックスの作成

In [145]:
def create_ncode_index(cursor):
    '''
    Create the index ``ncodeindex`` on ``details(ncode)``.

    ``IF NOT EXISTS`` makes the call idempotent, so re-running the cell does
    not fail with "index ncodeindex already exists".

    Args:
        sqlite3.Cursor cursor: cursor of sqlite3
    '''
    cursor.execute("CREATE INDEX IF NOT EXISTS ncodeindex ON details(ncode)")

# ポイント予測

In [102]:
def count_noun_number(mecab, text):
    '''
    Count the distinct noun lines in the tagger output for ``text``.

    Args:
        mecab: tagger object whose ``parse(str)`` returns one analysed token per line
        text: value to analyse; it is converted with ``str()`` first
    Return:
        int: number of unique output lines whose last whitespace-separated
             field contains "名詞" (noun)
    '''
    parsed = mecab.parse(str(text))
    noun_lines = set()
    for line in parsed.splitlines():
        fields = line.split()
        # A blank line has no fields. Skipping it explicitly replaces the former
        # bare ``except: pass``, which also hid every unrelated error.
        if fields and "名詞" in fields[-1]:
            noun_lines.add(line)
    return len(noun_lines)

In [103]:
def preprocessing(detail_df):
    '''
    Add the hand-made feature columns to ``detail_df``.

    The columns are written into ``detail_df`` itself and the same object is
    returned.

    Made features:
        title_length: length of title
        story_length: length of story
        text_length: length of text
        keyword_number: number of keywords (space-separated)
        noun_proportion_in_text: number of distinct noun lines in text per
            text length; 0.0 when the text is empty
    '''
    mecab = MeCab.Tagger("-Ochasen")

    def noun_proportion(value):
        text = str(value)
        # An empty text has length 0: return 0.0 instead of dividing by zero.
        if not text:
            return 0.0
        return count_noun_number(mecab, text) / len(text)

    for column in ['title', 'story', 'text']:
        detail_df[column + '_length'] = detail_df[column].apply(lambda x: len(str(x)))
    detail_df['keyword_number'] = detail_df['keyword'].apply(lambda x: len(str(x).split(' ')))
    detail_df['noun_proportion_in_text'] = detail_df.text.apply(noun_proportion)
    return detail_df

In [104]:
def point_prediction(url, detail_df):
    '''
    Send every column of the preprocessed dataframe to the prediction API.

    Args:
        str url: url of point prediction api
        pandas.DataFrame detail_df: dataframe containing all features of item
    Return:
        the value stored under the 'prediction' key of the API's JSON response
    Raises:
        requests.exceptions.HTTPError: when the API answers with a 4xx/5xx status
    '''
    detail_df = preprocessing(detail_df)

    headers = {'Content-Type': 'application/json'}
    data = {column: list(detail_df[column]) for column in detail_df.columns}
    # NOTE(review): the payload is serialised here and again by ``json=``, so the
    # API receives a JSON-encoded string. Left unchanged in case the server
    # relies on that — confirm against the API implementation.
    data = json.dumps(data)
    r_post = requests.post(url, headers=headers, json=data)
    # Surface HTTP failures directly instead of an opaque JSONDecodeError
    # raised while decoding an error page.
    r_post.raise_for_status()

    return r_post.json()['prediction']

In [105]:
def register_predicted_point(url, conn, cursor, mode, ncodes=None, test=True):
    '''
    Predict points through the API and store them in ``details.predict_point``.

    Args:
        str url: url of point prediction api
        sqlite3.Connection conn: connection of sqlite3
        sqlite3.Cursor cursor: cursor of sqlite3 used for the UPDATE statements
        str mode: all (target all records whose predict_point is Nan) or part (specify records by ncode)
        list<str> ncodes: specify ncode when use part mode
        bool test: in all mode, limit the selection to 10 records when true
    Raises:
        Exception: when mode is invalid, or mode is part and ncodes is not a list
    '''
    if mode not in ['all', 'part']:
        raise Exception('Argument mode shoud be all or part.')
    if mode == 'part' and not isinstance(ncodes, list):
        raise Exception('Argument ncodes should be list of string.')

    if mode == 'all':
        sql = "SELECT * FROM details WHERE predict_point='Nan'"
        if test:
            sql += " LIMIT 10"
        detail_df = pd.read_sql_query(sql=sql, con=conn)
    else:
        # Bind the ncodes as parameters instead of formatting them into the SQL
        # text, so quotes inside a value cannot alter the statement.
        # SQLite caps the number of bound variables per statement.
        placeholders = ', '.join('?' for _ in ncodes)
        detail_df = pd.read_sql_query(
            sql="SELECT * FROM details WHERE ncode IN ({})".format(placeholders),
            con=conn,
            params=[str(ncode) for ncode in ncodes],
        )

    if detail_df.empty:
        # Nothing selected: skip the API call and leave the table untouched.
        return

    predicted_points = point_prediction(url, detail_df)

    # Use the cursor that was passed in (the original relied on a global ``c``).
    cursor.executemany(
        "UPDATE details SET predict_point=? WHERE ncode=?",
        list(zip(predicted_points, detail_df.ncode)),
    )
    # Without a commit the updates are discarded when the connection closes.
    conn.commit()

In [86]:
# Endpoint passed to register_predicted_point below as the point prediction API url.
# url = 'http://localhost:5000/predict'
url = 'http://localhost:3033/predict'

In [467]:
# mode='all' targets rows whose predict_point is 'Nan'; test is left at its default (True).
register_predicted_point(url, conn, c, mode='all')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# 特徴量登録

In [154]:
def extract_features(url, texts):
    '''
    POST the texts to the feature extraction API and return its predictions.

    Args:
        str url: url of feature extraction api
        list<str> texts: texts of narou novel
    Return:
        list<float> features: feature vectors of item, read from the
            'prediction' key of the JSON response
    '''
    response = requests.post(
        url,
        headers={'Content-Type': 'application/json'},
        json={'texts': texts},
    )
    payload = response.json()
    return payload['prediction']

In [155]:
def generate_data(ncodes, features):
    '''
    Yield one document dict for the 'features' index per (ncode, feature) pair.

    Pairing stops at the shorter of the two inputs.
    '''
    pairs = zip(ncodes, features)
    for code, vector in pairs:
        action = dict(_index='features', ncode=code, feature=vector)
        yield action

In [156]:
def register_features_to_elasticsearch(url, ncodes, texts, h_dim=64, es_host=None):
    '''
    Extract feature vectors for ``texts`` and bulk-index them into 'features'.

    Args:
        str url: url of feature extraction api
        list<str> ncodes: ncode of each text, in the same order as texts
        list<str> texts: texts sent to the feature extraction api
        h_dim: size of feature vector (``dims`` of the dense_vector mapping)
        str es_host: host name of elasticsearch; when None the module-level
            ``host`` variable is used, as before
    '''
    if len(ncodes) == 0:
        # Empty batch: avoid a pointless API round trip and bulk request.
        return

    features = extract_features(url, texts)

    client = Elasticsearch(host if es_host is None else es_host)

    mappings = {
        'properties': {
            'ncode': {'type': 'text'},
            'feature': {'type': 'dense_vector', 'dims': h_dim}
        }
    }

    # Create the index with the vector mapping only on first use.
    if not client.indices.exists(index='features'):
        client.indices.create(index='features', body={ 'mappings': mappings })

    bulk(client, generate_data(ncodes, features))

In [157]:
def register_all_features_to_elasticsearch(conn, host, url, h_dim=64, test=True):
    '''
    Read rows from the details table and register their text features.

    Args:
        sqlite3.Connection conn: connection of sqlite3
        str host: host name of elasticsearch (accepted but not referenced in this body)
        str url: url of feature extraction api
        h_dim: size of feature vector, forwarded to register_features_to_elasticsearch
        test: when equal to True read the first 30 rows, otherwise the rows
            whose predict_point is 1
    '''
    query = (
        "SELECT * FROM details LIMIT 30"
        if test == True
        else "SELECT * FROM details WHERE predict_point=1"
    )
    rows = pd.read_sql_query(sql=query, con=conn)

    register_features_to_elasticsearch(
        url,
        list(rows.ncode),
        list(rows.text),
        h_dim=h_dim,
    )

In [148]:
# Elasticsearch host, feature extraction API endpoint and feature vector size.
host = 'localhost:9200'
# url = 'http://localhost:5000/predict'
url = 'http://localhost:3032/predict'
h_dim = 64

In [141]:
# test is left at its default (True), so only the first 30 rows of details are read.
register_all_features_to_elasticsearch(conn, host, url)

# 類似文書検索

In [487]:
# NOTE(review): these two constants are not referenced anywhere in the visible
# notebook — confirm whether they are still needed.
ELASTIC_SEARCH_CLIENT = 'elasticsearch'
FEATURE_EXTRACTION_URL = 'bertserver'

def search_similar_text(url, query_ncode, query_text, es_client=None):
    '''
    Return the ncodes of the indexed items most similar to the query text.

    Args:
        str url: url of feature extraction api
        str query_ncode: ncode of query item (not used by the search itself)
        str query_text: text of query item; a list is accepted, but only its
            first element is used as the query
        es_client: Elasticsearch client to search with; when None the
            module-level ``es`` is used, as before
    Return:
        list<str>: ncode of up to 10 hits, in the order Elasticsearch returned them
    Raises:
        Exception: when query_text is neither list nor str
    '''
    if not isinstance(query_text, (list, str)):
        raise Exception('query_text should be list or str.')
    if isinstance(query_text, str):
        query_text = [query_text]

    query_feature = extract_features(url, query_text)[0]

    # NOTE(review): ``es`` is not defined in the visible notebook; pass
    # ``es_client`` or define ``es`` before calling.
    client = es if es_client is None else es_client

    res = client.search(index='features', body={
      "query": {
        "script_score": {
          "query": {
            "match_all": {}
          },
          "script": {
            "source": "cosineSimilarity(params.query_vec, doc['feature']) + 1.0", # Elasticsearch does not allow negative scores
            "params": {
              "query_vec": query_feature
            }
          }
        }
      }
    })

    # Iterate over the hits actually returned: indexing 0..9 unconditionally
    # raised IndexError whenever the index held fewer than 10 documents.
    return [hit['_source']['ncode'] for hit in res['hits']['hits'][:10]]

In [488]:
# search_similar_text takes (url, query_ncode, query_text); the ncode is not used
# by the search, so None is passed for this ad-hoc query text.
search_similar_text(url, None, 'testtext')

['N6600GI',
 'N2238GK',
 'N3325BS',
 'N9514FJ',
 'N1762DR',
 'N8231FM',
 'N9831GA',
 'N1703GK',
 'N0137GI',
 'N2241GK']

# スクレイピング&SQLiteとElasticsearchへの登録

In [17]:
# Connection and cursor used by the scraping cells, plus the novel API endpoint.
conn = sqlite3.connect('./db/test.sqlite3')
cur = conn.cursor()
narou_api_url="https://api.syosetu.com/novelapi/api/" 

### details

In [10]:
def date_to_timestamp(date):
    '''
    Convert a "%Y-%m-%d %H:%M:%S" string into an integer timestamp.

    The parsed datetime carries no timezone, so ``timestamp()`` interprets it
    in the local timezone of the machine running the notebook.
    '''
    parsed = datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
    seconds = parsed.timestamp()
    return int(seconds)

In [11]:
def df_preprocessing(df):
    '''
    Normalise the column types of a details dataframe.

    Drops the 'allcount' and 'gensaku' columns when present and rows that are
    entirely missing, casts the text columns to str, converts the date columns
    to integer timestamps, casts every other column to int, and finally sets
    'predict_point' and 'text' to the placeholder string 'Nan'.
    '''
    text_columns = ('title', 'ncode', 'userid', 'writer', 'story', 'keyword')
    date_columns = ('general_firstup', 'general_lastup', 'novelupdated_at', 'updated_at')

    cleaned = df.drop(['allcount', 'gensaku'], axis=1, errors='ignore')
    cleaned = cleaned.dropna(how='all')

    for name in cleaned.columns:
        values = cleaned[name]
        if name in text_columns:
            converted = values.astype(str)
        elif name in date_columns:
            converted = values.map(str).map(date_to_timestamp)
        else:
            converted = values.astype(int)
        cleaned[name] = converted

    # Placeholders, overwriting any existing columns of the same name.
    cleaned['predict_point'] = 'Nan'
    cleaned['text'] = 'Nan'

    return cleaned

In [72]:
def _request_gzipped_json(api_url, payload, retries=10, timeout=30):
    '''GET the API with retries and return the gzip-decompressed body as text.'''
    last_error = None
    for _ in range(retries):
        try:
            content = requests.get(api_url, params=payload, timeout=timeout).content
        except requests.exceptions.RequestException as error:
            print('Connection Error')
            last_error = error
            continue
        return gzip.decompress(content).decode('utf-8')
    # Every attempt failed: raise rather than continue with a stale response.
    raise last_error


def scraping_details(conn, cursor, narou_api_url, test=True):
    '''
    Download novel details updated since the newest row stored in ``details``.

    Args:
        sqlite3.Connection conn: connection of sqlite3 (not used in this body)
        sqlite3.Cursor cursor: cursor used to read the newest general_lastup
        str narou_api_url: url of the novel api
        bool test: when True, stop after the first page
    Return:
        pandas.DataFrame: preprocessed detail rows; nothing is written to the
            database here (the former ``to_sql`` call was commented out)
    Raises:
        requests.exceptions.RequestException: when a request still fails after 10 attempts
    '''
    from io import StringIO

    page_size = 500
    interval = 1

    cursor.execute('SELECT general_lastup FROM details ORDER BY general_lastup DESC LIMIT 1')
    register_latest = str(cursor.fetchone()[0])
    now = str(int(datetime.datetime.now().timestamp()))

    # First request only asks for the number of matching novels.
    payload = {'out': 'json', 'gzip': 5, 'of': 'n', 'lim': 1, 'lastup': register_latest+"-"+now}
    allcount = json.loads(_request_gzipped_json(narou_api_url, payload))[0]["allcount"]

    # Round up so the last, partially filled page is fetched too. Floor
    # division skipped it, and fetched nothing at all when allcount < 500.
    all_queue_cnt = (allcount + page_size - 1) // page_size

    frames = []
    lastup = now
    for _ in range(all_queue_cnt):
        # NOTE(review): the upper bound is the last row's own timestamp, so rows
        # sharing it may be returned again on the next page — confirm whether
        # duplicates need to be dropped by the caller.
        payload = {'out': 'json', 'gzip': 5,'opt': 'weekly', 'lim': page_size, 'lastup': register_latest+"-"+str(lastup)}
        r = _request_gzipped_json(narou_api_url, payload)

        # StringIO: passing literal JSON text to read_json is deprecated in recent pandas.
        df_temp = pd.read_json(StringIO(r))
        df_temp = df_temp.drop(0)  # the first element only carries allcount
        if df_temp.empty:
            break

        last_general_lastup = df_temp.iloc[-1]["general_lastup"]
        lastup = int(datetime.datetime.strptime(last_general_lastup, "%Y-%m-%d %H:%M:%S").timestamp())

        frames.append(df_preprocessing(df_temp))

        time.sleep(interval)

        if test is True:
            break

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a dataframe inside the loop.
    return pd.concat(frames, axis=0)

### text

In [73]:
def make_bs_obj(url, timeout=30):
    '''
    Download ``url`` and parse it with BeautifulSoup's "html.parser".

    Args:
        str url: page to download
        timeout: seconds passed to ``urlopen`` so a stalled connection cannot hang forever
    Return:
        BeautifulSoup: parsed document
    '''
    # The context manager closes the HTTP response once it has been parsed.
    with urlopen(url, timeout=timeout) as html:
        return BeautifulSoup(html, "html.parser")

In [74]:
def get_main_text(bs_obj):
    '''
    Join the text of every <p> inside the first div with id "novel_honbun".

    Each paragraph is followed by a blank line ("\\n\\n").
    '''
    body_div = bs_obj.findAll("div",{"id":"novel_honbun"})[0]
    paragraphs = body_div.findAll("p")
    return "".join(paragraph.get_text() + "\n\n" for paragraph in paragraphs)

In [76]:
def _fetch_bs_obj(url, retries=10):
    '''Fetch and parse ``url`` with retries; return None when every attempt fails.'''
    for _ in range(retries):
        try:
            return make_bs_obj(url)
        except Exception:
            print('Connection Error')
    return None


def scraping_texts(ncodes, test=True):
    '''
    Download the main text for each ncode.

    If the novel's top page lists chapters, the text of the first listed
    chapter is taken; otherwise the text is read from the top page itself.

    Args:
        list<str> ncodes: ncodes to download
        bool test: when true, stop after 101 novels have been processed
    Return:
        (list<str>, list<str>): ncodes that were processed and their texts,
            in matching order; ncodes whose pages could not be fetched are omitted
    '''
    texts = []
    processed_ncodes = []
    interval = 0.1
    cnt = 0

    for ncode in ncodes:
        if cnt % 100 == 0:
            print(cnt)

        time.sleep(interval)
        bs_obj = _fetch_bs_obj('https://ncode.syosetu.com/' + ncode + '/')
        if bs_obj is None:
            # All retries failed. Skip this ncode instead of reusing the page
            # of the previous novel, which attached the wrong text to it.
            continue

        url_list = ["https://ncode.syosetu.com" + a_bs_obj.find("a").attrs["href"] for a_bs_obj in bs_obj.findAll("dl", {"class": "novel_sublist2"})]

        if url_list:
            time.sleep(interval)
            bs_obj = _fetch_bs_obj(url_list[0])
            if bs_obj is None:
                continue

        texts.append(get_main_text(bs_obj))
        processed_ncodes.append(ncode)
        cnt += 1

        if test and cnt == 101:
            break

    return processed_ncodes, texts

In [77]:
# test is left at its default (True), so scraping_details stops after the first page.
detail_df = scraping_details(conn, cur, narou_api_url)

In [114]:
# Request predictions for the scraped rows from the point prediction API.
predicted_point = point_prediction('http://localhost:3033/predict', detail_df)

In [116]:
# Replace the 'Nan' placeholder column with the API's predictions.
detail_df['predict_point'] = predicted_point 

In [118]:
# Keep only the rows whose predicted point equals 1.
target_detail_df = detail_df[detail_df.predict_point == 1]

In [170]:
url = 'http://localhost:3032/predict'
ncodes = list(target_detail_df.ncode)
texts = list(target_detail_df.text)

# Register the rows in batches of 64. The loop count previously used `// 6`
# while the slices used 64, so most iterations sent empty batches to the API.
batch_size = 64
for start in range(0, len(ncodes), batch_size):
    stop = start + batch_size
    register_features_to_elasticsearch(url, ncodes[start:stop], texts[start:stop])

ConnectionError: HTTPConnectionPool(host='localhost', port=3032): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fcd903f1f50>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [124]:
# for ncode, text in zip(ncodes, texts):
#     target_detail_df.loc[target_detail_df['ncode']==ncode, 'text'] = text
# target_text_detail_df = target_detail_df[target_detail_df['text'] != 'Nan']

In [163]:
# TODO: make extract_features able to process its input in mini-batches
extract_features(url,  texts[:10])

ConnectionError: HTTPConnectionPool(host='localhost', port=3032): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fcd90dfe350>: Failed to establish a new connection: [Errno 61] Connection refused'))