In [1]:
import os
import json
import csv
import time
import datetime
import gzip
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
import sqlite3
import MeCab
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from bs4 import BeautifulSoup
import MySQLdb

from IPython.display import clear_output

# MySQL接続

In [2]:
def get_connector_and_cursor():
    conn = MySQLdb.connect(
        host = '0.0.0.0',
        port = 3306,
        user = 'root',
        password = 'root',
        database = 'maindb'
    )
    cursor = conn.cursor()
    return conn, cursor

# ポイント予測

In [3]:
def count_noun_number(mecab, text):
    text = str(text)
    count = []
    for line in mecab.parse(text).splitlines():
        try:
            if "名詞" in line.split()[-1]:
                count.append(line)
        except:
            pass
    return len(set(count))

In [4]:
def preprocessing(detail_df):
    '''
    Made features: 
        title_length: length of title
        story_length: length of story
        text_length: length of text
        keyword_number: number of keywords
        noun_proportion_in_text: number of nouns in text per text length
    '''
    mecab = MeCab.Tagger("-Ochasen")
    
    for column in ['title', 'story', 'text']:
        detail_df[column + '_length'] = detail_df[column].apply(lambda x: len(str(x)))
    detail_df['keyword_number'] = detail_df['keyword'].apply(lambda x: len(str(x).split(' ')))
    detail_df['noun_proportion_in_text'] = detail_df.text.apply(lambda x: count_noun_number(mecab, str(x)) / len(str(x)))
    return detail_df

In [5]:
def point_prediction(url, detail_df):
    '''
    Args:
        str url: url of point prediction api
        pandas.DataFrame detail_df: dataframe containing all features of item
    '''
    detail_df = preprocessing(detail_df)
    
    headers = {'Content-Type': 'application/json'}
    data = {}
    data = {column: list(detail_df[column]) for column in list(detail_df.columns)}
    data = json.dumps(data)
    r_post = requests.post(url, headers=headers, json=data)

    predicted_points = r_post.json()['prediction']
    return predicted_points

# 特徴量抽出

In [6]:
def extract_features(url, texts):
    '''
    Args:
        str url: url of feature extraction api
        list<str> texts: texts of narou novel
    Return:
        list<float> features: feature vectors of item
    '''   
    headers = {'Content-Type': 'application/json'}
    data = {'texts': texts}
    r_post = requests.post(url, headers=headers, json=data)
    features = r_post.json()['prediction']
    return features

In [7]:
def generate_data(ncodes, features):
    for ncode, feature in zip(ncodes, features):
        yield {
            '_index': 'features',
            'ncode': ncode,
            'feature': feature
        }

In [8]:
def register_features_to_elasticsearch(host, url, ncodes, texts, h_dim=64):
    '''
    Args: 
        str host: host name of elasticsearch
        str url: url of feature extraction api
        list<str> ncodes: ncodes to register
        texts<str> texts: texts to extract features
        h_dim: size of feature vector
    '''    

    features = extract_features(url, texts)
    
    client = Elasticsearch(host)
    
    mappings = {
        'properties': {
            'ncode': {'type': 'text'},
            'feature': {'type': 'dense_vector', 'dims': h_dim}
        }
    }
    
    if not client.indices.exists(index='features'):
        client.indices.create(index='features', body={ 'mappings': mappings })
    
    bulk(client, generate_data(ncodes, features))

# 全データポイント予測

In [None]:
TEST = True
MINIBATCH_SIZE = 10
ELASTICSEARCH_HOST_NAME = 'localhost:9200'
FEATURE_EXTRACTION_URL = 'http://localhost:3032/predict'
POINT_PREDICTION_URL = 'http://localhost:3033/predict'

def register_all_data():
    conn, cursor = get_connector_and_cursor()
    detail_df = pd.read_sql_query("SELECT * FROM details WHERE predict_point='Nan'", conn)
    predicted_point = point_prediction(POINT_PREDICTION_URL, detail_df)
    detail_df['predict_point'] = predicted_point
    target_detail_df = detail_df[(detail_df['predict_point']==1) & (detail_df['global_point']==0)]
    if len(target_detail_df) != 0:      
        ncodes = list(target_detail_df.ncode)
        texts = list(target_detail_df.text)

        for i in range(len(ncodes) // MINIBATCH_SIZE + 1):
            register_features_to_elasticsearch(ELASTICSEARCH_HOST_NAME, FEATURE_EXTRACTION_URL, ncodes[i*MINIBATCH_SIZE:(i+1)*MINIBATCH_SIZE], texts[i*MINIBATCH_SIZE:(i+1)*MINIBATCH_SIZE])
            if TEST:
                break

    print('{} data is inserted to Elasticsearch.'.format(len(target_detail_df)))
    conn.commit()
    conn.close()

# スクレイピング

In [9]:
def df_preprocessing(df):
    df = df.drop(['allcount', 'gensaku'], axis=1, errors='ignore')
    df = df.dropna(how='all')
    
    date_to_timestamp = lambda date: int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp())

    for column in df.columns:
        if column in ['title', 'ncode', 'userid', 'writer', 'story', 'keyword']:
            df[column] = df[column].astype(str)
        elif column in['general_firstup', 'general_lastup', 'novelupdated_at', 'updated_at']:
            df[column] = df[column].map(str).map(date_to_timestamp)
        else:
            df[column] = df[column].astype(int)
            
    df['predict_point'] = 'Nan'
    df['text'] = 'Nan'
    
    return df

In [32]:
def scraping_details(conn, cursor, narou_api_url, mode='middle', test=True):
    if mode not in ['middle', 'first']:
        raise Exception('Argument mode should be middle or first.')
    
    if mode is 'middle':
        cursor.execute('SELECT general_lastup FROM details ORDER BY general_lastup DESC LIMIT 1')
        sql_result = cursor.fetchone()
        register_latest = str(sql_result[0]) if sql_result is not None else "1073779200"
    elif mode is 'first':
        register_latest = "1073779200"
    
    now = str(int(datetime.datetime.now().timestamp()))

    payload = {'out': 'json', 'gzip': 5, 'of': 'n', 'lim': 1, 'lastup': register_latest+"-"+now}
    res = requests.get(narou_api_url, params=payload).content
    r =  gzip.decompress(res).decode("utf-8") 
    allcount = json.loads(r)[0]["allcount"]
    
    interval = 1
    detail_df = pd.DataFrame()

    lastup = now
    all_queue_cnt = (allcount // 500)

    for i in range(all_queue_cnt):
        payload = {'out': 'json', 'gzip': 5,'opt': 'weekly', 'lim':500, 'lastup': register_latest+"-"+str(lastup)}
        
        c = 0 # Avoid infinite loop
        while c < 10:
            try:
                res = requests.get(narou_api_url, params=payload, timeout=30).content
                break
            except:
                print('Connection Error')
                c += 1       

        r = gzip.decompress(res).decode('utf-8')

        df_temp = pd.read_json(r)
        df_temp = df_temp.drop(0)

        last_general_lastup = df_temp.iloc[-1]["general_lastup"]
        lastup = datetime.datetime.strptime(last_general_lastup, "%Y-%m-%d %H:%M:%S").timestamp()
        lastup = int(lastup)

        df_temp = df_preprocessing(df_temp)
        detail_df = pd.concat([detail_df, df_temp], axis=0)

        time.sleep(interval)
        
        if test is True and i==1:
            break
        
    return detail_df

In [33]:
def make_bs_obj(url):
    html = urlopen(url)
    return BeautifulSoup(html,"html.parser")

In [34]:
def get_main_text(bs_obj):
    text = ""
    text_htmls = bs_obj.findAll("div",{"id":"novel_honbun"})[0].findAll("p")

    for text_html in text_htmls:
        text = text + text_html.get_text() + "\n\n"

    return text

In [35]:
def scraping_texts(ncodes, test=True):
    texts = []
    processed_ncodes = []
    interval = 0.1
    cnt = 0

    for ncode in ncodes:
        print(cnt) if cnt % 100 == 0 else None

        time.sleep(interval)
        url = 'https://ncode.syosetu.com/' + ncode + '/'
        c = 0 # Avoid infinite loop
        while c < 10:
            try:
                bs_obj = make_bs_obj(url)
                break
            except:
                print('Connection Error')
                c += 1
                
        url_list = ["https://ncode.syosetu.com" + a_bs_obj.find("a").attrs["href"] for a_bs_obj in bs_obj.findAll("dl", {"class": "novel_sublist2"})]
        
        if len(url_list) == 0:
            text = get_main_text(bs_obj)
        else:
            time.sleep(interval)
            bs_obj = make_bs_obj(url_list[0])
            text = get_main_text(bs_obj)

        texts.append(text)
        processed_ncodes.append(ncode)
        cnt += 1
        
        if test == True and cnt == 10:
            break
    
    return processed_ncodes, texts

In [36]:
TEST = True
MINIBATCH_SIZE = 10
NAROU_API_URL = 'https://api.syosetu.com/novelapi/api/'
ELASTICSEARCH_HOST_NAME = 'localhost:9200'
FEATURE_EXTRACTION_URL = 'http://localhost:3032/predict'
POINT_PREDICTION_URL = 'http://localhost:3033/predict'

def register_scraped_data():
    conn, cursor = get_connector_and_cursor()
    # Scraping details and texts
    detail_df = scraping_details(conn, cursor, NAROU_API_URL, mode='first', test=TEST)
    ncodes, texts = scraping_texts(detail_df.ncode, TEST)
    for ncode, text in zip(ncodes, texts):
        detail_df.loc[detail_df['ncode'] == ncode, 'text'] = text
    predicted_point = point_prediction(POINT_PREDICTION_URL, detail_df)
    detail_df['predict_point'] = predicted_point
    
    # Insert scraped data to database
    cursor.execute('SHOW columns FROM details')
    columns_of_details = [column[0] for column in cursor.fetchall()]
    details_data_tmp = detail_df[columns_of_details]
    details_data = [tuple(details_data_tmp.iloc[i]) for i in range(len(details_data_tmp))]
    cursor.executemany("INSERT INTO details VALUES ({})".format(('%s, '*len(columns_of_details))[:-2]), details_data)
    
    # Insert scraped data to elasticsearch
    target_detail_df = detail_df[(detail_df['predict_point'] == 1) & (detail_df['global_point'] == 0)]
    if len(target_detail_df) != 0:      
        ncodes = list(target_detail_df.ncode)
        texts = list(target_detail_df.text)

        for i in range(len(ncodes) // MINIBATCH_SIZE + 1):
            register_features_to_elasticsearch(ELASTICSEARCH_HOST_NAME, FEATURE_EXTRACTION_URL, ncodes[i*MINIBATCH_SIZE:(i+1)*MINIBATCH_SIZE], texts[i*MINIBATCH_SIZE:(i+1)*MINIBATCH_SIZE])
            if TEST:
                break
    
    print('{} data is inserted to Elasticsearch.'.format(len(target_detail_df)))
    conn.commit()
    conn.close()

In [27]:
register_scraped_data()

0
76 data is inserted to Elasticsearch.


# インデックスの作成

In [32]:
def create_ncode_index(cursor):
    cursor.execute("CREATE INDEX ncodeindex ON details(ncode)")

# ポイント予測

In [33]:
def count_noun_number(mecab, text):
    text = str(text)
    count = []
    for line in mecab.parse(text).splitlines():
        try:
            if "名詞" in line.split()[-1]:
                count.append(line)
        except:
            pass
    return len(set(count))

In [34]:
def preprocessing(detail_df):
    '''
    Made features: 
        title_length: length of title
        story_length: length of story
        text_length: length of text
        keyword_number: number of keywords
        noun_proportion_in_text: number of nouns in text per text length
    '''
    mecab = MeCab.Tagger("-Ochasen")
    
    for column in ['title', 'story', 'text']:
        detail_df[column + '_length'] = detail_df[column].apply(lambda x: len(str(x)))
    detail_df['keyword_number'] = detail_df['keyword'].apply(lambda x: len(str(x).split(' ')))
    detail_df['noun_proportion_in_text'] = detail_df.text.apply(lambda x: count_noun_number(mecab, str(x)) / len(str(x)))
    return detail_df

In [35]:
def point_prediction(url, detail_df):
    '''
    Args:
        str url: url of point prediction api
        pandas.DataFrame detail_df: dataframe containing all features of item
    '''
    detail_df = preprocessing(detail_df)
    
    headers = {'Content-Type': 'application/json'}
    data = {}
    data = {column: list(detail_df[column]) for column in list(detail_df.columns)}
    data = json.dumps(data)
    r_post = requests.post(url, headers=headers, json=data)

    predicted_points = r_post.json()['prediction']
    return predicted_points

In [36]:
def register_predicted_point(url, conn, cursor, mode, ncodes=None, test=True):
    '''
    Args:
        str url: url of point prediction api
        sqlite3.Connection conn: connection of sqlite3
        sqlite3.Cursor cursor: cursor of sqlite3
        str mode: all (target all records whose predict_point is Nan) or part (specify records by ncode)
        list<str> ncodes: specify ncode when use part mode
    '''   
    
    if mode not in ['all', 'part']:
        raise Exception('Argument mode shoud be all or part.')
    if mode == 'part' and type(ncodes) is not list:
        raise Exception('Argument ncodes should be list of string.')
    
    if mode == 'all':
        if test == True:
            detail_df = pd.read_sql_query(sql="SELECT * FROM details WHERE predict_point='Nan' LIMIT 10", con=conn)
        else:
            detail_df = pd.read_sql_query(sql="SELECT * FROM details WHERE predict_point='Nan'", con=conn)
        ncodes = list(detail_df.ncode)
    elif mode == 'part':
        ncodes_str = ', '.join(map(str, list(["'{}'".format(ncode) for ncode in ncodes])))
        detail_df = pd.read_sql_query(sql="SELECT * FROM details WHERE ncode IN ({})".format(ncodes_str), con=conn)

    predicted_points = point_prediction(url, detail_df)
    
    ncodes = list(detail_df.ncode)
    for ncode, predicted_point in zip(ncodes, predicted_points):
        c.execute("UPDATE details SET predict_point={} WHERE ncode='{}'".format(predicted_point, ncode))

In [37]:
# url = 'http://localhost:5000/predict'
url = 'http://localhost:3033/predict'

In [39]:
register_predicted_point(url, conn, cursor, mode='all')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# 特徴量登録

In [76]:
def extract_features(url, texts):
    '''
    Args:
        str url: url of feature extraction api
        list<str> texts: texts of narou novel
    Return:
        list<float> features: feature vectors of item
    '''   
    headers = {'Content-Type': 'application/json'}
    data = {'texts': texts}
    r_post = requests.post(url, headers=headers, json=data)
    features = r_post.json()['prediction']
    return features

In [77]:
def generate_data(ncodes, features):
    for ncode, feature in zip(ncodes, features):
        yield {
            '_index': 'features',
            'ncode': ncode,
            'feature': feature
        }

In [88]:
def register_features_to_elasticsearch(host, url, ncodes, texts, h_dim=64):
    '''
    Args: 
        str host: host name of elasticsearch
        str url: url of feature extraction api
        list<str> ncodes: ncodes to register
        texts<str> texts: texts to extract features
        h_dim: size of feature vector
    '''    

    features = extract_features(url, texts)
    
    client = Elasticsearch(host)
    
    mappings = {
        'properties': {
            'ncode': {'type': 'text'},
            'feature': {'type': 'dense_vector', 'dims': h_dim}
        }
    }
    
    if not client.indices.exists(index='features'):
        client.indices.create(index='features', body={ 'mappings': mappings })
    
    bulk(client, generate_data(ncodes, features))

In [89]:
def register_all_features_to_elasticsearch(conn, host, url, h_dim=64, test=True):
    
    if test == True:
        detail_df = pd.read_sql_query(sql="SELECT * FROM details LIMIT 30", con=conn)
    else:
        detail_df = pd.read_sql_query(sql="SELECT * FROM details WHERE predict_point=1", con=conn)
    
    ncodes = list(detail_df.ncode)
    texts = list(detail_df.text)
    
    register_features_to_elasticsearch(host, url, ncodes, texts, h_dim=h_dim)

In [148]:
host = 'localhost:9200'
# url = 'http://localhost:5000/predict'
url = 'http://localhost:3032/predict'
h_dim = 64

In [141]:
register_all_features_to_elasticsearch(conn, host, url)

# 類似文書検索

In [487]:
def search_similar_text(url, query_text):
    '''
    Args:
        str url: url of feature extraction api
        str query_text: query of similar text search
    '''
    if type(query_text) is not list and type(query_text) is not str:
        raise Exception('query_text should be list or str.')
    if type(query_text) is str:
        query_text = [query_text]
        
    query_feature = extract_features(url, query_text)[0]
    
    res = es.search(index='features', body={
      "query": {
        "script_score": {
          "query": {
            "match_all": {}
          },
          "script": {
            "source": "cosineSimilarity(params.query_vec, doc['feature']) + 1.0", # Elasticsearch does not allow negative scores
            "params": {
              "query_vec": query_feature
            }
          }
        }
      }
    })
    
    recommend_ncodes = []
    for i in range(10):
        ncode = res['hits']['hits'][i]['_source']['ncode']
        recommend_ncodes.append(ncode)
    
    return recommend_ncodes

In [488]:
search_similar_text(url, 'testtext')

['N6600GI',
 'N2238GK',
 'N3325BS',
 'N9514FJ',
 'N1762DR',
 'N8231FM',
 'N9831GA',
 'N1703GK',
 'N0137GI',
 'N2241GK']