In [1]:
import os
import re
import time
from contextlib import contextmanager
from operator import itemgetter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib
%matplotlib inline

import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_log_error, r2_score

from google.cloud import storage

import re, MeCab
from glob import glob
import mojimoji
import warnings
warnings.simplefilter("ignore")

from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Activation, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential, load_model
from keras.utils import np_utils
from multiprocessing.pool import ThreadPool

#環境変数,
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../auth/My First Project.json"
bucket_name = "pj_horidasimono"
prefix="dataset/train/ElectricalAppliance"
#最大表示行数の指定（ここでは50行を指定）
pd.set_option('display.max_rows', 200)
#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 200)


In [2]:
def road_data_from_gcs(bucket_name, prefix):
    client = storage.Client()
    blobs = client.list_blobs(bucket_name, prefix=prefix)
    df = pd.DataFrame()
    for blob in blobs:
        bucket = client.get_bucket(bucket_name)
        r = storage.Blob(blob.name, bucket)
        content = r.download_as_string()
        df = df.append(pd.read_json(content))
        print(f"read file {blob.name}...")

    df = df.drop_duplicates(subset="url")
    df = df.reset_index(drop=True)
    return df

In [3]:
tagger = MeCab.Tagger("-Owakati")
def make_wakati(sentence):
    # MeCabで分かち書き
    sentence = tagger.parse(sentence)
    sentence = mojimoji.zen_to_han(sentence)
    # 半角全角英数字除去
    #sentence = re.sub(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+', " ", sentence)
    # 記号もろもろ除去
    sentence = re.sub(r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—♬◉ᴗ͈ˬ●★☆⭐️⭕⚡⚠o①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮♡⭐︎〇◎◆♦▼◇△□(：〜～＋=)／*&^%$#@!~`)♪ᴖ◡ᴖ{}［］…\[\]\"\'\”\’:;<>?＜＞〔〕＼〈〉？、､。｡・,\./『』【】｢｣「」→←○《》≪≫\n\u3000⭕]+', "", sentence)
    # 絵文字除去
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           "]+", flags=re.UNICODE)
    sentence = emoji_pattern.sub(r'', sentence)
    # スペースで区切って形態素の配列へ
    #wakati = sentence.split(" ")
    # 空の要素は削除
    #wakati = list(filter(("").__ne__, wakati))
    return sentence

def title_torkenize(sentence):
    sentence = mojimoji.zen_to_han(sentence)
    sentence = re.sub("[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒♬◉ᴗ͈ˬ—●★☆⭐️⭕⚡⚠o①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮♡⭐︎〇◎◆♦▼◇△□(：〜～＋=)／*&^%$#@!~`)♪ᴖ◡ᴖ{}［］…\[\]\"\'\”\’:;<>?＜＞〔〕＼〈〉？、､。｡・,\./『』【】｢｣「」→←○《》≪≫\n\u3000]", " ", sentence)
    sentence = re.sub("[あ-ん]", " ", sentence)
    sentence = re.sub("( |　)+", " ", sentence)
    sentence = sentence.lower()
    #〇〇様専用を除く
    sentence = re.sub("[^ ]*専用", "", sentence)
    sentence = re.sub("[^ ]*様", "", sentence)
    #1文字のアルファベットを除く
    sentence = re.sub(" [a-z]{1}[^(a-z)]", " ", sentence)
    # 絵文字除去
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           "]+", flags=re.UNICODE)
    sentence = emoji_pattern.sub(r'', sentence)
    sentence = sentence.strip()

    return sentence

def preprocess(df):
    df["price"] = df["price"].str.replace(r"\D", "").astype(np.float)
    
    #列ズレを修正
    pattern = re.compile(r"^(?!.*(傷や汚れあり|全体的に状態が悪い|やや傷や汚れあり|未使用に近い|目立った傷や汚れなし|新品、未使用)).+$")
    invalid = df["status"].str.match(pattern)

    df.loc[invalid, "description"] = df.loc[invalid, "description"] + "\n" + df.loc[invalid, "status"]
    df.loc[invalid, "status"]      = df.loc[invalid, "shipping"]
    df.loc[invalid, "shipping"]    = df.loc[invalid, "method"]
    df.loc[invalid, "method"]      = df.loc[invalid, "region"]
    df.loc[invalid, "period"]      = "未定"
    
    df["title"] = df["title"] + " " + df["sub_category_1"] + " " + df["sub_category_2"] + " " + df["brand"]
    #df["text"]  = df["title"] + " " + df["description"]

    df = df.drop(columns=["sub_category_1", "sub_category_2", "brand"])
    
    status_dict = {'新品、未使用': "best",
                   '未使用に近い': "Very Good",
                   '目立った傷や汚れなし': "good",
                   '傷や汚れあり': "Poor",
                   'やや傷や汚れあり': "very poor",
                   '全体的に状態が悪い': "worst"
                  }
    
    #配送負担をラベルエンコーディング
    shipping_dict = {'送料込み(出品者負担)': 0, '着払い(購入者負担)': 1}

    df["status"] = df["status"].map(status_dict)
    df["shipping"] = df["shipping"].map(shipping_dict)
    
    #トークナイズ
    df["title"] = df["title"].apply(title_torkenize)
    df["description"] = df["description"].apply(make_wakati)
    
    #不要列削除
    df = df.drop(columns=["url", "seller", "rating", "method", "region", "period", "recent_comment", "timestamp"])
    return df

In [8]:
cd code-analysis

/root/user/project/pj_horidasimono/code-analysis


In [9]:
from model_ridge import ModelRidge

In [10]:
#df_ = road_data_from_gcs(bucket_name, prefix)
df_ = pd.read_pickle("dataset.pickle")
#df_.to_pickle("dataset.pickle")

In [11]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')
    
def on_field(f: str, *vec):
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

def to_records(df: pd.DataFrame):
    return df.to_dict(orient='records')

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

vectorizer = make_union(
            on_field("title", Tfidf(max_features=100000, token_pattern="\w+")),
            on_field("description", Tfidf(max_features=500000, token_pattern="\w+", ngram_range=(1, 2))),
            on_field(['shipping', 'status'],
                 FunctionTransformer(to_records, validate=False), DictVectorizer()))

with timer("preprocess"):
    df = preprocess(df_.copy())
    X = df.drop(columns="price")
    y = df["price"]
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
    X_train = vectorizer.fit_transform(X_train).astype(np.float32)
    X_valid = vectorizer.transform(X_valid).astype(np.float32)
    Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_valid]]
    
    xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2

with timer("fit and predict"):
    preds = []
    for X_train, X_valid in xs:
        model = ModelRidge()
        model.fit(X_train, y_train)
        pred = np.expm1(model.predict(X_valid))
        preds.append(pred)


[preprocess] done in 158 s
[fit and predict] done in 879 s


In [22]:
np.array(preds)[:,0:10]

array([[ 3921.00073941, 10722.54178525,  3015.72987121,  6307.71814086,
         5837.41511324,  4124.91457758,  3318.73857373, 10388.81354219,
         9276.01570235,  4403.37482128],
       [ 5033.77274055,  8672.69665585,  3291.10274213,  7334.94270278,
         6168.25854082,  4247.94608789,  4581.66158334,  7324.05772586,
         9005.50807571,  4207.85018395],
       [ 3925.23032733, 10715.00034578,  3016.61798409,  6306.63922998,
         5834.43025584,  4127.43910608,  3317.39577228, 10391.9404367 ,
         9283.17776722,  4396.11391128],
       [ 5033.77423575,  8672.70156269,  3291.10094743,  7334.93930372,
         6168.25372722,  4247.94592174,  4581.66296965,  7324.04847944,
         9005.50674726,  4207.85583122]])

In [29]:
for pred in np.array(preds):
    print(np.sqrt(mean_squared_log_error(y_valid, pred)))
    print(np.mean(np.abs((y_valid - pred) / y_valid)) * 100)

print(np.sqrt(mean_squared_log_error(y_valid, np.mean(np.array(preds), axis=0))))

0.584205749403832
49.22214812364832
0.4560177175726792
36.77099050851783
0.5842012810989269
49.22075733874581
0.4560177162274387
36.77099165179877
0.48854135389353986


In [None]:
    
va_pred = np.expm1(model.predict(X_valid))
score = np.sqrt(mean_squared_log_error(y_valid, va_pred))
mape = np.mean(np.abs((y_valid - va_pred) / y_valid)) * 100
print("mape: {:.3f}%".format(mape))
print(score)

## 前処理

In [21]:
a = [[1, 2], [3, 4]]
for i, j in a:
    print(i, j)

1 2
3 4


In [38]:
m = Doc2Vec(documents= trainings, dm = 1, size=300, window=8, min_count=10, workers=4)

In [51]:
m.save('doc2vec.model')
#model = models.Doc2Vec.load('doc2vec.model')

In [47]:
m.infer_vector(["宜しく", "どうぞ", "付属", "品", "は", "画像", "が", "全て", "です",  "問題", "なく", "動作", "確認", "済み", "です"]).shape

(300,)

In [None]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

vectorizer = Vectorizer().tfidf_vectorizer(title_feat=100000, description_feat=500000)

with timer("preprocess"):
    df = preprocess(df_.copy())
    X = df.drop(columns="price")
    y = df["price"]
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
    X_train = vectorizer.fit_transform(X_train).astype(np.float32)
    X_valid = vectorizer.transform(X_valid).astype(np.float32)
    Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_valid]]
    
    xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2

with timer("fit and predict"):
    preds = []
    for X_train, X_valid in xs:
        model = ModelRidge()
        model.fit(X_train, y_train)
        pred = np.expm1(model.predict(X_valid))
        preds.append(pred)
