# Movie Recommendation System

### 構成

1. レビューの形態素解析
2. 特徴量行列の作成
3. 事前に作成したモデルで予測

### 準備

In [1]:
#パンダスの準備
import pandas as pd
from pandas import Series,DataFrame

In [2]:
# Set up SQLite: open the database file that the later queries read from.
import sqlite3
con = sqlite3.connect("movies_c.db")

In [3]:
def sql_to_df(sql_query, connection=None, params=None):
    """Run a SQL query and return the result as a DataFrame.

    Args:
        sql_query: SQL text to execute.
        connection: Database connection to run the query on. When omitted,
            the notebook-level ``con`` is used, which is what the original
            one-argument call did.
        params: Optional values bound to placeholders in ``sql_query``;
            passed straight through to ``pd.read_sql``.

    Returns:
        pandas.DataFrame holding the rows returned by the query.
    """
    if connection is None:
        # Looked up at call time so the function uses whichever connection
        # the notebook currently holds in ``con``.
        connection = con
    return pd.read_sql(sql_query, connection, params=params)

In [4]:
# Build the MeCab tagger used for morphological analysis; the -d option
# points it at the mecab-ipadic-neologd dictionary directory.
import MeCab
mecab = MeCab.Tagger ('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

## 1. レビューの形態素解析

In [5]:
# Free-text request (in Japanese) that gets tokenized below and turned into
# the feature vector for the prediction.
watch = '''
コメディ映画。めちゃくちゃ笑いたい。イギリスっぽいアイロニーを含んだ映画がみたい。
知的な笑いで、パティントンとかそういう映画。
テッドとか下品な映画ではなくて、上品な笑いがいい。
'''

In [6]:
# Empty frame that the tokenization step fills with one row per token
# (columns: id, word, num).
want_df = pd.DataFrame(columns=['id', 'word', 'num'])

In [7]:
# Display the frame; it has no rows yet.
want_df

Unnamed: 0,id,word,num


In [8]:
# Tokenize the request text with MeCab and add one row per token to want_df.
text = watch

# Parse an empty string first: per the original note, this keeps the input
# string from being garbage-collected while the nodes are walked.
mecab.parse('')
node = mecab.parseToNode(text)

# Every token of this text gets the same id; i_id is bumped once after the
# whole text is processed (presumably a per-document id -- TODO confirm).
i_id = 0

# Collect the rows first and build the frame once. DataFrame.append was
# removed in pandas 2.0, and calling it per token copied the frame each time.
# NOTE(review): every node MeCab yields is kept, as before; this presumably
# includes the sentence-boundary nodes with an empty surface.
token_rows = []
while node:
    token_rows.append([i_id, node.surface, 1])
    node = node.next

# The new rows get a fresh 0..n-1 index (the per-token append left every row
# at index 0); the later merge on 'word' does not use the index.
want_df = pd.concat(
    [want_df, pd.DataFrame(token_rows, columns=['id', 'word', 'num'])],
    ignore_index=True,
)

i_id += 1

In [10]:
# Load the whole num_words table into a DataFrame.
sql_query='''
SELECT * FROM num_words
'''
num_words = sql_to_df(sql_query)

In [12]:
# Join the request tokens with num_words on the shared 'word' column
# (default inner join, so only words present in both frames remain).
m = want_df.merge(num_words, on='word')

In [13]:
# Total the 'num' counts per id_y value. id_y is presumably the 'id' column
# contributed by num_words, suffixed by the merge because both frames have
# an 'id' column.
rows_by_id = m.groupby("id_y")
mm = rows_by_id["num"].sum()

In [15]:
# Promote the per-id totals from a Series to a one-column DataFrame.
mmm = mm.to_frame()

In [17]:
# Flip the totals so the ids run along the columns.
want_df = mmm.transpose()

## 2. 特徴量行列の作成

In [19]:
# Column labels for the feature matrix: the integers 0 .. 30399.
l = list(range(0, 30400))

In [21]:
# Empty frame with one column per label in l (no rows).
all_df = pd.DataFrame(columns = l)

In [23]:
# Flip the empty frame so the labels become the row index.
all_df = all_df.transpose()

In [24]:
# Name the index "id_y" so it matches the index name of the per-id totals.
all_df.index.name = "id_y"

In [26]:
# Flip the totals again so the ids sit on the index, ready for the
# index-based join that follows.
want_df = want_df.transpose()

In [27]:
# Left-join the totals onto the full label index; labels without a total
# come out as NaN.
last_df = all_df.join(want_df, how="left")

In [29]:
# Labels that got no total count as zero.
last_df = last_df.fillna(value=0)

In [30]:
# Flip once more: a single 'num' row with one column per label.
all_df = last_df.transpose()

In [31]:
# Display the finished feature row.
all_df

id_y,0,1,2,3,4,5,6,7,8,9,...,30390,30391,30392,30393,30394,30395,30396,30397,30398,30399
num,0.0,5.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Close the SQLite connection. The parentheses are required: a bare
# `con.close` only references the method and leaves the connection open.
con.close()

<function Connection.close>

## 3. 事前に作成したモデルで予測

In [33]:
# Libraries for the prediction step.
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Plotting setup: seaborn 'whitegrid' style.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

In [34]:
from sklearn import linear_model

In [35]:
# sklearn.externals.joblib was removed in scikit-learn 0.23. Try the original
# import first so older environments behave exactly as before, then fall back
# to the standalone joblib package that scikit-learn itself depends on.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Load the pre-trained model.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
logreg = joblib.load("model_dir/model")

In [36]:
# Convert the feature row to a NumPy array for the model.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# array and also works on the older versions.
all_numpyMatrix_data = all_df.values
X_test = all_numpyMatrix_data
_test_data = DataFrame(X_test)

In [37]:
# Predict on the test data with the loaded model.
Y_pred = logreg.predict(X_test)

In [38]:
# Display the prediction.
Y_pred

array(['アダムス・ファミリー'], dtype=object)