# Movie Recommendation System

### 構成

1. レビューの形態素解析
2. 特徴量行列の作成
3. 事前に作成したモデルで予測

### 準備

In [1]:
#パンダスの準備
import pandas as pd
from pandas import Series,DataFrame

In [2]:
# Set up SQLite: open the connection to the local database file.
# `con` is the notebook-level connection used by sql_to_df() and by the
# token INSERT loop further down.
import sqlite3
con = sqlite3.connect("movies_c.db")

In [3]:
def sql_to_df(sql_query, connection=None, params=None):
    """Run a SQL query and return its result set as a DataFrame.

    Args:
        sql_query: SQL text to execute.
        connection: Database connection to run the query on. When omitted,
            the notebook-level ``con`` connection is used.
        params: Optional values bound to the placeholders in ``sql_query``;
            passed straight through to ``pandas.read_sql``.

    Returns:
        A pandas DataFrame holding the rows produced by the query.
    """
    # Fall back to the shared connection only when the caller gave none, so
    # existing single-argument calls keep working unchanged.
    if connection is None:
        connection = con
    return pd.read_sql(sql_query, connection, params=params)

In [4]:
# MeCab tagger used to tokenise the text; the -d option points it at the
# mecab-ipadic-neologd dictionary directory.
import MeCab
mecab = MeCab.Tagger ('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

## 1. レビューの形態素解析

In [5]:
# Free-text description of the movie the user wants to watch. This is the
# input that gets tokenised and turned into the feature row for the model.
watch = '''
コメディ映画。めちゃくちゃ笑いたい。イギリスっぽいアイロニーを含んだ映画がみたい。
知的な笑いで、パティントンとかそういう映画。
テッドとか下品な映画ではなくて、上品な笑いがいい。
'''

In [6]:
# Tokenise the request text with MeCab and store one row per token in
# watch_cnt_table (num = 1 each) so the counts can be aggregated with SQL.
text = watch
mecab.parse('')  # keeps the parsed string from being garbage-collected
node = mecab.parseToNode(text)
i_id = 0  # id stored with every token of this text

# Bound parameters instead of string formatting: a token that contains a
# quote character can no longer break (or inject into) the statement.
sql_query = '''
    INSERT INTO watch_cnt_table(id, word, num) values(?, ?, 1)
    '''
while node:
    # Surface form of the current token. Tokens with an empty surface are
    # stored as well: the recorded outputs show an empty-word row in both
    # watch_cnt_table and num_words, so they are kept on purpose.
    word = node.surface
    con.execute(sql_query, (i_id, word))
    # Advance to the next token.
    node = node.next

# Incremented once, after the loop: every row above carries id 0.
# NOTE(review): no commit is issued here, so the rows are only guaranteed to
# be visible through this same connection.
i_id += 1

In [7]:
# Load every row of watch_cnt_table into a DataFrame.
sql_query = '''
SELECT * FROM watch_cnt_table
'''
words_df = sql_to_df(sql_query)

In [8]:
words_df

Unnamed: 0,id,word,num
0,0,,1
1,0,コメディ,1
2,0,映画,1
3,0,。,1
4,0,めちゃくちゃ,1
5,0,笑い,1
6,0,たい,1
7,0,。,1
8,0,イギリス,1
9,0,っぽい,1


In [9]:
# Keep only the tokens whose word also exists in num_words, paired with the
# id that num_words assigns to that word: one row per (id, word), ordered by
# id. Tokens with no match in num_words drop out of the result.
sql_query = '''
SELECT num_words.id, watch_cnt_table.word FROM watch_cnt_table, num_words WHERE num_words.word = watch_cnt_table.word GROUP BY num_words.id, watch_cnt_table.word ORDER BY num_words.id
'''
words_df = sql_to_df(sql_query)

In [10]:
words_df

Unnamed: 0,id,word
0,1,。
1,2,、
2,3,が
3,6,は
4,7,て
5,10,を
6,12,で
7,17,な
8,21,
9,27,映画


In [11]:
# Build one pivot query: for every matched vocabulary word, sum its `num`
# values per id into a column named a<vocabulary id>, followed by the id
# column itself.
sum_columns = [
    "SUM(CASE WHEN word = '{word}' THEN num ELSE 0 END) AS a{n}, ".format(
        # Double embedded single quotes (the SQL escape) so a word that
        # contains ' cannot terminate the literal early.
        word=str(word).replace("'", "''"),
        n=word_id,
    )
    # Column 0 of words_df is the vocabulary id, column 1 the word.
    for word_id, word in zip(words_df.iloc[:, 0], words_df.iloc[:, 1])
]
query = 'SELECT ' + ''.join(sum_columns) + 'id FROM watch_cnt_table GROUP BY id'
#print(query)

In [12]:
# Run the generated pivot query; the result has one a<N> column per matched
# word plus the id column.
want_df = sql_to_df(query)

In [13]:
# Drop the id column so that only the a<N> count columns remain.
want_df =  want_df.drop('id', axis=1)

In [14]:
want_df

Unnamed: 0,a1,a2,a3,a6,a7,a10,a12,a17,a21,a27,...,a387,a492,a754,a976,a1079,a2431,a2648,a3463,a3691,a7065
0,5,2,2,1,1,1,2,3,2,4,...,1,1,3,1,1,1,1,1,1,1


In [15]:
# Transpose: the a<N> labels become the index, with the counts in column 0.
want_df = want_df.T

In [16]:
want_df

Unnamed: 0,0
a1,5
a2,2
a3,2
a6,1
a7,1
a10,1
a12,2
a17,3
a21,2
a27,4


## 2. 特徴量行列の作成

In [17]:
# Feature labels 'a0' .. 'a30399', one per column of the feature matrix.
# NOTE(review): 30400 is hard-coded — presumably the number of features the
# saved model expects; confirm against the training notebook.
l = ['a{num}'.format(num=num) for num in range(30400)]

In [18]:
l

['a0',
 'a1',
 'a2',
 'a3',
 'a4',
 'a5',
 'a6',
 'a7',
 'a8',
 'a9',
 'a10',
 'a11',
 'a12',
 'a13',
 'a14',
 'a15',
 'a16',
 'a17',
 'a18',
 'a19',
 'a20',
 'a21',
 'a22',
 'a23',
 'a24',
 'a25',
 'a26',
 'a27',
 'a28',
 'a29',
 'a30',
 'a31',
 'a32',
 'a33',
 'a34',
 'a35',
 'a36',
 'a37',
 'a38',
 'a39',
 'a40',
 'a41',
 'a42',
 'a43',
 'a44',
 'a45',
 'a46',
 'a47',
 'a48',
 'a49',
 'a50',
 'a51',
 'a52',
 'a53',
 'a54',
 'a55',
 'a56',
 'a57',
 'a58',
 'a59',
 'a60',
 'a61',
 'a62',
 'a63',
 'a64',
 'a65',
 'a66',
 'a67',
 'a68',
 'a69',
 'a70',
 'a71',
 'a72',
 'a73',
 'a74',
 'a75',
 'a76',
 'a77',
 'a78',
 'a79',
 'a80',
 'a81',
 'a82',
 'a83',
 'a84',
 'a85',
 'a86',
 'a87',
 'a88',
 'a89',
 'a90',
 'a91',
 'a92',
 'a93',
 'a94',
 'a95',
 'a96',
 'a97',
 'a98',
 'a99',
 'a100',
 'a101',
 'a102',
 'a103',
 'a104',
 'a105',
 'a106',
 'a107',
 'a108',
 'a109',
 'a110',
 'a111',
 'a112',
 'a113',
 'a114',
 'a115',
 'a116',
 'a117',
 'a118',
 'a119',
 'a120',
 'a121',
 'a122',
 'a

In [19]:
# Empty frame (no rows) with one column per feature label in l.
all_df = pd.DataFrame(columns = l)

In [20]:
all_df

Unnamed: 0,a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,...,a30390,a30391,a30392,a30393,a30394,a30395,a30396,a30397,a30398,a30399


In [21]:
# Transpose so the feature labels become the index, ready for an index join.
all_df = all_df.T

In [22]:
all_df

a0
a1
a2
a3
a4
a5
a6
a7
a8
a9
a10


In [23]:
# Index join against the observed counts: every label of all_df is kept, and
# labels that do not occur in want_df come out as NaN.
last_df = all_df.join(want_df)

In [24]:
last_df

Unnamed: 0,0
a0,
a1,5.0
a2,2.0
a3,2.0
a4,
a5,
a6,1.0
a7,1.0
a8,
a9,


In [25]:
# Features that did not occur in the text get a count of 0 instead of NaN.
last_df = last_df.fillna(0)

In [26]:
# Transpose back: a single row with one column per feature label.
all_df = last_df.T

In [27]:
all_df

Unnamed: 0,a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,...,a30390,a30391,a30392,a30393,a30394,a30395,a30396,a30397,a30398,a30399
0,0.0,5.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
# NOTE(review): words_used_df is not defined in any visible cell, and this
# cell's execution count (137) is out of sequence — it looks like a leftover
# from an earlier session; confirm before re-running the notebook top to bottom.
words_used_df.tail(1)

Unnamed: 0,word
31566,￣*


In [28]:
# Call close(): the bare attribute reference `con.close` only evaluates to
# the bound method and leaves the connection open.
# NOTE(review): no commit is issued in the visible cells, so rows inserted
# into watch_cnt_table are presumably discarded here — confirm that is intended.
con.close()

<function Connection.close>

## 3. 事前に作成したモデルで予測

In [29]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

%matplotlib inline

In [30]:
from sklearn import linear_model

In [31]:
# Load the model that was trained and saved beforehand.
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. It stays the first choice so existing environments behave exactly as
# before; the standalone joblib package is the fallback on newer versions.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
logreg = joblib.load("model_dir/model")

In [32]:
# Convert the single-row feature frame to a NumPy array for the model.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and exists on every pandas version.
all_numpyMatrix_data = all_df.values
X_test = all_numpyMatrix_data
# DataFrame view of the same features; not used by the visible cells.
_test_data = DataFrame(X_test)

In [33]:
# Predict on the test data: X_test holds the single feature row built above.
Y_pred = logreg.predict(X_test)

In [34]:
Y_pred

array(['アダムス・ファミリー'], dtype=object)