In [1]:
import pandas as pd
import numpy as np
# Load the review table and discard every row that has a missing value in
# any column, so the later text processing only ever sees complete rows.
df = pd.read_csv('reviews_Musical_Instruments.csv').dropna()

### 전처리

In [3]:
import re

In [4]:
def find_english(text):
    """Split ``text`` into its runs of ASCII letters.

    Every maximal sequence of the characters a-z / A-Z becomes one token;
    digits, punctuation, whitespace and non-ASCII characters act as
    separators and are dropped. Case is preserved.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the letter-only tokens in their original order (empty
        when ``text`` contains no ASCII letters).
    """
    return [match.group() for match in re.finditer(r'[a-zA-Z]+', text)]

In [7]:
# Tokenize every non-null review into a list of English words; the result
# is a Series of token lists that is later fed to FastText as sentences.
data = df.loc[df['review'].notnull(), 'review'].map(find_english)

In [9]:
def only_english(text):
    """Return ``text`` reduced to its English words, separated by one space.

    The words are whatever ``find_english`` extracts from ``text``; anything
    between them is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The space-joined words (an empty string when there are none).
    """
    words = find_english(text)
    return ' '.join(words)

In [11]:
# Same selection as for `data`, but each review is kept as a single
# space-separated string of English words instead of a token list.
data2 = df.loc[df['review'].notnull(), 'review'].map(only_english)

In [12]:
# Dump the cleaned reviews to a UTF-8 text file, one review per line
# (the entries of `data2` are joined with newlines; no trailing newline).
with open('df2.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(data2))  

### FastText 학습

In [13]:
from gensim.models.fasttext import FastText
from gensim.models.word2vec import Word2Vec

In [14]:
# Create an untrained FastText model with 16-dimensional vectors; the
# feature matrix built later in this notebook uses the same width (16).
# NOTE(review): `size` looks like the gensim 3.x keyword; newer gensim
# releases are reported to call it `vector_size` — confirm against the
# installed version.
model = FastText(size=16)

In [15]:
# Scan the tokenized reviews in `data` to build the model's vocabulary.
# This must run before `model.train`, which reads the corpus counts set here.
model.build_vocab(sentences=data)  

In [16]:
# Train the FastText vectors on the tokenized reviews for 5 passes.
# The corpus size figures are the ones recorded on the model by the
# preceding `build_vocab` call.
model.train(
    sentences=data,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
    epochs=5,
)

In [17]:
# Persist the trained FastText model to disk so it can be reloaded
# without retraining.
model.save('df2.fasttext')  

In [18]:
# Load the saved model only to display it as the cell output; the result
# is not assigned to any name, so this does not change `model`.
FastText.load('df2.fasttext') 

<gensim.models.fasttext.FastText at 0x1a9130e3448>

In [19]:
from gensim.models.fasttext import FastText
# Rebind `model` to the FastText model read back from disk; the feature
# extraction below uses `model.wv` from this object.
model = FastText.load('df2.fasttext')

### 감성분석 전처리

In [27]:
# Keep only the rows that actually have review text before splitting.
df = df.loc[df['review'].notnull()]

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 20% of the reviews for testing. The fixed random_state makes
# the split reproducible across runs.
doc_train, doc_test, y_train, y_test = train_test_split(
    df['review'],       # raw review text
    df['sentiment'],    # target label
    test_size=0.2,
    random_state=42,
)

In [31]:
# Feature matrix for the training subset: one row per document, one column
# per FastText dimension (16, matching FastText(size=16) above).
# At most 1000 documents are used, but never more rows than there are
# training documents — otherwise the surplus rows would stay all-zero and
# the matrix would be longer than the label vector passed to `fit`.
x_train = np.zeros((min(1000, len(doc_train)), 16))

In [32]:
# Fill each row of `x_train` with the mean FastText vector of the English
# words in the corresponding training document.
#
# The number of documents is taken from `x_train` itself rather than from a
# repeated literal, so the loop can never index past the matrix or leave
# rows unfilled if the matrix size is changed.
#
# NOTE: `model` must be the FastText model here. A later cell rebinds
# `model` to the Keras network, so this cell fails if it is re-run after
# that one without reloading the FastText model first.
for i, doc in enumerate(doc_train.iloc[:len(x_train)]):
    vs = [model.wv[word] for word in find_english(doc)]
    # Documents with no English words keep their all-zero row.
    if vs:
        x_train[i] = np.mean(vs, axis=0)

### 모형학습

In [34]:
import tensorflow as tf

In [35]:
# Binary sentiment classifier: one hidden ReLU layer of 16 units followed
# by a single sigmoid output unit.
# NOTE: this rebinds `model`, which until now referred to the FastText
# model; cells that need `model.wv` cannot be re-run after this point
# without reloading the FastText model.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [36]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching
# the single sigmoid output), and accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [37]:
# Train for a single epoch on the prepared document vectors.
# The label slice is sized from `x_train` so features and labels always
# have the same number of rows, instead of repeating a hard-coded count.
model.fit(x_train, y_train.values[:len(x_train)], epochs=1)

Train on 1000 samples


<tensorflow.python.keras.callbacks.History at 0x1a90b49f588>

In [38]:
# Print the layer-by-layer overview of the fitted classifier
# (layer names, output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  272       
_________________________________________________________________
dense_1 (Dense)              multiple                  17        
Total params: 289
Trainable params: 289
Non-trainable params: 0
_________________________________________________________________
