## 4-1.3 Logistic Regression Example 2

### Word2Vec Feature Example

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from tqdm import tqdm

# Directory that holds the files of the "word2vec-nlp-tutorial" Kaggle competition.
default_path='~/.kaggle/competitions/word2vec-nlp-tutorial/'

# Load the labeled training reviews: tab-separated, header in the first row.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside the text are kept literally.
train = pd.read_csv(default_path + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [37]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess(review, remove_stopwords=True):
    """Clean one raw review and return it as a single space-joined string.

    Steps: strip HTML markup, replace every non-ASCII-letter character with a
    space, lowercase, split on whitespace, and optionally drop English stop
    words.

    Args:
        review: raw review text, possibly containing HTML tags.
        remove_stopwords: when True (the default, matching the previous
            behaviour) English stop words are removed; pass False to keep them.

    Returns:
        The cleaned review as lowercase words separated by single spaces.
        An input without any letters yields the empty string.
    """
    # 1. Remove HTML tags.
    html_filtered_review = BeautifulSoup(review, "html5lib").get_text()

    # 2. Replace everything that is not an English letter with a space.
    non_alphabet_filtered_review = re.sub("[^a-zA-Z]", " ", html_filtered_review)

    # 3. Lowercase and split on whitespace into a list of words.
    processed_words = non_alphabet_filtered_review.lower().split()

    # 4. Optionally drop stop words.
    if remove_stopwords:
        stops = _english_stopwords()
        processed_words = [w for w in processed_words if w not in stops]

    # 5. Join the remaining words back into one string.
    return ' '.join(processed_words)

In [3]:
# Run every training review through the cleaning pipeline.
# (The call previously read `prprocess`, which raises NameError on a fresh kernel.)
clean_train_reviews = [preprocess(review) for review in tqdm(train['review'])]

100%|██████████| 25000/25000 [00:25<00:00, 988.39it/s]


In [4]:
# Sanity check: number of cleaned training reviews.
len(clean_train_reviews)

25000

In [5]:
# Split each cleaned review on whitespace so every review becomes a list of tokens.
sentences = [cleaned.split() for cleaned in clean_train_reviews]

In [6]:
# Hyper-parameters for the Word2Vec model.
num_features = 300     # dimensionality of each word vector
min_word_count = 40    # minimum number of occurrences for a word to be kept
num_workers = 4        # number of threads to run in parallel
context = 10           # size of the context window
downsampling = 1e-3    # down-sampling setting for frequent words

In [7]:
import logging
# Show INFO-level log records, each prefixed with a timestamp and its level.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [8]:
from gensim.models import word2vec

# Train the Word2Vec embedding on the tokenized training reviews,
# using the hyper-parameters defined above.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

2018-10-13 20:45:08,310 : INFO : 'pattern' package not found; tag filters are not available for English
2018-10-13 20:45:08,313 : INFO : collecting all words and their counts
2018-10-13 20:45:08,313 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-13 20:45:08,456 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2018-10-13 20:45:08,602 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2018-10-13 20:45:08,676 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2018-10-13 20:45:08,676 : INFO : Loading a fresh vocabulary
2018-10-13 20:45:08,698 : INFO : min_count=40 retains 8160 unique words (11% of original 74065, drops 65905)
2018-10-13 20:45:08,699 : INFO : min_count=40 leaves 2627273 word corpus (87% of original 2988089, drops 360816)
2018-10-13 20:45:08,711 : INFO : deleting the raw counts dictionary of 74065 items
2018-10-13 20:45:08,

In [48]:
def get_features(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one review.

    Args:
        words: iterable of string tokens.
        model: trained embedding model whose keyed vectors are available as
            ``model.wv`` (supporting ``token in model.wv`` and
            ``model.wv[token]``).
        num_features: dimensionality of the word vectors.

    Returns:
        A 1-D array of length ``num_features`` holding the mean of the vectors
        of all tokens found in the vocabulary. If no token is in the
        vocabulary (or ``words`` is empty) a zero vector is returned instead
        of the NaN vector that a 0/0 division would produce.
    """
    # Look words up directly in the keyed vectors: this avoids rebuilding a
    # set of the whole vocabulary on every call and the deprecated model[w].
    word_vectors = model.wv

    # Start from zeros and accumulate the vectors of known words.
    feature_vector = np.zeros((num_features,), dtype=np.float32)
    num_words = 0
    for w in words:
        if w in word_vectors:
            num_words += 1
            feature_vector = np.add(feature_vector, word_vectors[w])

    # Nothing to average: keep the zero vector rather than dividing by zero.
    if num_words == 0:
        return feature_vector

    # Divide the sum by the number of contributing words to get the mean.
    return np.divide(feature_vector, num_words)

In [28]:
def get_dataset(reviews, model, num_features):
    """Build the feature matrix for a collection of tokenized reviews.

    Every review (a list of words) is turned into one feature vector by
    ``get_features``; the vectors are stacked into a 2-D NumPy array with one
    row per review. A progress bar is shown while iterating.
    """
    averaged_vectors = [
        get_features(tokens, model, num_features) for tokens in tqdm(reviews)
    ]
    return np.stack(averaged_vectors)

In [29]:
# Turn every tokenized training review into its averaged word-vector features.
trainDataVecs = get_dataset(sentences, model, num_features)

  app.launch_new_instance()
100%|██████████| 25000/25000 [00:18<00:00, 1366.58it/s]


In [30]:
from sklearn.model_selection import train_test_split
import numpy as np

# Features: the averaged review vectors. Labels: the 'sentiment' column.
X = trainDataVecs
sentiments = list(train['sentiment'])
y = np.array(sentiments)

# Hold out 20% of the labeled data for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# class_weight='balanced' weights classes inversely to their frequency in y_train.
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [32]:
from sklearn import metrics

# Hard class predictions feed precision/recall/F1; the scores in column 1 of
# predict_proba (the second entry of lgs.classes_) feed the ROC curve.
predicted = lgs.predict(X_test)
positive_scores = lgs.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
auc = metrics.auc(fpr, tpr)

# Compute every metric on the held-out split first, then report them together.
accuracy = lgs.score(X_test, y_test)
precision = metrics.precision_score(y_test, predicted)
recall = metrics.recall_score(y_test, predicted)
f1 = metrics.f1_score(y_test, predicted)

print("------------")
print("Accuracy: %f" % accuracy)
print("Precision: %f" % precision)
print("Recall: %f" % recall)
print("F1-Score: %f" % f1)
print("AUC: %f" % auc)

------------
Accuracy: 0.864200
Precision: 0.858535
Recall: 0.874553
F1-Score: 0.866470
AUC: 0.933482


In [None]:
%%bash

# Extract the test set next to the other competition files.
# $HOME mirrors the '~' used by default_path in the Python cells, so the cell
# no longer depends on one specific user account. -n never overwrites files
# that already exist, so re-running the cell is harmless and does not prompt.
DATA_DIR="$HOME/.kaggle/competitions/word2vec-nlp-tutorial"
unzip -n "$DATA_DIR/testData.tsv.zip" -d "$DATA_DIR"

In [33]:
# Load the test reviews with the same TSV settings used for the training file.
test = pd.read_csv(default_path + 'testData.tsv', header=0, delimiter='\t', quoting=3)

In [34]:
# Peek at the first five rows of the test frame.
test.head(5)

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [40]:
# Clean the test reviews with the same preprocessing applied to the training data.
clean_test_reviews = [preprocess(raw_review) for raw_review in tqdm(test['review'])]

100%|██████████| 25000/25000 [00:27<00:00, 904.02it/s]


In [41]:
# Sanity check: number of cleaned test reviews.
len(clean_test_reviews)

25000

In [42]:
# Split each cleaned test review on whitespace into a list of tokens.
test_sentences = [cleaned.split() for cleaned in clean_test_reviews]

In [49]:
# Build the test feature matrix with the Word2Vec model trained on the training reviews.
testDataVecs = get_dataset(test_sentences, model, num_features)

  app.launch_new_instance()
100%|██████████| 25000/25000 [00:18<00:00, 1376.00it/s]


In [50]:
# Display the test feature matrix for a quick visual check.
testDataVecs

array([[ 1.26484662e-01,  4.01385501e-02, -2.06145033e-01, ...,
        -1.97365489e-02,  1.94497228e-01, -1.91502362e-01],
       [ 9.94577110e-02,  3.17749441e-01,  1.28265128e-01, ...,
        -2.12954119e-01,  4.31397259e-01, -1.44890577e-01],
       [ 1.67861152e-02,  1.21207416e-01, -8.08910206e-02, ...,
        -1.00061119e-01,  3.62079114e-01, -2.54240304e-01],
       ...,
       [ 8.71552229e-02,  6.84049204e-02, -1.18286842e-02, ...,
         6.34482205e-02,  2.15148360e-01, -5.95017225e-02],
       [ 1.07426822e-01,  1.45983487e-01, -1.58126671e-02, ...,
        -1.25091240e-01,  2.62662977e-01, -1.41854733e-04],
       [ 9.48106572e-02,  1.41753005e-02, -5.92297241e-02, ...,
        -1.83753625e-01,  4.88906741e-01, -5.64651936e-02]], dtype=float32)

In [51]:
# Predict a sentiment label for every test review with the fitted classifier.
test_predicted = lgs.predict(testDataVecs)

In [52]:
# Pair each test id with its predicted sentiment.
answer_dataset = pd.DataFrame({'id': test['id'], 'sentiment': test_predicted})

In [54]:
# Write the predictions to CSV without the index column.
# quoting=3 is csv.QUOTE_NONE: the ids were read with quoting disabled and so
# already carry their literal quote characters, which are written unchanged.
answer_dataset.to_csv('lgs_answer.csv', index=False, quoting=3)