## 4.1.3 Logistic Regression Example with Word2Vec

### Word2Vec Feature Example

In [8]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from tqdm import tqdm #TODO2: regarding library usage, this cell appears to use libraries that were not used in the earlier code (preprocessing, etc.)

In [2]:
# Directory the prediction CSV is written to at the end of the notebook.
DATA_OUT_PATH = './data_out/'
# Directory that holds labeledTrainData.tsv and testData.tsv.
DEFAULT_PATH = '~/.kaggle/competitions/word2vec-nlp-tutorial/' #TODO1: change the directory location

In [3]:
# Load the labelled training reviews: tab-separated, first row is the header,
# quoting=3 (the value of csv.QUOTE_NONE) so quote characters are kept as data.
train = pd.read_csv(DEFAULT_PATH + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [4]:
# Lazily filled cache so the stopword corpus is read once, not once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopword set, loading it only on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess(review):
    """Turn one raw review string into a cleaned, space-joined token string.

    Steps, in order: strip HTML markup with BeautifulSoup (html5lib parser),
    replace every character outside a-z/A-Z with a space, lower-case,
    split on whitespace, and drop NLTK English stopwords.

    Args:
        review: raw review text, possibly containing HTML.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    html_filtered_review = BeautifulSoup(review, "html5lib").get_text()
    non_alphabet_filtered_review = re.sub("[^a-zA-Z]", " ", html_filtered_review)
    processed_words = non_alphabet_filtered_review.lower().split()

    # Reuse the cached set instead of rebuilding it for every review.
    stops = _english_stopwords()
    kept_words = [w for w in processed_words if w not in stops]

    return ' '.join(kept_words)

In [5]:
# Clean every training review; tqdm wraps the column to show progress.
clean_train_reviews = [preprocess(raw_review) for raw_review in tqdm(train['review'])]

100%|██████████| 25000/25000 [00:34<00:00, 734.18it/s]


In [9]:
# Sanity check: number of cleaned training reviews.
len(clean_train_reviews)

25000

In [14]:
# Word2Vec takes a list of token lists, so split each cleaned review on whitespace.
sentences = [cleaned_review.split() for cleaned_review in clean_train_reviews]

In [15]:
# Word2Vec hyperparameters; each is forwarded to the Word2Vec constructor below.
num_features = 300    # passed as `size`; also the length of each review feature vector
min_word_count = 40   # passed as `min_count`
num_workers = 4       # passed as `workers`
context = 10          # passed as `window`
downsampling = 1e-3  # passed as `sample`

In [16]:
import logging

# Log at INFO level with a "timestamp : level : message" layout.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [17]:
from gensim.models import word2vec

# Train Word2Vec on the tokenized training reviews using the hyperparameters
# defined in the earlier configuration cell.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

2018-11-18 17:37:15,082 : INFO : 'pattern' package not found; tag filters are not available for English
2018-11-18 17:37:15,087 : INFO : collecting all words and their counts
2018-11-18 17:37:15,088 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-18 17:37:15,386 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2018-11-18 17:37:15,678 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2018-11-18 17:37:15,827 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2018-11-18 17:37:15,827 : INFO : Loading a fresh vocabulary
2018-11-18 17:37:15,992 : INFO : effective_min_count=40 retains 8160 unique words (11% of original 74065, drops 65905)
2018-11-18 17:37:15,993 : INFO : effective_min_count=40 leaves 2627273 word corpus (87% of original 2988089, drops 360816)
2018-11-18 17:37:16,019 : INFO : deleting the raw counts dictionary of 74065 items


In [18]:
def get_features(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one review.

    Args:
        words: iterable of token strings for a single review.
        model: trained model exposing ``model.wv.index2word`` (the vocabulary)
            and ``model.wv[word]`` (the vector for a word).
        num_features: length of the returned vector; expected to match the
            length of the model's word vectors.

    Returns:
        A float32 array of shape ``(num_features,)`` holding the mean of the
        vectors of the words found in the vocabulary. When none of the words
        is in the vocabulary (including an empty review) the zero vector is
        returned instead of dividing by zero and producing NaNs.
    """
    feature_vector = np.zeros((num_features), dtype=np.float32)

    num_words = 0
    index2word_set = set(model.wv.index2word)

    for w in words:
        if w in index2word_set:
            num_words += 1
            # Look the vector up through model.wv; indexing the model object
            # directly is the deprecated gensim access path.
            feature_vector = np.add(feature_vector, model.wv[w])

    # Nothing to average: keep the zero vector rather than emitting NaNs,
    # which downstream estimators cannot handle.
    if num_words == 0:
        return feature_vector

    return np.divide(feature_vector, num_words)

In [19]:
def get_dataset(reviews, model, num_features):
    """Build the feature matrix for a collection of tokenized reviews.

    Each review is converted with ``get_features`` (progress is shown via
    tqdm) and the resulting vectors are stacked with ``np.stack``, giving
    one row per review.
    """
    per_review_vectors = [
        get_features(tokens, model, num_features) for tokens in tqdm(reviews)
    ]
    return np.stack(per_review_vectors)

In [20]:
# Feature matrix for the training reviews: one averaged word-vector row per review.
trainDataVecs = get_dataset(sentences, model, num_features)

  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 25000/25000 [00:36<00:00, 687.70it/s]


In [21]:
from sklearn.model_selection import train_test_split
import numpy as np

# Features are the averaged review vectors; labels come from the 'sentiment' column.
sentiments = list(train['sentiment'])
X, y = trainDataVecs, np.array(sentiments)

# Hold out 20% of the labelled data for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# fit() is the cell's last expression, so the notebook displays the fitted estimator.
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [23]:
from sklearn import metrics

# Hard predictions feed precision/recall/F1; the second predict_proba column
# is used as the score for the ROC curve.
predicted = lgs.predict(X_test)
positive_scores = lgs.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
auc = metrics.auc(fpr, tpr)

accuracy = lgs.score(X_test, y_test)
precision = metrics.precision_score(y_test, predicted)
recall = metrics.recall_score(y_test, predicted)
f1 = metrics.f1_score(y_test, predicted)

print("------------")
print("Accuracy: %f" % accuracy)
print("Precision: %f" % precision)
print("Recall: %f" % recall)
print("F1-Score: %f" % f1)
print("AUC: %f" % auc)

------------
Accuracy: 0.862400
Precision: 0.858037
Recall: 0.870981
F1-Score: 0.864460
AUC: 0.933789


In [24]:
# Load the unlabelled test reviews with the same read options as the training file.
test = pd.read_csv(DEFAULT_PATH + 'testData.tsv', header=0, delimiter='\t', quoting=3)

In [25]:
# Preview the first five rows of the test set.
test.head(5)

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [26]:
# Apply the same cleaning used for the training data to every test review.
clean_test_reviews = [preprocess(raw_review) for raw_review in tqdm(test['review'])]

100%|██████████| 25000/25000 [00:34<00:00, 727.32it/s]


In [27]:
# Tokenize each cleaned test review by splitting on whitespace.
test_sentences = [cleaned_review.split() for cleaned_review in clean_test_reviews]

In [None]:
# Featurize the test reviews with the same model and vector size used for the training data.
testDataVecs = get_dataset(test_sentences, model, num_features)

  # Remove the CWD from sys.path while we load stuff.
 65%|██████▍   | 16202/25000 [00:22<00:11, 733.71it/s]

In [None]:
# Predict a sentiment label for every test review with the fitted classifier.
test_predicted = lgs.predict(testDataVecs)

In [None]:
# Pair each test id with its predicted sentiment in a two-column frame.
answer_dataset = pd.DataFrame({'id': test['id'], 'sentiment': test_predicted})

In [None]:
# Create the output directory if it is missing. exist_ok=True makes the call
# idempotent and avoids the check-then-create race of a separate exists() test.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Write the predictions without the index; quoting=3 (the value of
# csv.QUOTE_NONE) means no quoting is added on output.
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv', index=False, quoting=3)