In [11]:
import pandas as pd

train = pd.read_csv('../Datasets/IMDB/labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv('../Datasets/IMDB/testData.tsv', delimiter='\t')

In [12]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [13]:
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [14]:
train.loc[0, ['review']].values

array(["With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it fina

In [15]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [16]:
# len(stopwords.words('english')), len(set(stopwords.words('english')))
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [18]:
def review_to_text(review, remove_stopwords):
    # 去掉 html 标签
    raw_text = BeautifulSoup(review, 'lxml').get_text()
    # 去掉标点符号
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    # 转换小写
    words = letters.lower().split()
    # 如果设置了去除停用词，就把停用词移除
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    
    return words

In [19]:
%%time
X_train = []

for review in train['review']:
    X_train.append(' '.join(review_to_text(review, True)))
    
y_train = train['sentiment']
    
X_test = []

for review in test['review']:
    X_test.append(' '.join(review_to_text(review, True)))
    

CPU times: user 37.5 s, sys: 1.95 s, total: 39.5 s
Wall time: 39.7 s


In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [21]:
%%time
pip_count = Pipeline(
    [('count_vec', CountVectorizer(analyzer='word')),
     ('mnb', MultinomialNB())])

pip_tfidf = Pipeline(
    [('tfidf_vec', TfidfVectorizer(analyzer='word')),
     ('mnb', MultinomialNB())])

params_count = {
    'count_vec__binary': [True, False],
    'count_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10.0]
}

params_tfidf = {
    'tfidf_vec__binary': [True, False],
    'tfidf_vec__ngram_range': [(1, 1), (1, 2)],
    'mnb__alpha': [0.1, 1.0, 10.0]
}

gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)

CPU times: user 308 µs, sys: 5 µs, total: 313 µs
Wall time: 323 µs


In [22]:
%%time
gs_count.fit(X_train, y_train)

print(gs_count.best_score_)
print(gs_count.best_params_)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.0min finished


0.88216
{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}
CPU times: user 18 s, sys: 1.93 s, total: 19.9 s
Wall time: 3min 14s


In [24]:
%%time
count_y_predict = gs_count.predict(X_test)

CPU times: user 7.29 s, sys: 194 ms, total: 7.49 s
Wall time: 7.56 s


In [25]:
%%time
gs_tfidf.fit(X_train, y_train)

print(gs_tfidf.best_score_)
print(gs_tfidf.best_params_)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  3.7min finished


0.88712
{'mnb__alpha': 0.1, 'tfidf_vec__binary': True, 'tfidf_vec__ngram_range': (1, 2)}
CPU times: user 21.8 s, sys: 2.57 s, total: 24.4 s
Wall time: 4min 5s


In [26]:
%%time
tfidf_y_predict = gs_tfidf.predict(X_test)

CPU times: user 8.87 s, sys: 296 ms, total: 9.16 s
Wall time: 9.2 s


### 生成提交文件

In [27]:
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})

submission_tfidf= pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})


submission_count.to_csv('../Datasets/IMDB/submission_count.csv', index=False)
submission_tfidf.to_csv('../Datasets/IMDB/submission_tfidf.csv', index=False)

In [36]:
unlabeled_train = pd.read_csv('../Datasets/IMDB/unlabeledTrainData.tsv', delimiter='\t', quoting=3)

In [37]:
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [38]:
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_text(raw_sentence, False))
    
    return sentences


In [40]:

corpora = []  
    
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review, tokenizer)


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [41]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 20   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words



In [42]:
from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(corpora, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

model.init_sims(replace=True)

model_name = "../Datasets/IMDB/300features_20minwords_10context"
model.save(model_name)

Training model...


In [43]:
from gensim.models import Word2Vec

model = Word2Vec.load("../Datasets/IMDB/300features_20minwords_10context")
model.most_similar("man")

  after removing the cwd from sys.path.


[('woman', 0.6218575835227966),
 ('lady', 0.5581679344177246),
 ('soldier', 0.5439067482948303),
 ('person', 0.5426537990570068),
 ('lad', 0.5375710129737854),
 ('guy', 0.5335234999656677),
 ('men', 0.5238945484161377),
 ('boy', 0.5075846910476685),
 ('doctor', 0.5016827583312988),
 ('farmer', 0.5004631280899048)]

In [49]:
import numpy as np


def makeFeatureVec(words, model, num_features):

    featureVec = np.zeros((num_features,), dtype="float32")

    nwords = 0.

    index2word_set = set(model.index2word_set)

    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])

    featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):

    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")

    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(
            review, model, num_features)

        counter += 1

    return reviewFeatureVecs

In [50]:
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_text( review, remove_stopwords=True ))

trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_text( review, remove_stopwords=True ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

AttributeError: 'Word2Vec' object has no attribute 'index2word_set'

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

gbc = GradientBoostingClassifier()

params_gbc = {'n_estimators':[10, 100, 500], 'learning_rate':[0.01, 0.1, 1.0], 'max_depth': [2, 3, 4]}
gs = GridSearchCV(gbc, params_gbc, cv=4, n_jobs=-1, verbose=1)

gs.fit(trainDataVecs, y_train)

print(gs.best_score_)
print(gs.best_params_)

result = gs.predict(testDataVecs)
# Write the test results 
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "../Datasets/IMDB/submission_w2v.csv", index=False, quoting=3)

（end）

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["A smile is the most charming part of a person forever.","A smile is"]

In [28]:
ngram_vectorizer = CountVectorizer(ngram_range=(2,2),decode_error='ignore',token_pattern = r'\b\w+\b',min_df=1)

In [29]:
x1 = ngram_vectorizer.fit_transform(text)

In [34]:
ngram_vectorizer.vocabulary_

{'a person': 0,
 'a smile': 1,
 'charming part': 2,
 'is the': 3,
 'most charming': 4,
 'of a': 5,
 'part of': 6,
 'person forever': 7,
 'smile is': 8,
 'the most': 9}

In [33]:
print(x1)

  (0, 7)	1
  (0, 0)	1
  (0, 5)	1
  (0, 6)	1
  (0, 2)	1
  (0, 4)	1
  (0, 9)	1
  (0, 3)	1
  (0, 8)	1
  (0, 1)	1
  (1, 8)	1
  (1, 1)	1


In [31]:
x1.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [35]:
# 如果ngram_range=(2, 4)，则表示2，3,4个单词切割
ngram_vectorizer = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",
                                        token_pattern = r'\b\w+\b',min_df=1)
x1 = ngram_vectorizer.fit_transform(text)
print(x1)

  (0, 16)	1
  (0, 19)	1
  (0, 7)	1
  (0, 13)	1
  (0, 26)	1
  (0, 10)	1
  (0, 23)	1
  (0, 4)	1
  (0, 1)	1
  (0, 15)	1
  (0, 18)	1
  (0, 6)	1
  (0, 12)	1
  (0, 25)	1
  (0, 9)	1
  (0, 22)	1
  (0, 3)	1
  (0, 20)	1
  (0, 0)	1
  (0, 14)	1
  (0, 17)	1
  (0, 5)	1
  (0, 11)	1
  (0, 24)	1
  (0, 8)	1
  (0, 21)	1
  (0, 2)	1
  (1, 3)	1
  (1, 21)	1
  (1, 2)	1


In [36]:
ngram_vectorizer.vocabulary_

{'a person': 0,
 'a person forever': 1,
 'a smile': 2,
 'a smile is': 3,
 'a smile is the': 4,
 'charming part': 5,
 'charming part of': 6,
 'charming part of a': 7,
 'is the': 8,
 'is the most': 9,
 'is the most charming': 10,
 'most charming': 11,
 'most charming part': 12,
 'most charming part of': 13,
 'of a': 14,
 'of a person': 15,
 'of a person forever': 16,
 'part of': 17,
 'part of a': 18,
 'part of a person': 19,
 'person forever': 20,
 'smile is': 21,
 'smile is the': 22,
 'smile is the most': 23,
 'the most': 24,
 'the most charming': 25,
 'the most charming part': 26}