In [2]:
from gensim.models import KeyedVectors
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

from nltk import sent_tokenize
from nltk import pos_tag
from nltk import map_tag
from nltk import word_tokenize
from nltk.corpus import stopwords

In [9]:
# English stop words from NLTK's stopwords corpus, held in a set so that
# membership tests do not scan a list.
stop_words = set()
stop_words.update(stopwords.words('english'))

def tag_pos(x):
    """Tag every token of the text *x* with a universal part-of-speech tag.

    The text is split into sentences, each sentence is tokenized and tagged
    with ``pos_tag``, and every resulting tag is translated from the
    ``'en-ptb'`` tagset to the ``'universal'`` tagset with ``map_tag``.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(word, universal_tag)`` tuples in token order.
    """
    tagged_sentences = []
    for sentence in sent_tokenize(x):
        tokens = word_tokenize(sentence)
        universal = []
        for token, ptb_tag in pos_tag(tokens):
            universal.append((token, map_tag('en-ptb', 'universal', ptb_tag)))
        tagged_sentences.append(universal)
    return tagged_sentences

In [2]:
def post_tag_documents(data_df):
    """POS-tag every plot in *data_df* and pair it with its label row.

    Args:
        data_df: DataFrame with ``plot``, ``title`` and ``plot_lang``
            columns.  Every remaining column is treated as part of the
            label row returned for each document.

    Returns:
        A ``(x_data, y_data)`` tuple of equally long lists.  ``x_data[i]`` is
        the ``tag_pos`` output for the i-th plot and ``y_data[i]`` is the
        i-th row of the remaining columns as a NumPy array.
    """
    # ``.values`` is used instead of ``DataFrame.as_matrix()``: the latter
    # was deprecated and later removed from pandas, while ``.values`` is
    # available on both old and new releases.
    plots = data_df['plot'].values.tolist()
    genres = data_df.drop(['plot', 'title', 'plot_lang'], axis=1).values
    total = len(plots)

    x_data = []
    y_data = []
    # Count from 1 so the progress line reports documents completed.
    for done, (plot, genre_row) in enumerate(zip(plots, genres), start=1):
        x_data.append(tag_pos(plot))
        y_data.append(genre_row)
        if done % 5000 == 0:
            print(done, "/", total)
    return x_data, y_data

In [4]:
def doc2vec(data_df):
    """Train a Doc2Vec model on the plots and return one vector per row.

    Each row of *data_df* becomes a ``TaggedDocument`` whose words are the
    lower-cased tokens of ``plot`` and whose single tag is ``title`` with
    runs of whitespace replaced by underscores.  Rows that share a title
    therefore share a tag, and receive the same vector.

    Args:
        data_df: DataFrame with ``title``, ``plot`` and ``plot_lang``
            columns.  Every remaining column is returned as the label row.

    Returns:
        A ``(x_data, y_data)`` tuple of NumPy arrays: the learned document
        vector for each row's tag, and that row's values from the remaining
        columns.
    """
    # Imported locally because the file's visible import section does not
    # bring numpy into scope.
    import numpy as np

    print("Building TaggedDocuments")
    # ``.values`` replaces ``DataFrame.as_matrix()``, which was deprecated
    # and later removed from pandas; ``.values`` works on old and new
    # releases alike.
    rows = data_df[['title', 'plot']].values.tolist()
    total = len(rows)

    data = []
    tags = []
    for processed, (title, plot) in enumerate(rows, start=1):
        tag = "_".join(title.split())
        # Only byte strings need decoding; a text string has no usable
        # decode() and would raise here.
        if isinstance(plot, bytes):
            plot = plot.decode("utf8")
        words = []
        for sentence in sent_tokenize(plot):
            words.extend(token.lower() for token in word_tokenize(sentence))
        data.append(TaggedDocument(words, [tag]))
        tags.append(tag)

        if processed % 10000 == 0:
            print(processed, "/", total)

    model = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=2)
    print("Building Vocabulary")
    model.build_vocab(data)

    # The model's configured min_alpha is the floor for the manual decay
    # below.  Without a floor, 20 steps of 0.002 drive the learning rate
    # below zero (gensim documents a default starting alpha of 0.025).
    alpha_floor = model.min_alpha
    # NOTE(review): every epoch makes two passes over the data (one before
    # and one after the learning-rate update).  Kept as in the original;
    # confirm whether the second pass is intentional.
    for epoch in range(20):
        print("Training epoch %s" % epoch)
        model.train(data)
        model.alpha = max(model.alpha - 0.002, alpha_floor)  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
        model.train(data)

    # Build doc2vec vectors
    genres = data_df.drop(['title', 'plot', 'plot_lang'], axis=1).values
    x_data = [model.docvecs[tag] for tag in tags]
    y_data = list(genres)

    return np.array(x_data), np.array(y_data)
