In [1]:
# https://github.com/PacktPublishing/Hands-on-NLP-with-NLTK-and-scikit-learn-
from sklearn import datasets, feature_extraction, model_selection, linear_model
import nltk

In [2]:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Read the review files from disk; shuffle=True asks the loader to shuffle the documents.
movie_data = datasets.load_files('../large_files/movie_reviews', shuffle=True)
# Show which fields the loaded dataset object exposes.
movie_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
movie_data.target_names

['neg', 'pos']

The class labels are encoded as integers that index into `target_names`: 0 means `neg` and 1 means `pos`.

In [4]:
movie_data.target[0:10]

array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0])

In [5]:
movie_data.data[4] # pos example

b"kolya is one of the richest films i've seen in some time . \nzdenek sverak plays a confirmed old bachelor ( who's likely to remain so ) , who finds his life as a czech cellist increasingly impacted by the five-year old boy that he's taking care of . \nthough it ends rather abruptly-- and i'm whining , 'cause i wanted to spend more time with these characters-- the acting , writing , and production values are as high as , if not higher than , comparable american dramas . \nthis father-and-son delight-- sverak also wrote the script , while his son , jan , directed-- won a golden globe for best foreign language film and , a couple days after i saw it , walked away an oscar . \nin czech and russian , with english subtitles . \n"

Defining our custom feature extraction

In [6]:
def extract_features(corpus, use_tfidf=False):
    '''Vectorize a corpus into a sparse document-term matrix.

    Every document is lowercased, split into tokens with NLTK's word
    tokenizer, filtered against the English stop-word list, and counted.
    Terms that occur in fewer than two documents are dropped.

    Args:
        corpus: iterable of raw documents, as accepted by
            CountVectorizer.fit_transform.
        use_tfidf: when True, the raw counts are re-weighted with
            TfidfTransformer before being returned. Defaults to False,
            which returns plain term counts (the behaviour this function
            has always had, since the TF-IDF step used to be commented out).

    Returns:
        Sparse matrix with one row per document and one column per
        vocabulary term.
    '''
    # vectorize means we turn non-numerical data into an array of numbers
    # NOTE(review): the built-in 'english' stop list may not line up with every
    # token NLTK produces (e.g. split contractions) - confirm if this matters.
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=2  # minimum document frequency, i.e. the word must appear at least in 2 docs
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)

    if use_tfidf:
        # replace the raw counts with weighted TF-IDF values
        processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(processed_corpus)

    return processed_corpus

This transforms each document into a vector of length n, where n is the number of terms in the vocabulary (24,975 in the output below).

In [7]:
# NOTE: despite the name, this matrix holds raw term counts, not TF-IDF weights -
# extract_features as called here only runs the CountVectorizer step.
movie_tfidf = extract_features(movie_data.data)

In [8]:
movie_tfidf[4]

<1x24975 sparse matrix of type '<class 'numpy.int64'>'
	with 66 stored elements in Compressed Sparse Row format>

In [9]:
# Hold out 30% of the documents for testing. random_state pins the split so the
# scores reported further down are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    movie_tfidf, movie_data.target, test_size=0.3, random_state=42
)
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
model.score(X_train, y_train) # on training data

1.0

In [11]:
model.score(X_test, y_test) # on testing data

0.8233333333333334