In [66]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
from nltk.corpus import stopwords
from string import punctuation

In [33]:
english_stopwords = stopwords.words('english')

In [34]:
non_words = list(punctuation)
non_words.extend(['\n'])
non_words.extend(map(str,range(10)))

In [35]:
from sklearn.feature_extraction.text import CountVectorizer   
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

stemmer = SnowballStemmer('english')

In [36]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [37]:
vectorizer = CountVectorizer(analyzer = 'word', tokenizer = tokenize, lowercase = True, stop_words = english_stopwords)

In [38]:
import os
import tarfile
from contextlib import closing
try:
    from urllib import urlopen
except ImportError:
    from urllib.request import urlopen


URL = ("http://www.cs.cornell.edu/people/pabo/"
       "movie-review-data/review_polarity.tar.gz")

ARCHIVE_NAME = URL.rsplit('/', 1)[1]
DATA_FOLDER = "txt_sentoken"


if not os.path.exists(DATA_FOLDER):

    if not os.path.exists(ARCHIVE_NAME):
        print("Downloading dataset from %s (3 MB)" % URL)
        opener = urlopen(URL)
        with open(ARCHIVE_NAME, 'wb') as archive:
            archive.write(opener.read())

    print("Decompressing %s" % ARCHIVE_NAME)
    with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
        archive.extractall(path='.')
    os.remove(ARCHIVE_NAME)
    
#load data
from sklearn.datasets import load_files
dataset = load_files('txt_sentoken', shuffle=False)
print("n_samples: %d" % len(dataset.data))

#create text and train
from sklearn.cross_validation import train_test_split
# split the dataset in training and test set:
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=3948)

n_samples: 2000


In [39]:
for i in range(0,len(X_train)):
    X_train[i]=X_train[i].decode('utf-8')

In [40]:
for i in range(0,len(X_test)):
    X_test[i]=X_test[i].decode('utf-8')

In [41]:
X_train = vectorizer.fit_transform(X_train)

In [42]:
print(X_train.toarray())

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [43]:
from sklearn.linear_model import SGDClassifier
clasificador=SGDClassifier().fit(X_train.toarray(),y_train)

In [44]:
X_test=vectorizer.transform(X_test)

In [45]:
y_predict=clasificador.predict(X_test.toarray())

In [46]:
print("Aciertos: % d" % sum(y_predict==y_test)) #aciertos del modelo anterior

Aciertos:  407


In [47]:
print("Errores: % d" % sum(y_predict!=y_test))#fallos del modelo anterior

Errores:  93
