In [30]:
#to create data directly in the form sklearn requires, we use count 
#vectorizer. So use nltk just for cleaning data, then onwards use count vectorizer and apply sklearn

In [31]:
#data cleaning as usual.

In [32]:
from nltk.corpus import movie_reviews

In [33]:
import nltk 
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/aashishchoudhary/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [34]:
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))

In [35]:
import random
random.shuffle(documents)

In [36]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [37]:
from nltk import pos_tag
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ    #adjective
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV         #adverb
    else:
        return wordnet.NOUN

In [38]:
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))

In [39]:
import string
punctuations=list(string.punctuation)
stops.update(punctuations)

In [40]:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stops:
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))  #for selecting the type of pos
            output_words.append(clean_word.lower())
    return output_words

In [41]:
documents=[(clean_review(document),category) for document,category in documents]

In [42]:
#use Count vectorizer after this

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
train_set={'the sky is blue','the sun is bright'}

In [45]:
count_vec=CountVectorizer(max_features=3)  #using this count vectorizer will chose top 3 features on its own
#and make 3 rows corresonding to these words(max freq) as features with their freq.
a=count_vec.fit_transform(train_set)

In [46]:
a

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [47]:
a.todense()

matrix([[0, 1, 1],
        [1, 1, 1]], dtype=int64)

In [48]:
count_vec.get_feature_names()  #is , the were highest freq words
# in train set, blue was in sentence 1 only. These r top 3 words
#picked by count vectorizer. So it picks highest freq words, make array
#on its own.

['blue', 'is', 'the']

In [49]:
a=["as","such"]
" ".join(a)

'as such'

In [62]:
categories=[category for document,category in documents]

In [63]:
text_documents=[" ".join(document) for document,category in documents]

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
x_train,x_test,y_train,y_test=train_test_split(text_documents,categories)

In [69]:
count_vec=CountVectorizer(max_features=2000)
x_train_features=count_vec.fit_transform(x_train)

In [70]:
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 4, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0]], dtype=int64)

In [68]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'agrees',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'ala',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apar

In [71]:
x_test_features=count_vec.transform(x_test)

In [73]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 85864 stored elements in Compressed Sparse Row format>

In [None]:
#now we can use any classifier on it.

In [75]:
from sklearn.svm import SVC

In [76]:
svc=SVC()

In [77]:
svc.fit(x_train_features,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [85]:
svc.score(x_test_features,y_test)

0.818

In [None]:
#using n-grams

In [79]:
#count_vectorizer helps find best feature from given doc on its own, we just need to do data cleaning
#this can be directly used in nltk.

In [80]:
#different options available for count_vectorizer: max-features,
#ngram, analyzer.

In [82]:
#in n gram,eg if n=2, it will consider combination of 
#2 words also,if words like "not good" are repeating very often
#it can be set as feature , which is logical.

In [84]:
count_vec=CountVectorizer(max_features=2000,ngram_range=(2,3))
x_train_features=count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [86]:
count_vec.get_feature_names()  # as we can feature name is combination
#of 2-3 words instead of 1. n gram can lead to good results in some 
#projects

['10 10',
 '10 minute',
 '10 scale',
 '10 year',
 '100 million',
 '100 minute',
 '14 year',
 '15 minute',
 '19th century',
 '20 minute',
 '20 year',
 '30 year',
 '90 minute',
 'able make',
 'absolutely nothing',
 'academy award',
 'ace ventura',
 'across country',
 'act ability',
 'act film',
 'act like',
 'act one',
 'act skill',
 'act talent',
 'action comedy',
 'action film',
 'action flick',
 'action hero',
 'action movie',
 'action scene',
 'action sequence',
 'action thriller',
 'actor film',
 'actor play',
 'actually get',
 'actually quite',
 'adam sandler',
 'african american',
 'al pacino',
 'albert brook',
 'alec baldwin',
 'alicia silverstone',
 'alien film',
 'alien resurrection',
 'almost always',
 'almost entirely',
 'almost every',
 'along way',
 'already know',
 'already see',
 'also direct',
 'also feature',
 'also get',
 'also give',
 'also good',
 'also happens',
 'also help',
 'also include',
 'also know',
 'also like',
 'also make',
 'also one',
 'also quite',
 'al

In [87]:
x_test_features=count_vec.transform(x_test)

In [88]:
svc=SVC()

In [89]:
svc.fit(x_train_features,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [90]:
svc.score(x_test_features,y_test)

0.488

In [None]:
#using 1-2 ngrams

In [91]:
count_vec=CountVectorizer(max_features=2000,ngram_range=(1,2))
x_train_features=count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 4, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0]], dtype=int64)

In [92]:
x_test_features=count_vec.transform(x_test)

In [93]:
svc=SVC()

In [94]:
svc.fit(x_train_features,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [95]:
svc.score(x_test_features,y_test)  #improved score.

0.82

In [96]:
#TF-IDF

In [None]:
#tf: term frequency: no of docs that contain word w, 
#word freq!= doc freq, a word may be very repetitive in each doc, which is
#logically irrelevant, and there may be word with less freq, but 
#present in many docs(doc freq).i.e imp
#in count_vectozier there are min_df and max_df option to chose
#min and max percent of doc freq to get rid of some useless word in
#vocubulary.


#idf: inverse document freq