## Vector Space Model (better than Bag of Words Model as it also has the frequencies stored)

In [55]:
train_set = train_set = ["The sky is blue.", "The sun is bright"]

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
count_vec.fit_transform(train_set)
count_vec.get_feature_names()

['blue', 'bright', 'is', 'sky', 'sun', 'the']

In [57]:
test_set = ["The sky is bright and blue", "Bright day"]
freq_term = count_vec.transform(test_set)
freq_term.todense()

matrix([[1, 1, 1, 1, 0, 1],
        [0, 1, 0, 0, 0, 0]], dtype=int64)

### Movie Review

In [63]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, movie_reviews, wordnet
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer

stops = set(stopwords.words('english'))
stops.update(".", "?", '"', "'", "-", "(", ")", ",", ":", "/", "!", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9")

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
        
reviews = []
categories = []
for (rev, category) in documents:
    reviews.append(rev)
    categories.append(category)

In [64]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [65]:
lemmatizer = WordNetLemmatizer()

def clean_data(data):
    meaningful_words = []
    for w in data:
        if w.lower() not in stops:
            pos = pos_tag([w])
            n_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            meaningful_words.append(n_word.lower())
    return (" ".join(meaningful_words))

In [66]:
clean_review = clean_data(reviews[0])
clean_review

'plot two teen couple go church party drink drive get accident one guy dy girlfriend continue see life nightmare deal watch movie sorta find critique mind fuck movie teen generation touch cool idea present bad package make review even harder one write since generally applaud film attempt break mold mess head lose highway & memento good bad way make type film folk snag one correctly seem take pretty neat concept execute terribly problem movie well main problem simply jumbled start normal downshift fantasy world audience member idea go dream character come back dead others look like dead strange apparition disappearance looooot chase scene ton weird thing happen simply explain personally mind try unravel film every give clue get kind fed film big problem obviously get big secret hide seem want hide completely final five minute make thing entertain thrill even engage meantime really sad part arrow dig flick like actually figure half way point strangeness start make little bit sense still 

In [67]:
clean_reviews = [clean_data(review) for review in reviews]

In [68]:
from sklearn.cross_validation import train_test_split

x_train, x_test, y_train, y_test = train_test_split(clean_reviews, categories, test_size = 0.2)
count_vec = CountVectorizer(analyzer = "word", max_features = 4500)

In [69]:
train_transformed = count_vec.fit_transform(x_train)
train_transformed.shape

(1600, 4500)

In [71]:
test_transformed = count_vec.transform(x_test)

In [73]:
from sklearn.svm import SVC
clf = SVC(kernel = 'linear', C = 1)
clf.fit(train_transformed, y_train)
clf.score(test_transformed, y_test)

0.82250000000000001

In [74]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf1 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf1.fit(train_transformed, y_train)
clf1.score(test_transformed, y_test)

0.81999999999999995

In [75]:
clf1.best_estimator_

SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [79]:
from sklearn import tree
clf2 = tree.DecisionTreeClassifier()
clf2.fit(train_transformed, y_train)
clf2.score(test_transformed, y_test)

0.62250000000000005

In [80]:
from sklearn import ensemble
clf3 = ensemble.RandomForestClassifier(n_estimators=10)
clf3.fit(train_transformed ,y_train)
clf3.score(test_transformed, y_test)

0.67000000000000004

In [81]:
from sklearn import neighbors
KNN = neighbors.KNeighborsClassifier()
KNN.fit(train_transformed, y_train)
KNN.score(test_transformed, y_test)

0.56999999999999995