In [None]:
# Toy corpus: five short sentences used to demonstrate the vectorizers below.
corpus = [
    'I saw the saw.',
    'I saw her standing there.',
    'Ofcourse I give her an umbrella.',
    'But she give me the saw.',
    'I come but she run.',
]

### Bag of words

#### CountVectorizer

In [None]:
# Import CountVectorizer and create an instance with the default parameters.
# `vect` is reused by later cells (fit, transform, feature-name lookups).
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [None]:
# Uncomment the next line to view the CountVectorizer docstring in Jupyter/IPython:
# CountVectorizer?

In [None]:
# Learn the vocabulary of the training sentences.
# fit() updates `vect` in place; the estimator it returns is shown as the cell output.
vect.fit(corpus)

In [None]:
# Examine the fitted vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since scikit-learn 1.0).
vect.get_feature_names_out()

In [None]:
# Transform the training sentences into a document-term matrix
# using the vocabulary that `vect` learned above.
simple_train_dtm = vect.transform(corpus)

In [None]:
# Convert the sparse matrix to a dense array so the values can be inspected directly.
simple_train_dtm.toarray()

In [None]:
import pandas as pd

# Examine the vocabulary and the document-term matrix together:
# the vocabulary terms become the column labels of the DataFrame.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

#### TfidfVectorizer

In [None]:
# Build a TF-IDF vectorizer with default parameters, then learn the vocabulary
# and transform the corpus in a single fit_transform() call.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since scikit-learn 1.0).
vectorizer.get_feature_names_out()

In [None]:
# Show the TF-IDF matrix with the vocabulary terms as column labels.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Example text for model testing: a single new sentence, wrapped in a list
# because it is passed to vect.transform() in the next cell.
simple_test = ["i give the saw"]

In [None]:
# Transform the test text into a document-term matrix using the vocabulary
# already learned by `vect` (the vectorizer is not re-fitted here),
# then display it as a dense array.
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

In [None]:
# Examine the vocabulary and the test document-term matrix together.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())

### Sentiment Analysis

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

In [None]:
# Labelled dataset: one {'text', 'sentiment'} record per sentence.
# Labels are the strings '-1', '0' and '1'
# (presumably negative / neutral / positive -- TODO confirm the labelling scheme).
corpus = [
    {'text': text, 'sentiment': sentiment}
    for text, sentiment in (
        # toy sentences reused from the bag-of-words example
        ('I saw the saw.', '0'),
        ('I saw her standing there.', '1'),
        ('Ofcourse I give her an umbrella.', '1'),
        ('But she give me the saw.', '-1'),
        ('I come but she run.', '-1'),
        # sentences about the moon
        ('The moon is Earth’s closest neighbor.', '0'),
        ('It is the brightest object in the night sky..', '1'),
        ('Learn more about the moon.', '0'),
        ('Scientists have studied the moon for thousands of years.', '0'),
        ('The moon is about 4½ billion years old.', '0'),
        # sentences about homes
        ('People build homes.', '1'),
        ('A home keeps us warm and dry.', '1'),
        ('It gives us a place to live.', '1'),
        ('Animals also build homes.', '1'),
        ('Read on to see how.', '0'),
        # sentences about the skeleton
        ('This is a human skeleton.', '0'),
        ('It is made of bones.', '0'),
        ('Bones shape and support your body.', '1'),
        ('They also protect your organs.', '1'),
        ('Calcium helps bones grow strong.', '1'),
    )
]


In [None]:
# Build a DataFrame from the list of records; the 'text' and 'sentiment'
# keys are read back as columns in the next cell.
news_corpus = pd.DataFrame.from_dict(corpus)

In [None]:
# Separate the sentences (features) from the sentiment labels (target)
# and print the shape of each, one per line.
X, y = news_corpus.text, news_corpus.sentiment
print(X.shape, y.shape, sep="\n")

#### Train/test split

In [None]:
# Hold out 10% of the sentences for testing.
# random_state is fixed so the split -- and every metric computed from it
# in later cells -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


In [None]:
# Display the full labelled dataset.
news_corpus

#### Vectorizer

In [None]:
# Re-fit the CountVectorizer on the training sentences only and build the
# training document-term matrix in one step; fit_transform() is equivalent
# to fit() followed by transform() on the same data.
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# Vectorize the test sentences with the already-fitted `vect`
# (transform only, so the vocabulary is not re-learned from the test data).
X_test_dtm = vect.transform(X_test)

### Naive Bayes Classification

In [None]:
# Import and instantiate a multinomial Naive Bayes classifier with default parameters.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# Train the Naive Bayes model on the training document-term matrix;
# the %time magic reports how long the fit takes.
%time nb.fit(X_train_dtm, y_train)

In [None]:
# Predict a sentiment label for each test sentence.
y_pred_class = nb.predict(X_test_dtm)

In [None]:
# Accuracy of the Naive Bayes predictions on the held-out test sentences.
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# Confusion matrix of true labels (y_test) against the Naive Bayes predictions.
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# Display the held-out test sentences.
X_test

In [None]:
# Display the labels Naive Bayes predicted for the test sentences.
y_pred_class

#### Testing

In [None]:
# Classify a new, unseen sentence with the trained Naive Bayes model:
# vectorize it with the fitted `vect`, then predict its label.
# NOTE(review): 'healty' is probably a typo for 'healthy'; neither spelling
# occurs in the labelled sentences -- confirm whether the misspelling is intentional.
test1 = ['A home keeps us warm and healty']
new_article_vect = vect.transform(test1)
nb.predict(new_article_vect)

### SVM Classification

In [None]:
# Import the SVM module and create a support-vector classifier with a linear kernel.
# accuracy_score is imported by name here; the Naive Bayes cells call it as metrics.accuracy_score.
from sklearn import svm
from sklearn.metrics import accuracy_score
clf = svm.SVC(kernel='linear')

In [None]:
# Train the SVM on the same training document-term matrix used for Naive Bayes;
# the %time magic reports how long the fit takes.
%time clf.fit(X_train_dtm, y_train)

In [None]:
# Predict a sentiment label for each test sentence with the trained SVM.
print("Predicting...")
prediction = clf.predict(X_test_dtm)

In [None]:
# Show the SVM predictions and compute their accuracy against the true test labels.
print("Prediction:",prediction)
accuracy = accuracy_score(y_test, prediction)

In [None]:
# Report the SVM accuracy stored in `accuracy`.
# The call that used to sit here, metrics.accuracy_score(y_test, y_pred_class),
# recomputed the Naive Bayes accuracy and discarded the result, so it was removed.
print('Accuracy:', accuracy)

In [None]:
# Classify a new, unseen text with the trained SVM:
# vectorize it with the fitted `vect`, then predict its label.
new_article = ['saw your saw i']
new_article_vect = vect.transform(new_article)
clf.predict(new_article_vect)

# {'news':0, 'finance':1, 'hot':2, 'sport':3, 'travel':4}

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

In [None]:
# Import train_test_split from sklearn.model_selection.
from sklearn.model_selection import train_test_split