In [1]:
import pandas as pd

In [2]:
filepath = ('https://raw.githubusercontent.com/nileshgode/GitDemo/master/amazon_cells_labelled.txt')

In [3]:
data = pd.read_csv(filepath,  sep='\t', header=None)
data.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [4]:
X = data.iloc[:,0] # extract column with reviews
y = data.iloc[:,-1] # extract column with sentiments

In [5]:
X

0      So there is no way for me to plug it in here i...
1                            Good case, Excellent value.
2                                 Great for the jawbone.
3      Tied to charger for conversations lasting more...
4                                      The mic is great.
                             ...                        
995    The screen does get smudged easily because it ...
996    What a piece of junk.. I lose more calls on th...
997                         Item Does Not Match Picture.
998    The only thing that disappoint me is the infra...
999    You can not answer calls with the unit, never ...
Name: 0, Length: 1000, dtype: object

In [6]:
y

0      0
1      1
2      1
3      0
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: 1, Length: 1000, dtype: int64

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
X_vec.todense() # convert sparse matrix into dense matrix

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_vec)
X_tfidf = X_tfidf.todense()

In [9]:
X_tfidf

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.25, random_state = 0)

In [15]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
y_pred = clf.predict(X_test)

In [17]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 87,  33],
       [ 20, 110]])

In [18]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [19]:
y_pred = classifier.predict(X_test)

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[102,  18],
       [ 33,  97]])

In [21]:
import pickle
pickle.dump(vectorizer, open("vectorizer_sa", 'wb')) # Save vectorizer for reuse
pickle.dump(classifier, open("nb_sa", 'wb')) # Save classifier for reuse

In [22]:
def sentiment_pred(classifier, training_matrix, doc):
    """function to predict the sentiment of a product review
    
       classifier : pre trained model
       training_matrix : matrix of features associated with the trained 
       model
       doc = product review whose sentiment needs to be identified"""
       
    X_new = training_matrix.transform(pd.Series(doc)) 
    #don't use fit_transform here because the model is already fitted
    X_new = X_new.todense() #convert sparse matrix to dense

    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf = TfidfTransformer()
    X_tfidf_new = tfidf.fit_transform(X_new)
    X_tfidf_new = X_tfidf_new.todense()

    y_new = classifier.predict(X_tfidf_new)
    if y_new[0] == 0:
        return "negative sentiment"
    elif y_new[0] == 1:
        return "positive sentiment"

In [23]:
nb_clf = pickle.load(open("nb_sa", 'rb'))
vectorizer = pickle.load(open("vectorizer_sa", 'rb'))
new_doc = "The gadget works like a charm. Very satisfied with the product"
sentiment_pred(nb_clf, vectorizer, new_doc)

'positive sentiment'

In [24]:
new_doc = "Not even close to the quality one would expect"
sentiment_pred(nb_clf, vectorizer, new_doc)

'negative sentiment'