# Simple Sentiment Analysis Example

In [42]:
# Toy training corpus: four positive reviews followed by three negative ones.
x_train = [
    'This was awesome an awesome movie',
    'Great movie! I liked it a lot',
    'Happy Ending! awesome acting by the hero',
    'loved it! truly great',
    'bad not upto mark',
    'could have been better',
    'surely a Disapointing movie',
]

# Labels aligned with x_train by position: 1 = Positive, 0 = Negative class.
y_train = [1, 1, 1, 1, 0, 0, 0]

# Sentences that are vectorised and classified after training.
x_test = [
    'I was happy & happy and I loved the acting in the movie',
    ' The movie I saw was bad',
]

In [93]:
# Optional: uncomment to classify only the single negative sentence.
# x_test = ['The movie I saw was bad']

## Data Cleaning

In [94]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\.....
[nltk_data]   Package stopwords is already up-to-date!


True

In [95]:
# Module-level helpers used inside getCleanedText.
ps = PorterStemmer()                             # stems each remaining token
en_stopwords = set(stopwords.words('english'))   # set for quick membership tests
tokenizer = RegexpTokenizer(r'\w+')              # tokens are runs of word characters

In [96]:
def getCleanedText(text):
  """Normalise a sentence into a space-separated string of stemmed tokens.

  The text is lower-cased, split into tokens with the module-level
  `tokenizer`, filtered against `en_stopwords`, and every surviving token
  is stemmed with `ps`.

  Args:
    text: the raw sentence (a string).

  Returns:
    The stemmed, stop-word-free tokens joined by single spaces.
  """
  words = tokenizer.tokenize(text.lower())

  # Skip stop words so only the remaining tokens are stemmed.
  stems = []
  for word in words:
    if word in en_stopwords:
      continue
    stems.append(ps.stem(word))

  return ' '.join(stems)

In [97]:
# Apply the same cleaning function to the training and the test sentences.
x_clean = list(map(getCleanedText, x_train))
xt_clean = list(map(getCleanedText, x_test))

In [98]:
# Display the cleaned training sentences to check the preprocessing result.
x_clean

['awesom awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad upto mark',
 'could better',
 'sure disapoint movi']

## Vectorization

In [99]:
from sklearn.feature_extraction.text import CountVectorizer

In [100]:
# ngram_range=(1, 2): count single tokens as well as pairs of adjacent tokens.
cv = CountVectorizer(ngram_range=(1, 2))

In [101]:
# Learn the vocabulary from the cleaned training text and convert the counts to a dense array.
x_vec = cv.fit_transform(x_clean).toarray()

In [102]:
# One row per training sentence, one column per vocabulary entry.
x_vec

array([[0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]], dtype=int64)

In [103]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on old releases.
if hasattr(cv, 'get_feature_names_out'):
  # .tolist() turns the returned ndarray into a plain list of str so the
  # printed output keeps the same list format as before.
  feature_names = cv.get_feature_names_out().tolist()
else:
  feature_names = cv.get_feature_names()
print(feature_names)

['act', 'act hero', 'awesom', 'awesom act', 'awesom awesom', 'awesom movi', 'bad', 'bad upto', 'better', 'could', 'could better', 'disapoint', 'disapoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'sure', 'sure disapoint', 'truli', 'truli great', 'upto', 'upto mark']


In [104]:
# transform() only (no refit): the test text is encoded with the vocabulary fitted on x_clean.
xt_vect = cv.transform(xt_clean).toarray()

## Multinomial Naive-Bayes

In [105]:
from sklearn.naive_bayes import MultinomialNB

In [106]:
# Multinomial Naive Bayes classifier, constructed with its default arguments.
mn = MultinomialNB()

In [107]:
# Train on the training count vectors and their 0/1 sentiment labels.
mn.fit(x_vec, y_train)

MultinomialNB()

In [108]:
# Predict a class label for each vectorised test sentence.
y_pred = mn.predict(xt_vect)

In [109]:
# Labels use the y_train encoding: 1 = Positive, 0 = Negative.
y_pred

array([1,0])