# Finance & Analytics Club

## Vectorizing documents using NLTK

In [12]:
# Sample sentence that the tokenization and stemming cells below operate on.
text = "I am not a sentimental person but I believe in the utility of sentiment analysis"

### Tokenization - process of splitting text into relevant units called tokens, usually by words

In [14]:
import nltk
from nltk.corpus import stopwords
# Fetch every NLTK resource this notebook reads: the tokenizer model, the
# stop-word lists and the WordNet data used by the lemmatizer. Without the
# 'stopwords' download the stop-word cell fails on a fresh environment.
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt' -- add that download if word_tokenize raises LookupError.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/subhamgupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/subhamgupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
from nltk.tokenize import word_tokenize
# Split the sample sentence into word-level tokens.
tokens = word_tokenize(text)
print(tokens)

['I', 'am', 'not', 'a', 'sentimental', 'person', 'but', 'I', 'believe', 'in', 'the', 'utility', 'of', 'sentiment', 'analysis']


### Lemmatization - reducing inflected words to their dictionary form (lemma).

In [18]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# lemmatize() treats every word as a noun unless a `pos` tag is passed, so
# verbs and adjectives are returned unchanged here.
# The result gets its own name: overwriting `tokens` would make what this
# cell shows depend on which cells ran before it.
lemmas = [lemmatizer.lemmatize(word) for word in tokens]
lemmas

['I',
 'am',
 'not',
 'a',
 'sentimental',
 'person',
 'but',
 'I',
 'believe',
 'in',
 'the',
 'utility',
 'of',
 'sentiment',
 'analysis']

In [20]:
# No `pos` argument is given, so the word is looked up as a noun and
# 'singing' comes back unchanged; lemmatize('singing', pos='v') would be
# needed to reduce it to the verb lemma.
a = lemmatizer.lemmatize('singing')
a

'singing'

### Stemming - stripping suffixes to reduce each word to a crude stem, which need not be a real word (e.g. 'believ', 'util')

In [22]:
from nltk.stem import PorterStemmer
# Lower-case the sentence, tokenize it again, then Porter-stem every token.
ps = PorterStemmer()
tokens = list(map(ps.stem, word_tokenize(text.lower())))
print(tokens)

['i', 'am', 'not', 'a', 'sentiment', 'person', 'but', 'i', 'believ', 'in', 'the', 'util', 'of', 'sentiment', 'analysi']


## Note the difference between stemming (easy) and lemmatization (hard)

### Stop words - removing common words that carry little sentiment

In [24]:
# English stop-word list from NLTK's 'stopwords' corpus.
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the
# corpus reader. The lookup goes through `nltk.corpus.stopwords`, so re-running
# this cell still works, but `stopwords` is a plain list from here on.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [26]:
# Drop every stemmed token that appears in the stop-word list.
# A set gives constant-time membership tests; the list would be scanned once
# per token.
# Caveat for sentiment work: the English list contains negations such as
# 'not', so they are removed from the token stream as well.
stopword_set = set(stopwords)
tokens_new = [token for token in tokens if token not in stopword_set]

In [28]:
# Tokens that survive stop-word filtering.
tokens_new

['sentiment', 'person', 'believ', 'util', 'sentiment', 'analysi']

# VADER

https://github.com/cjhutto/vaderSentiment

In [30]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# One analyzer instance, reused by every polarity_scores() call below.
analyser = SentimentIntensityAnalyzer()

In [32]:
# Baseline: a mildly positive sentence.
analyser.polarity_scores("This is a good course")

{'neg': 0.0, 'neu': 0.58, 'pos': 0.42, 'compound': 0.4404}

In [34]:
analyser.polarity_scores("This is an awesome course")  # stronger word ("awesome" vs. "good") gives a higher positive score

{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'compound': 0.6249}

In [36]:
# Baseline for the punctuation and capitalization variants that follow.
analyser.polarity_scores("The instructor is so cool")

{'neg': 0.0, 'neu': 0.572, 'pos': 0.428, 'compound': 0.4572}

In [38]:
analyser.polarity_scores("The instructor is so cool!!")  # exclamation marks change the score

{'neg': 0.0, 'neu': 0.528, 'pos': 0.472, 'compound': 0.5537}

In [40]:
analyser.polarity_scores("The instructor is so COOL!!")  # capitalization changes the score

{'neg': 0.0, 'neu': 0.471, 'pos': 0.529, 'compound': 0.6696}

In [42]:
analyser.polarity_scores("Machine learning makes me :)")  # emoticons are scored too

{'neg': 0.0, 'neu': 0.571, 'pos': 0.429, 'compound': 0.4588}

In [44]:
# Internet acronyms such as "ROFL" are scored as well.
analyser.polarity_scores("His antics had me ROFL")

{'neg': 0.0, 'neu': 0.474, 'pos': 0.526, 'compound': 0.6633}

In [46]:
analyser.polarity_scores("The movie SUX")  # slang is scored too

{'neg': 0.618, 'neu': 0.382, 'pos': 0.0, 'compound': -0.4995}

# TextBlob

https://github.com/sloria/TextBlob

In [48]:
from textblob import TextBlob

In [50]:
# Score each word of the example sentence on its own before scoring the whole sentence.
TextBlob("His").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [52]:
# Sentiment-bearing word: non-zero polarity and subjectivity.
TextBlob("remarkable").sentiment

Sentiment(polarity=0.75, subjectivity=0.75)

In [54]:
# Scores 0.0 / 0.0 on its own.
TextBlob("work").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [56]:
# Scores 0.0 / 0.0 on its own.
TextBlob("ethic").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [58]:
# Sentiment-bearing word: polarity and subjectivity of 1.0.
TextBlob("impressed").sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [60]:
# Scores 0.0 / 0.0 on its own.
TextBlob("me").sentiment

Sentiment(polarity=0.0, subjectivity=0.0)

In [62]:
# The sentence score here equals the mean of the non-zero word scores:
# ("remarkable" 0.75 + "impressed" 1.0) / 2 = 0.875, for polarity and subjectivity alike.
TextBlob("His remarkable work ethic impressed me").sentiment

Sentiment(polarity=0.875, subjectivity=0.875)