# Demo of off-the-shelf and custom sentiment models


In [None]:
import json
import operator
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import boto3
import os

In [None]:
# Remove the proxy variables from this process's environment.
# `pop` with a default (instead of `del`) keeps the cell from raising
# KeyError when a variable is not set or when the cell is run a second time.
for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY"):
    os.environ.pop(proxy_var, None)

### Demo off-the-shelf sentiment analysis from Amazon Comprehend

In [None]:
# The AWS credentials must be provided through the environment variables
# `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` before this cell runs.
comprehend = boto3.client(service_name="comprehend", region_name="us-east-1")

# Send one sample sentence and pretty-print the raw service response.
text = "It is raining today in Seattle"
sentiment_response = comprehend.detect_sentiment(Text=text, LanguageCode="en")
print(json.dumps(sentiment_response, sort_keys=True, indent=4))

In [None]:
def print_aws_sentiment_score(x):
    """Print the Amazon Comprehend sentiment scores for ``x``, highest first.

    Calls ``detect_sentiment`` on the module-level ``comprehend`` client with
    ``LanguageCode='en'`` and prints one ``category: score`` line per entry
    of the response's ``"SentimentScore"`` mapping.

    Args:
        x: The text to score.
    """
    # Score the argument itself. The earlier version passed the global
    # `text`, so the result silently depended on notebook state rather than
    # on what the caller handed in.
    response = comprehend.detect_sentiment(Text=x,
                                           LanguageCode='en')
    # Distinct comprehension names so the parameter `x` is not shadowed.
    score_data = {label: float(value)
                  for label, value in response["SentimentScore"].items()}
    ranked = sorted(score_data.items(), key=operator.itemgetter(1), reverse=True)
    for cat, score in ranked:
        print(f"{cat:10s}: {score:0.3f}")

**Some examples**

In [None]:
# Example: complaint about a boring movie, scored with Amazon Comprehend.
text = "I can't believe I wasted ten dollars on such a boring movie"
print_aws_sentiment_score(text)

In [None]:
# Example: movie remark that praises only the popcorn, scored with Amazon Comprehend.
text = "The only thing I enjoyed about this movie was the popcorn"
print_aws_sentiment_score(text)

In [None]:
# Example: customer-service complaint, scored with Amazon Comprehend.
text = "Your service is horrible; could you just transfer me to someone more helpful?"
print_aws_sentiment_score(text)

In [None]:
# Example: request to speak to a supervisor, scored with Amazon Comprehend.
text = "Could I speak to a supervisor?  I'm going in circles with you."
print_aws_sentiment_score(text)

In [None]:
# Example: statement mentioning legal action, scored with Amazon Comprehend.
text = "If this doesn't get addressed I will have to take legal action"
print_aws_sentiment_score(text)

In [None]:
# Example: remark about an explanation of benefits, scored with Amazon Comprehend.
text = "I read my explanation of benefits and it doesn't say anything about pre-approval"
print_aws_sentiment_score(text)

In [None]:
# Example: remark about a rising copayment, scored with Amazon Comprehend.
text = "My copayment gets higher every year"
print_aws_sentiment_score(text)

### Comparison to domain-specific sentiment

#### Build a sentiment model on the HW3 data

In [None]:
from naive_bayes import IMDBReader

# Directory holding the data; the reader is pointed at its `train` subfolder.
DATA_DIR = "hw3_data"
reader = IMDBReader(DATA_DIR + "/train")

**Read data**

In [None]:
# Collect (text, label) tuples: label 1 for everything the reader returns
# for "pos", then label 0 for everything it returns for "neg".
sentence_label_pairs = []
for polarity, label in (("pos", 1), ("neg", 0)):
    for text in reader.get_texts(polarity):
        sentence_label_pairs.append((text, label))

In [None]:
# Number of (text, label) pairs collected; shown as the cell's output.
len(sentence_label_pairs)

**Vectorize**

In [None]:
# Extract the raw strings once instead of rebuilding the same list for the
# fit and again for the transform.
documents = [pair[0].text for pair in sentence_label_pairs]

# TF-IDF features; terms must occur in at least 5 documents (min_df=5) and
# English stop words are removed.
vectorizer = TfidfVectorizer(min_df=5, stop_words="english")

# fit_transform learns the vocabulary and produces the matrix in a single
# pass over the corpus; `vectorizer` is left fitted for later transform calls.
X = vectorizer.fit_transform(documents)
y = [pair[1] for pair in sentence_label_pairs]

In [None]:
# Show the dimensions of the feature matrix X as the cell's output.
X.shape

**Train model**

In [None]:
# Configure an L2-penalised logistic regression (C=1.0, lbfgs solver), then
# fit it on the feature matrix and labels; `fit` hands back the estimator.
classifier = LogisticRegression(penalty="l2", C=1.0, solver="lbfgs")
lr_model = classifier.fit(X, y)

In [None]:
def print_lr_sentiment_score(x):
    """Print the logistic-regression class probabilities for the text ``x``.

    Vectorizes ``x`` with the module-level ``vectorizer``, asks ``lr_model``
    for class probabilities, and prints the probability at index 1 as
    "Positive" followed by the probability at index 0 as "Negative".
    """
    features = vectorizer.transform([x])
    class_probs = lr_model.predict_proba(features)[0]
    negative, positive = class_probs[0], class_probs[1]
    print(f"Positive: {positive:0.3f}")
    print(f"Negative: {negative:0.3f}")

**Test examples**

In [None]:
# Example: complaint about a boring movie, scored with the logistic-regression model.
text = "I can't believe I wasted ten dollars on such a boring movie"
print_lr_sentiment_score(text)

In [None]:
# Example: movie remark that praises only the popcorn, scored with the logistic-regression model.
text = "The only thing I enjoyed about this movie was the popcorn"
print_lr_sentiment_score(text)

In [None]:
# Example: customer-service complaint, scored with the logistic-regression model.
text = "Your service is horrible; could you just transfer me to someone more helpful?"
print_lr_sentiment_score(text)

In [None]:
# Example: request to speak to a supervisor, scored with the logistic-regression model.
text = "Could I speak to a supervisor?  I'm going in circles with you."
print_lr_sentiment_score(text)

In [None]:
# Example: statement mentioning legal action, scored with the logistic-regression model.
text = "If this doesn't get addressed I will have to take legal action"
print_lr_sentiment_score(text)

In [None]:
# Example: remark about an explanation of benefits, scored with the logistic-regression model.
text = "I read my explanation of benefits and it doesn't say anything about pre-approval"
print_lr_sentiment_score(text)

In [None]:
# Example: remark about a rising copayment, scored with the logistic-regression model.
text = "My copayment gets higher every year"
print_lr_sentiment_score(text)

**Sample words associated with negative sentiment**

In [None]:
# Show the dimensions of the fitted model's coefficient array as the cell's output.
lr_model.coef_.shape

In [None]:
import numpy as np

In [None]:
# Invert the vectorizer's term -> column mapping so columns can be named.
vocab = {column: term for term, column in vectorizer.vocabulary_.items()}

# Rank columns by coefficient, ascending, and keep at most the first 5000.
class_weights = lr_model.coef_[0]
top_features = np.argsort(class_weights)[:5000]

# Coefficients and terms aligned with that ranking.
coefs = class_weights[top_features]
words = [vocab[column] for column in top_features]

*Most strongly negative terms*

In [None]:
# Print the first ten ranked terms next to their coefficients.
for i in range(10):
    term, weight = words[i], coefs[i]
    print(f"{term:10s}: {weight:0.3f}")

*Random set of words correlated with negative sentiment*

In [None]:
# Pick up to 20 distinct positions in the ranked list. Sampling without
# replacement keeps the same word from being printed more than once, and
# capping the size avoids an error when fewer than 20 terms are available.
sample_size = min(20, len(coefs))
for i in sorted(np.random.choice(len(coefs), size=sample_size, replace=False)):
    print(f"{i:4d} -- {words[i]:10s}: {coefs[i]:0.3f}")