# Alexa Classification - BOW and TF-IDF

In [1]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,  TfidfTransformer

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/sharmaroshan/Amazon-Alexa-Reviews/master/amazon_alexa.tsv", sep="\t")

In [3]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [4]:
df.shape

(3150, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
rating              3150 non-null int64
date                3150 non-null object
variation           3150 non-null object
verified_reviews    3150 non-null object
feedback            3150 non-null int64
dtypes: int64(2), object(3)
memory usage: 123.1+ KB


In [6]:
df.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

## Tokenize with spaCy

In [7]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

nlp = English()

# Create our list of punctuation marks
punctuations = list(string.punctuation)

# Create our list of stopwords
stop_words = STOP_WORDS

In [21]:
doc = nlp("This is a sentence.")

#This doesn't work
print([token.text.lower() for token in doc if token not in stop_words])

#This works
print([token.text.lower() for token in doc if not token.is_stop])

['this', 'is', 'a', 'sentence', '.']
['sentence', '.']


In [9]:
doc = nlp("Hello, world! T")

[token.text for token in doc if not token.is_punct]

[token.text for token in doc if not token.is_punct and not token.is_stop]

['Hello', 'world', 'sentence']

In [22]:
tokens = []
lemma = []
pos = []

for doc in nlp.pipe(df['verified_reviews'].astype('unicode').values):
        tokens.append([token.text.lower() for token in doc if not token.is_punct and not token.is_stop])
        lemma.append([token.lemma_ for token in doc])
        pos.append([token.pos_ for token in doc])

In [23]:
tokens[:3]

[['love', 'echo'],
 ['loved'],
 ['playing',
  'game',
  'answer',
  'question',
  'correctly',
  'alexa',
  'says',
  'got',
  'wrong',
  'answers',
  ' ',
  'like',
  'able',
  'turn',
  'lights',
  'away',
  'home']]

In [24]:
# Add the cleaned/processed reviews to the dataframe
df['clean_reviews'] = tokens

df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,clean_reviews
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,"[love, echo]"
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,[loved]
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,"[playing, game, answer, question, correctly, a..."
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,"[lot, fun, thing, 4, yr, old, learns, dinosaur..."
4,5,31-Jul-18,Charcoal Fabric,Music,1,[music]


In [29]:
# If you want the tokenized list to be a string
df['str_reviews'] = df.clean_reviews.apply(' '.join)

In [30]:
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,clean_reviews,str_reviews
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,"[love, echo]",love echo
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,[loved],loved
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,"[playing, game, answer, question, correctly, a...",playing game answer question correctly alexa s...
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,"[lot, fun, thing, 4, yr, old, learns, dinosaur...",lot fun thing 4 yr old learns dinosaurs contro...
4,5,31-Jul-18,Charcoal Fabric,Music,1,[music],music


# Sci-kit Learn time

## CountVectorizer

In [31]:
y = df.feedback

print("class balance:\n", df.feedback.value_counts())

text_train, text_test, y_train, y_test = train_test_split(df['str_reviews'], y,
                                                    test_size = 0.33,
                                                    stratify = y,
                                                    random_state = 42)

class balance:
 1    2893
0     257
Name: feedback, dtype: int64


In [33]:
vect = CountVectorizer()

X_train = vect.fit_transform(text_train.values)
X_test = vect.transform(text_test.values)

In [37]:
# Sanity check on the vocabulary

feature_names = vect.get_feature_names()
print(feature_names[:10]), print(feature_names[200:220]), print(feature_names[::2000])

['00', '000', '07', '10', '100', '100x', '11', '1100sf', '12', '15']
['anymore', 'anypod', 'anytime', 'apartment', 'app', 'apparent', 'apparently', 'appealing', 'appear', 'appears', 'apple', 'appliance', 'application', 'appointments', 'appreciated', 'approaching', 'appropriate', 'approximately', 'apps', 'area']
['00', 'orange']


(None, None, None)

In [38]:
count_df = pd.DataFrame(X_train.A, columns=vect.get_feature_names())

count_df.head()

Unnamed: 0,00,000,07,10,100,100x,11,1100sf,12,15,...,yhe,you,young,younger,youtube,yr,zero,zigbee,zzzz,útil
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Classification w/ Logistic Regression
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV().fit(X_train, y_train)

lr.C_
lr.score(X_test, y_test)



0.9346153846153846

In [59]:
from sklearn.linear_model import LogisticRegression


logreg = LogisticRegression().fit(X_train, y_train)

print("Training set score: {:.3f}".format(logreg.score(tfidf_pipe, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Training set score: 0.918
Test set score: 0.932


## TF-IDF

In [40]:
tfidf = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

X_tfidf = tfidf.fit_transform(text_train)
#X_tfidf.toarray()

# First five vectors of TFIDF training data
X_tfidf.A[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
# Subset of features from TF-IDF
tfidf.get_feature_names()[1000:1010]

['exact',
 'exactly',
 'example',
 'exceeded',
 'exceeds',
 'excelente',
 'excellent',
 'excellently',
 'exception',
 'exceptionally']

In [42]:
tfidf_df = pd.DataFrame(X_tfidf.A, columns=tfidf.get_feature_names())

tfidf_df.head()

Unnamed: 0,00,000,07,10,100,100x,11,1100sf,12,15,...,yesterday,yhe,young,younger,youtube,yr,zero,zigbee,zzzz,útil
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### TF-IDF pipeline

In [53]:
#TF-IDF pipeline to transform into TFIDF

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

# Make pipeline + Fit_transform -> TFIDF
tfidf_pipe = make_pipeline(CountVectorizer(),
                          TfidfTransformer()).fit_transform(text_train) #notice, fit_transform done here

In [56]:
tfidf_pipe

<2110x3279 sparse matrix of type '<class 'numpy.float64'>'
	with 21772 stored elements in Compressed Sparse Row format>

In [57]:
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV().fit(tfidf_pipe, y_train)

lr.C_
lr.score(X_test, y_test)



0.9346153846153846

In [58]:
from sklearn.linear_model import LogisticRegression


logreg = LogisticRegression().fit(tfidf_pipe, y_train)

print("Training set score: {:.3f}".format(logreg.score(tfidf_pipe, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Training set score: 0.919
Test set score: 0.922


