In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [10]:
df = pd.read_csv(r'C:\Users\Nikkitha\Desktop\spacy_explorations\amazon_alexa.tsv', sep="\t")

In [11]:
df.tail()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1
3149,4,29-Jul-18,Black Dot,Good,1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
rating              3150 non-null int64
date                3150 non-null object
variation           3150 non-null object
verified_reviews    3150 non-null object
feedback            3150 non-null int64
dtypes: int64(2), object(3)
memory usage: 123.1+ KB


In [14]:
df.rating.value_counts()

5    2286
4     455
1     161
3     152
2      96
Name: rating, dtype: int64

In [16]:
punctuations = string.punctuation  #create list of punctuations

In [17]:
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS  #create list of stop words

In [19]:
parser = English()  #load english tokenizer, tagger, parser

In [20]:
def tokenize(sentence):
    tokens = parser(sentence)  #creating a token object, creates documents with linguistic annotations
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]  #lemmatizing each token and converting it into lower case
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations]  #removing stop words
    return tokens

In [21]:
class predictors(TransformerMixin):
    #transform, fit, get_params
    def transform(self, X,**transform_params):
        #cleaning text
        return [clean_text(text) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}
    
def clean_text(text):
    #remove spaces and convert to lower case
    return text.strip().lower()

In [23]:
bow_vector = CountVectorizer(tokenizer= tokenize, ngram_range=(1,1))  #vectorization feature engineering

In [24]:
tfidf_vector = TfidfVectorizer(tokenizer= tokenize)  #vectorization feature engineering

In [26]:
#splitting the data into training and test set
from sklearn.model_selection import train_test_split

X = df['verified_reviews']  #features (we want to analyze)
ylabels = df['feedback']  #labels (we want to test against)
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [27]:
#creating a pipeline and generating the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

#creating pipeline using bag of words
pipe = Pipeline([("cleaner", predictors()),
                ('vectorizer', bow_vector),
                ('classifier', classifier)])

#model generation
pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x0000017D835D72E8>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [28]:
#evaluate model
from sklearn import metrics
#predict on test dataset
pred = pipe.predict(X_test)

#accuracy of the model
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test,pred))
print("Logistic Regression Precision:", metrics.precision_score(y_test,pred))
print("Logistic Regression Recall:",metrics.recall_score(y_test,pred))

Logistic Regression Accuracy: 0.9354497354497354
Logistic Regression Precision: 0.9434167573449401
Logistic Regression Recall: 0.9897260273972602
