In [1]:
class Sentiment :
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"
    POSITIVE  = "POSITIVE"

In [2]:
import random
class Feedback:
    def __init__(self,comment,rating):
        self.comment = comment
        self.rating = rating
        self.sentiment = self.get_sentiment()
    
    def get_sentiment(self):
        if self.rating <= 2 :
            return Sentiment.NEGATIVE
        elif self.rating == 3 :
            return Sentiment.NEUTRAL
        else :
            return Sentiment.POSITIVE
#     Container for Data Set
class Feedback_Container :
    def __init__(self,feedbacks):
        self.feedbacks = feedbacks
        
    def evenly_distribute(self):
        negative = list(filter(lambda x : x.sentiment == 'NEGATIVE',self.feedbacks))
        positive = list(filter(lambda x : x.sentiment == 'POSITIVE',self.feedbacks))
        
        positive_shrunken = positive[:len(negative)]
        self.feedbacks = negative + positive_shrunken
        random.shuffle(self.feedbacks)
        return self.feedbacks 
    


# Importing Important data from Training DataSet

In [3]:
import numpy as np
import json
file_name = 'Phones.json'
feedbacks = []
i=1
with open(file_name) as f:
    for line in f:
        feedback = json.loads(line)
        feedbacks.append(Feedback(feedback['reviewText'],feedback['overall']))
        if(i==20000):
            break;
        i+=1
print(feedbacks[5].rating)
print(feedbacks[5].sentiment)
print(feedbacks[5].comment)
print(len(feedbacks))

3.0
NEUTRAL
These make using the home button easy. My daughter and I both like them.  I would purchase them again. Well worth the price.
20000


# Creating Training DataSet 

In [4]:
len(feedbacks)
from sklearn.model_selection import train_test_split

training , test = train_test_split(feedbacks, train_size = 0.5, random_state = 40)

c  = Feedback_Container(training)
training = c.evenly_distribute()

d = Feedback_Container(test)
test = d.evenly_distribute()


# print(training[0].comment)
# print(training[0].sentiment)
# print(len(training))

Vectorizing Training and Test DataSets 

In [11]:
# x -> comment, y-> sentiments
train_x = [x.comment for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.comment for x in test]
test_y = [x.sentiment for x in test]
# print(test_y)
print(len(test_y))

# test_x[10]
# print((test_x[test_y=='NEGATIVE'],test_y[test_y=='NEGATIVE']))

3342


### Bag Of Words Vectorization

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Classification

### Linear SVM (Support Vector Machine)

In [13]:
from sklearn import svm
svm_clf = svm.SVC(kernel = 'linear')
svm_clf.fit(train_x_vectors,train_y)
print(test_x[0])

# print(test_x[0])

Purchased from Eforcity, US seller that I've gotten a few items from before and quality was good for the price.  Not this case.  Item is the proper fit, and fits bumper cases or special cases that would only accept an Apple headphone jack (ie thin).But, at first, I was getting audio from just one channel - Jiggling a bit the cable at the base of the male end, 2nd channel started working but was getting static noise by just moving the cable.Audio would then come and go, by briefly moving the cable.Filed claim for damaged item.  Seems to me a $1.70 down the drain.


In [30]:
inp = [""]
inp_vector = vectorizer.transform(inp)
print("Predicted Value =", svm_clf.predict(inp_vector))

Predicted Value = ['POSITIVE']


#### Predicting

In [15]:
svm_pred = svm_clf.predict(test_x_vectors)
print(svm_pred)
print(test_y[0])

['NEGATIVE' 'NEGATIVE' 'NEGATIVE' ... 'NEGATIVE' 'NEGATIVE' 'POSITIVE']
NEGATIVE


### Decision Tree

#### Training

In [16]:
from sklearn.tree import DecisionTreeClassifier
D_clf = DecisionTreeClassifier(random_state=0)
D_clf.fit(train_x_vectors,train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

#### Predicting

In [17]:
D_pred = D_clf.predict(test_x_vectors)
print(D_pred)
print(test_y[1])

['NEGATIVE' 'POSITIVE' 'NEGATIVE' ... 'NEGATIVE' 'POSITIVE' 'POSITIVE']
NEGATIVE


### Logistic Regression

In [18]:
####

In [19]:
# from sklearn.linear_model import LinearRegression
# LR_clf = LinearRegression()
# LR_clf.fit(train_x_vectors,train_y)

In [20]:
####

In [21]:
# print(LR_clf.predict(test_x_vectors[1]))
# print(test_y[1])

### Gausian Naive Bayes

In [22]:
#### Training

In [23]:
from sklearn.naive_bayes import GaussianNB
GNB_clf = GaussianNB()
GNB_clf.fit(train_x_vectors.toarray(),train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [24]:
#### Predicting
GNB_pred = D_clf.predict(test_x_vectors.toarray())
print(GNB_pred)
print(test_y[1])

['NEGATIVE' 'POSITIVE' 'NEGATIVE' ... 'NEGATIVE' 'POSITIVE' 'POSITIVE']
NEGATIVE


## Evaluation

In [25]:
print(svm_clf.score(test_x_vectors,test_y))
print(GNB_clf.score(test_x_vectors.toarray(),test_y))
print(D_clf.score(test_x_vectors,test_y))

0.8512866546977857
0.5981448234590065
0.6741472172351886


In [26]:
from sklearn.metrics import f1_score
print(f1_score(test_y,svm_pred,average = None,labels = [Sentiment.NEGATIVE, Sentiment.POSITIVE]))
print(f1_score(test_y,GNB_pred,average = None,labels = [Sentiment.NEGATIVE, Sentiment.POSITIVE]))
print(f1_score(test_y,D_pred,average = None,labels = [Sentiment.NEGATIVE, Sentiment.POSITIVE]))

[0.85186289 0.85070592]
[0.66909754 0.67904509]
[0.66909754 0.67904509]


## TASK ACCOMPLISHED

### SVM is best for this Problem

### Saving Model

In [27]:
import pickle
with open("model/Electronic_Feedback.pkl",'wb') as f:
        pickle.dump(svm_clf,f)
with open("model/Test_Cases.pkl",'wb') as f:
        pickle.dump([test_x_vectors,test_y],f)

### Load Model

In [28]:
with open("model/Electronic_Feedback.pkl",'rb') as f:
        load_clf = pickle.load(f)

In [29]:
load_clf.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')