In [7]:
import pandas as pd
import numpy as np
from joblib import load, dump
from copy import deepcopy
from statistics import mean
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

In [2]:
df = pd.read_csv('../output/Features.csv')

In [3]:
df

Unnamed: 0,product,answer_option,label,review_len,Rn,Rp,Rs,Rc,Rd,Rsc
0,Accucheck,Fast and accurate delivery,0,4,0.233709,0.300000,0.616667,0.005195,1.0,0.0000
1,Accucheck,Expected a longer expire date. Your Product Li...,0,14,0.673415,-0.100000,0.400000,0.016883,1.0,0.0000
2,Accucheck,I liked the prompt service,0,5,0.319747,0.600000,0.800000,0.006494,1.0,0.4215
3,Accucheck,Good product,0,2,0.546925,0.700000,0.600000,0.002597,0.0,0.4404
4,Accucheck,I not needed,0,3,0.000000,0.000000,0.000000,0.003896,1.0,0.0000
...,...,...,...,...,...,...,...,...,...,...
1650,shampoo,Its not much effective as it has been stated i...,0,12,0.500000,-0.300000,0.800000,0.028640,0.0,-0.3724
1651,shampoo,Liked it very nicely working now my scalp is a...,1,11,0.166375,0.690000,0.900000,0.026253,0.0,0.5709
1652,shampoo,its my regular choice,0,4,0.500000,0.000000,0.076923,0.009547,0.0,0.0000
1653,shampoo,Good but not very effective,0,5,0.000000,0.234615,0.607692,0.011933,0.0,-0.4032


### Ranking is a canonical problem for humans. It is easy to classify whether a review is useful (informative) or not. However, ranking reviews on the basis of usefulness, is a complex task. Our ranking methodology is based on this simple education.

#### Pairwise ranking approach is applied to rank reviews in the semi-supervised learning method. The pairwise ranking approach looks at a pair of documents at a time in a loss function and predicts a relative ordering. The objective is not to determine the relevance score but to find which document is more relevant than others. This relevance is developed to judge the preference of one review over another.
#### In the semi-supervised learning method, mapping is constructed between input and output. This input-output pair in the training model is used to learn the system.
#### Review Segregation: We segregated two sets of reviews on which we train our model.
+ Set 0 represents reviews with label 0, i.e., ones that are not informative. These include reviews based on delivery, customer support, packaging, etc. These reviews do not describe the product.
+ Set 1 represents reviews with label 1, i.e., reviews that are informative and are better than all reviews of Set 0;
#### How we segregated and determined labels for reviews:
### `Our entire review ranking system is based on the idea that it is easier for humans to binary classify reviews which we call Set 0 and Set 1.`

For each product 'Accucheck', 'Becadexamin', 'Evion', 'Neurobion','SevenseascodLiverOil', 'Shelcal', 'Supradyn','shampoo', we asked 10 different people to label reviews as a 1 (informative review) and 0 ( not informative review). Different participants were asked to label so that there is no bias and the model learns to its best.

In [4]:
data_split = pd.crosstab(df['product'],df['label'])
data_split

label,0,1
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Accucheck,310,85
Becadexamin,53,27
Evion,89,33
Neurobion,280,136
SevenseascodLiverOil,59,22
Shelcal,259,124
Supradyn,50,23
shampoo,56,49


## Building the training set:
#### We pairwise compared each review of set1 with all reviews of set0 and vice-versa
+ (Rx, Ry,1) where x∈Set1 and y∈Set0 → Rx is better than Ry
+ (Ry, Rx, 0) where x∈Set1 and y∈Set0 → Ry is worst than Rx
<br>

#### This now becomes a classification problem.

<hr>

![PairwiseRanking](Photos/PairwiseRanking.png)

In [5]:
def building_training_data(df):
    A = df[df['label']==1]
    A.loc[df['label']==1,'join'] = 'j'
    B = df[df['label']==0]
    B.loc[df['label']==0,'join'] = 'j'
    trainset1 = pd.merge(A,B,how='outer',on='join')
    trainset2 = pd.merge(B,A,how='outer',on ='join')

    trainset = pd.merge(trainset1,trainset2,how='outer')
    return trainset

In [11]:
product_list = df['product'].unique()
data_stack = []
for product in product_list:
    temp = deepcopy(df[df['product']==product].iloc[:,2:])
    build_data = building_training_data(temp)
    print(product, len(temp), len(build_data))
    build_data.drop(columns = ['join','label_y'],inplace=True)
    data = build_data.iloc[:,1:]
    data['target'] = build_data.iloc[:,0]
    data_stack.append(data)

Accucheck 395 52700
Becadexamin 80 2862
Evion 122 5874
Neurobion 416 76160
SevenseascodLiverOil 81 2596
Shelcal 383 64232
Supradyn 73 2300
shampoo 105 5488


In [13]:
train = pd.concat(data_stack).reset_index(drop = True)

In [14]:
train

Unnamed: 0,review_len_x,Rn_x,Rp_x,Rs_x,Rc_x,Rd_x,Rsc_x,review_len_y,Rn_y,Rp_y,Rs_y,Rc_y,Rd_y,Rsc_y,target
0,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
1,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
2,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
3,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
4,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,3,0.000000,0.000000,0.000000,0.003896,0.0,0.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212207,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,20,0.488306,0.500000,0.500000,0.040573,1.0,0.4033,1
212208,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,24,0.370521,0.466667,0.650000,0.047733,1.0,0.2819,1
212209,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,34,0.589667,0.147619,0.460714,0.071599,1.0,0.2187,1
212210,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,37,0.681307,-0.105556,0.388889,0.083532,1.0,0.1154,1


In [23]:
X = train.iloc[:,:-1].values
y = train.iloc[:,-1].values

from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,shuffle = True, stratify = y) 
print("Train Len:",len(X_train)," ",len(X_test))
print("Test Len:",len(X_test)," ",len(y_test))

Train Len: 169769   42443
Test Len: 42443   42443


In [20]:
train

Unnamed: 0,review_len_x,Rn_x,Rp_x,Rs_x,Rc_x,Rd_x,Rsc_x,review_len_y,Rn_y,Rp_y,Rs_y,Rc_y,Rd_y,Rsc_y,target
0,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
1,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
2,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
3,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,1,0.000000,0.700000,0.600000,0.001299,0.0,0.4404,0
4,1,0.000000,0.00000,0.00,0.001299,0.0,0.0000,3,0.000000,0.000000,0.000000,0.003896,0.0,0.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212207,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,20,0.488306,0.500000,0.500000,0.040573,1.0,0.4033,1
212208,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,24,0.370521,0.466667,0.650000,0.047733,1.0,0.2819,1
212209,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,34,0.589667,0.147619,0.460714,0.071599,1.0,0.2187,1
212210,79,0.924148,0.00875,0.51,0.136038,1.0,0.7184,37,0.681307,-0.105556,0.388889,0.083532,1.0,0.1154,1


# Spot Checking-
+ Linear Model
+ Non-Linear Model
+ Ensemble Model

<hr>

## Linear Model: Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
print("Training Accuracy\n", accuracy_score(y_train,classifier.predict(X_train)))
print("Test Accuracy\n", accuracy_score(y_test,classifier.predict(X_test)))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train,classifier.predict(X_train)))
print("Test \n", classification_report(y_test,classifier.predict(X_test)))

Training Accuracy
 0.724808416141934
Test Accuracy
 0.7257027071601914
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       0.72      0.72      0.72     84884
           1       0.72      0.72      0.72     84885

    accuracy                           0.72    169769
   macro avg       0.72      0.72      0.72    169769
weighted avg       0.72      0.72      0.72    169769

Test 
               precision    recall  f1-score   support

           0       0.73      0.73      0.73     21222
           1       0.73      0.73      0.73     21221

    accuracy                           0.73     42443
   macro avg       0.73      0.73      0.73     42443
weighted avg       0.73      0.73      0.73     42443



## Non-Linear Model: DecisionTree

In [40]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
classifier.fit(X_train,y_train)

print("Training Accuracy\n", accuracy_score(y_train,classifier.predict(X_train)))
print("Test Accuracy\n", accuracy_score(y_test,classifier.predict(X_test)))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train,classifier.predict(X_train)))
print("Test \n", classification_report(y_test,classifier.predict(X_test)))

Training Accuracy
 0.9965777026430032
Test Accuracy
 0.9813868011215041
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     84884
           1       1.00      0.99      1.00     84885

    accuracy                           1.00    169769
   macro avg       1.00      1.00      1.00    169769
weighted avg       1.00      1.00      1.00    169769

Test 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     21222
           1       0.98      0.98      0.98     21221

    accuracy                           0.98     42443
   macro avg       0.98      0.98      0.98     42443
weighted avg       0.98      0.98      0.98     42443



## Ensemble Model: RandomForest

In [41]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=50, n_jobs = -1, oob_score = True,random_state=42)
classifier.fit(X_train,y_train)

print("Training Accuracy\n", accuracy_score(y_train,classifier.predict(X_train)))
print("Test Accuracy\n", accuracy_score(y_test,classifier.predict(X_test)))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train,classifier.predict(X_train)))
print("Test \n", classification_report(y_test,classifier.predict(X_test)))

print("Test\nConfusion Matrix: \n", confusion_matrix(y_test, classifier.predict(X_test)))

Training Accuracy
 0.9965718122861064
Test Accuracy
 0.9866880286501897
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     84884
           1       1.00      1.00      1.00     84885

    accuracy                           1.00    169769
   macro avg       1.00      1.00      1.00    169769
weighted avg       1.00      1.00      1.00    169769

Test 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     21222
           1       0.99      0.99      0.99     21221

    accuracy                           0.99     42443
   macro avg       0.99      0.99      0.99     42443
weighted avg       0.99      0.99      0.99     42443

Test
Confusion Matrix: 
 [[20946   276]
 [  289 20932]]


In [42]:
## Score of the training dataset obtained using an out-of-bag estimate. This attribute exists only when oob_score is True.
classifier.oob_score_

0.9861576612926978