<a href="https://colab.research.google.com/github/lupis30puc/BERT_interpretation_with_RF/blob/main/feature_contributions_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string
import math
import random

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier

import time
import pickle
#from treeinterpreter import treeinterpreter as ti
import joblib
#from transformers import BertTokenizer
import torch

In [None]:
# function so that the graphs look good 
def set_plotting_style():
  """Configure the shared look of the notebook's figures.

  Assembles one rc dictionary (line, axes, tick, font, grid, mathtext and
  legend entries), sets the LaTeX preamble and the mathtext fonts through
  ``plt.rc`` and finally passes the dictionary to seaborn's 'ticks' style
  and 'notebook' context. Takes no arguments and returns nothing.
  """
  tick_width = 1.5

  rc = {}
  # lines and axes text
  rc.update({'lines.linewidth': 2,
             'axes.labelsize': 18,
             'axes.titlesize': 21})
  # ticks
  # NOTE(review): 'xtick.major' / 'ytick.major' are kept exactly as written;
  # they look like shortened '*.major.size' keys - confirm the intent.
  rc.update({'xtick.major' : 16,
             'ytick.major' : 16,
             'xtick.major.width': tick_width,
             'xtick.minor.width': tick_width,
             'ytick.major.width': tick_width,
             'ytick.minor.width': tick_width,
             'xtick.labelsize': 'large',
             'ytick.labelsize': 'large',
             "xtick.direction": "in",
             "ytick.direction": "in"})
  # fonts
  # NOTE(review): 'weight' is kept as written; presumably 'font.weight' was
  # meant - confirm.
  rc.update({'font.family': 'sans',
             'weight':'bold',
             'mathtext.fontset': 'stixsans',
             'mathtext.sf': 'fantasy'})
  # grid and legend
  rc.update({'grid.linestyle': ':',
             'grid.linewidth': 1.5,
             'grid.color': '#ffffff',
             'legend.frameon': True,
             'legend.fontsize': 12})

  plt.rc('text.latex', preamble=r'\usepackage{sfmath}')
  plt.rc('mathtext', fontset='stixsans', sf='sans')
  sns.set_style('ticks', rc=rc)
  sns.set_context('notebook', rc=rc)


## Initializing the RF model and the x and y values

In [None]:
y_train = torch.load('/content/drive/MyDrive/Yelp/model_128_/pred_labels_train')
y_test = torch.load('/content/drive/MyDrive/Yelp/model_128_/pred_labels_test')

In [None]:
train_df = pd.read_pickle('/content/drive/MyDrive/Yelp/sample_train_10394.pkl')
test_df = pd.read_pickle('/content/drive/MyDrive/Yelp/sample_test_2599.pkl')

train_df.head(2)

Unnamed: 0,text,label,clean_text,category,length
11349,"Second best pool in Las Vegas! Pay the $20, c...",1,second best pool las vegas pay commit day drin...,Hotel Industry,33
12582,We went there on a Tuesday night in Jan. 2010....,0,went tuesday night jan weather nice course dec...,Restaurant Industry,56


In [None]:
#unify the y values
train_pred = dict(zip(list(train_df.index), y_train))
test_pred = dict(zip(list(test_df.index), y_test))
y_all = train_pred.copy()
y_all.update(test_pred)
len(y_all.keys())

12993

In [None]:
y_all = {key: val for key, val in sorted(y_all.items(), key = lambda ele: ele[0])} 
list(y_all.keys())[:5]

[0, 1, 2, 3, 4]

## Loading all reviews, contributions, feature importances and ids

In [None]:
ft_imp = pd.read_pickle('/content/drive/MyDrive/Yelp/random_forest/importance_df.pkl')

In [None]:
# Load the per-review ids (later matched against ft_imp['ids']).
# The context manager closes the file handle as soon as the pickle is read.
# NOTE(review): pickle executes arbitrary code on load - only open files
# produced by this project.
with open('/content/drive/MyDrive/Yelp/random_forest/all_ids.pkl', 'rb') as ids_file:
  all_ids = pickle.load(ids_file)
len(all_ids)

12993

In [None]:
# Load the per-review feature contributions.
# The context manager closes the file handle as soon as the pickle is read.
# NOTE(review): pickle executes arbitrary code on load - only open files
# produced by this project.
with open('/content/drive/MyDrive/Yelp/random_forest/all_contribs.pkl', 'rb') as contribs_file:
  all_contribs = pickle.load(contribs_file)
len(all_contribs)

12993

In [None]:
all_reviews = pd.read_pickle('/content/drive/MyDrive/Yelp/sample_categories.pkl')
all_reviews.head(1)

Unnamed: 0,text,label,categories,clean_text,category,length
0,This actually used to be one of my favorite ho...,0,"Casinos, Nightlife, Restaurants, Hotels & Trav...",actually favorite hotel casinos day things cha...,Hotel Industry,265


## Loading subsamples for analysis and getting random reviews from them

In [None]:
restaurant = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/restaurant_5perc.pkl')
beauty = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/beauty_5perc.pkl')
other = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/other_5perc.pkl')
hotel = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/hotel_5perc.pkl')
service = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/service_5perc.pkl')
q_100 = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/quantile_100_5perc.pkl')
q_75 = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/quantile_75_5perc.pkl')
q_50 = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/quantile_50_5perc.pkl')
q_25 = pd.read_pickle('/content/drive/MyDrive/Yelp/features_analysis/quantile_25_5perc.pkl')

In [None]:
print(len(restaurant), len(beauty), len(other), len(hotel))
print(len(q_100), len(q_75), len(q_50), len(q_25))

459 35 72 36
168 153 156 172


In [None]:
q_100.category.value_counts()

Restaurant Industry    111
Other Industries        23
Hotel Industry          13
Service Industry        11
Beauty Industry         10
Name: category, dtype: int64

In [None]:
q_75.category.value_counts()

Restaurant Industry    105
Other Industries        20
Service Industry        11
Hotel Industry          10
Beauty Industry          7
Name: category, dtype: int64

In [None]:
q_50.category.value_counts()

Restaurant Industry    116
Service Industry        12
Other Industries        11
Beauty Industry          9
Hotel Industry           8
Name: category, dtype: int64

In [None]:
q_25.category.value_counts()

Restaurant Industry    127
Other Industries        18
Service Industry        13
Beauty Industry          9
Hotel Industry           5
Name: category, dtype: int64

Since I want to analyze the reviews by length and by category, and every category has at least 5 reviews in each of the length groups, I'll take 5 reviews per length segment (one per category).

In [None]:
cats[4]

'Beauty Industry'

In [None]:
# obtaining random reviews to analyze
# Seed the global generator so that a fresh "Restart & Run All" draws the same
# reviews every time (this also fixes the later random.choice calls).
random.seed(42)

cats = ['Restaurant Industry',
 'Other Industries',
 'Beauty Industry',
 'Hotel Industry', 
 'Service Industry']

# one subsample per length quantile
samples = [q_100, q_75, q_50, q_25]

rand_rev = []

# one random review per (length segment, category) pair
for n, sample in enumerate(samples):
  for cat in cats:
    random_sample = random.choice(sample[sample.category == cat].index) #obtaining a random review
    rand_rev.append(random_sample)
    print('Ready: ' + str(n) + ' ' + cat)

Ready: 0 Restaurant Industry
Ready: 0 Other Industries
Ready: 0 Beauty Industry
Ready: 0 Hotel Industry
Ready: 0 Service Industry
Ready: 1 Restaurant Industry
Ready: 1 Other Industries
Ready: 1 Beauty Industry
Ready: 1 Hotel Industry
Ready: 1 Service Industry
Ready: 2 Restaurant Industry
Ready: 2 Other Industries
Ready: 2 Beauty Industry
Ready: 2 Hotel Industry
Ready: 2 Service Industry
Ready: 3 Restaurant Industry
Ready: 3 Other Industries
Ready: 3 Beauty Industry
Ready: 3 Hotel Industry
Ready: 3 Service Industry


In [None]:
special_q50 = q_50[q_50.label==1]

In [None]:
random.choice(special_q50[special_q50.category == 'Beauty Industry'].index)

7279

In [None]:
# one random review per category, drawn from special_q50
special = []
for cat in cats:
    candidates = special_q50[special_q50.category == cat]
    random_sample = random.choice(candidates.index)
    special.append(random_sample)
    print('Ready: ' + ' ' + cat)

Ready:  Restaurant Industry
Ready:  Other Industries
Ready:  Beauty Industry
Ready:  Hotel Industry
Ready:  Service Industry


In [None]:
special_q50['label'][special[1]]

1.0

In [None]:
rand_rev2 = rand_rev.copy()
#rand_rev1 = rand_rev.copy()

In [None]:
special_q75 = q_75[q_75.label==1]
special_random = random.choice(special_q75[special_q75.category == 'Restaurant Industry'].index) #obtaining a random review
special_random

8749

## Analyzing with contributions

In [None]:
#this function is based on the one appearing on https://coderzcolumn.com/tutorials/machine-learning/treeinterpreter-interpreting-tree-based-models-prediction-of-individual-sample
def create_contrbutions_df(contributions, review, features_df, input_ids):
  """Build the per-word contribution table of a single review.

  Args:
    contributions: indexable by review id; ``contributions[review][i]`` must
      yield the (negative, positive) contribution pair of feature row ``i``.
    review: id of the observed review.
    features_df: DataFrame with an ``ids`` and a ``words`` column; its index
      labels are the positions used to index ``contributions[review]``.
    input_ids: indexable by review id; ``input_ids[review]`` holds the ids
      that occur in that review.

  Returns:
    DataFrame indexed by word with the columns "negative" and "positive",
    followed by a final "PREDICTION" row whose two cells both hold the
    position of the larger column total (0 = negative, 1 = positive).
  """
  # keep only the feature rows whose id occurs in the observed review
  in_review = features_df['ids'].isin(input_ids[review])
  review_words = features_df.loc[in_review, 'words']

  # word -> [negative, positive]; a word listed more than once keeps its last row
  review_contribs = contributions[review]
  contribs = {word: list(review_contribs[i]) for i, word in review_words.items()}

  # concrete lists instead of dict views keep the constructor call unambiguous
  contrib_df = pd.DataFrame(data=list(contribs.values()),
                            index=list(contribs.keys()),
                            columns=["negative", "positive"])

  # Take the argmax on the raw values: np.argmax on a Series has meant "label"
  # or "position" depending on the pandas version, the array form is always
  # positional.
  totals = contrib_df[["negative", "positive"]].sum()
  contrib_df.loc["PREDICTION"] = int(totals.to_numpy().argmax())
  return contrib_df

First review (id 8035)

In [None]:
r8035_df = create_contrbutions_df(all_contribs, 8035, ft_imp, all_ids)
print("Selected Sample     : %d"%8035)
print("Original Value : %s"%all_reviews.label[8035])
print("BERT predicted Value : %s"%y_all[8035]) #actual target value since RF mimics BERT results...
print("RF predicted Value     : %s"%r8035_df.positive['PREDICTION'])
  #print("Predicted Value     : %s"%np.argmax(preds_400[random_sample]))

Selected Sample     : 8035
Original Value : 0
BERT predicted Value : 0
RF predicted Value     : 0.0


In [None]:
#relevant-top positive features
r8035_df[r8035_df['positive'] > 0.01].sort_values(by=['positive'], ascending=False)

Unnamed: 0,negative,positive
sweet,-0.015289,0.015289
try,-0.011467,0.011467


In [None]:
#relevant/top negative
r8035_df[r8035_df['negative'] > 0.01].sort_values(by=['negative'], ascending=False)

Unnamed: 0,negative,positive
said,0.078484,-0.078484
apparently,0.033612,-0.033612
order,0.026757,-0.026757
line,0.020557,-0.020557
working,0.013323,-0.013323
water,0.010053,-0.010053


In [None]:
all_reviews[all_reviews.index == 12683]

Unnamed: 0,text,label,categories,clean_text,category,length
12683,One of my favourite places to go to for Korean...,1,"Korean, Restaurants",favourite places korean food usually sunday lu...,Restaurant Industry,136


In [None]:
# Review 9520 is the second entry of the first random draw; the id is written
# out here because the list holding that draw is only defined further down.
all_reviews[all_reviews.index == 9520]

Unnamed: 0,text,label,categories,clean_text,category,length
9520,"The CRG was sold within the last few years, an...",1,"Fitness & Instruction, Climbing, Active Life, ...",crg sold years new owners making modifications...,Other Industries,69


## Function to analyze everything in a DataFrame

In [None]:
rand_rev2

[11020,
 3954,
 7737,
 5468,
 1404,
 11251,
 555,
 6274,
 4281,
 5833,
 5231,
 3825,
 5601,
 1158,
 5013,
 855,
 11952,
 9048,
 5285,
 4210]

In [None]:
#exam_rev = [5468,5757,952,7968,903,2372,11656,11459,8050,12586,639,9965,4660,10144,12196,7619]
#exam_rev2 = [10717, 312, 3261, 1979, 8018, 1165, 12669, 10933, 1841, 11055, 12468, 8090, 7745, 5285, 1400]
rand_rev1 = [12683,
 9520,
 7850,
 1135,
 10871,
 11251,
 8957,
 11900,
 2751,
 5813,
 8183,
 6003,
 2896,
 8050,
 1816,
 3207,
 6829,
 6853,
 5076,
 4580]

rand_rev2=[11020,
 3954,
 7737,
 5468,
 1404,
 11251,
 555,
 6274,
 4281,
 5833,
 5231,
 3825,
 5601,
 1158,
 5013,
 855,
 11952,
 9048,
 5285,
 4210]

In [None]:
len(rand_rev2) # 4 per category, there are 5 categories. 4 because there are 4 length segments

20

In [None]:
all_reviews['bert_pred'] = list(y_all.values())

In [None]:
# Take an explicit copy: the boolean mask returns a slice of all_reviews, and
# adding columns to a slice triggers SettingWithCopyWarning.
analyzed_rev = all_reviews[all_reviews.index.isin(rand_rev2)].copy()
# placeholder columns, filled per review by add_contributions
analyzed_rev['positive_c'] = ''
analyzed_rev['negative_c'] = ''
analyzed_rev['rf_pred'] = ''

In [None]:
analyzed_rev.columns

Index(['text', 'label', 'categories', 'clean_text', 'category', 'length',
       'bert_pred', 'positive_c', 'negative_c', 'rf_pred'],
      dtype='object')

In [None]:
def add_contributions(analysis_df, contributions, review, features_df, input_ids):
  """Store the contributions of one review in its row of ``analysis_df``.

  Args:
    analysis_df: DataFrame indexed by review id that already has the columns
      'positive_c', 'negative_c' and 'rf_pred'; it is modified in place.
    contributions, review, features_df, input_ids: passed unchanged to
      ``create_contrbutions_df``.

  Writes into the row labelled ``review``:
    positive_c: dict word -> contribution for the words whose 'positive'
      value is > 0, ordered from largest to smallest.
    negative_c: the same for the 'negative' column.
    rf_pred: the value of the table's "PREDICTION" row.

  Returns nothing.
  """
  r_c = create_contrbutions_df(contributions, review, features_df, input_ids)

  pos = r_c[r_c['positive'] > 0].sort_values(by=['positive'], ascending=False)
  pos_dict = dict(zip(pos.index, pos['positive'].to_numpy()))
  # the PREDICTION row passes the "> 0" filter when the prediction is 1
  pos_dict.pop('PREDICTION', None)

  neg = r_c[r_c['negative'] > 0].sort_values(by=['negative'], ascending=False)
  neg_dict = dict(zip(neg.index, neg['negative'].to_numpy()))
  neg_dict.pop('PREDICTION', None)

  # .at writes a single cell of analysis_df itself; the chained form
  # df[col][row] = ... may write to a temporary copy instead.
  analysis_df.at[review, 'positive_c'] = pos_dict
  analysis_df.at[review, 'negative_c'] = neg_dict
  # both cells of the PREDICTION row hold the same value; take the first by position
  analysis_df.at[review, 'rf_pred'] = r_c.loc["PREDICTION"].iloc[0]


In [None]:
for r in rand_rev2:
  add_contributions(analyzed_rev, all_contribs, r, ft_imp, all_ids)

In [None]:
analyzed_rev.head()

In [None]:
summary = analyzed_rev.copy()
summary.reset_index(inplace=True)
summary.head()

Unnamed: 0,index,text,label,categories,clean_text,category,length,bert_pred,positive_c,negative_c,rf_pred
0,555,I went to the Fragrance department to pick up ...,0,"Department Stores, Fashion, Men's Clothing, Wo...",went fragrance department pick bottle versace ...,Other Industries,54,0,"{'mall': 0.0014292788633557055, 'er': 0.001085...","{'asked': 0.051271851229870706, 'said': 0.0446...",0
1,855,My friend found a hair in his food. Definitely...,0,"Arts & Entertainment, Nightlife, Breakfast & B...",friend hair food definitely lost appetite pull...,Restaurant Industry,19,0,"{'definitely': 0.07466639315626142, 'good': 0....","{'wasn': 0.043493839409041064, 'lost': 0.02648...",0
2,1158,Some selections are pretty good. Unfortunately...,0,"Asian Fusion, Event Planning & Services, Resta...",selections pretty good unfortunately dishes we...,Hotel Industry,21,0,"{'great': 0.06918939498428471, 'good': 0.01815...","{'unfortunately': 0.08739752310228917, 'wasn':...",0
3,1404,WTF!! WTF!! WTF!!\n1st off this place was do...,0,"Laundry Services, Dry Cleaning & Laundry, Loca...",wtf wtf wtf place downright filthy walked righ...,Service Industry,75,0,"{'clean': 0.012399755825582762, 'soft': 0.0095...","{'said': 0.0466807978957219, 'dirty': 0.039079...",0
4,3825,We got very lucky as far as insurance coverage...,0,"Shopping, Health & Medical, Ophthalmologists, ...",got lucky far insurance coverage goes new glas...,Other Industries,33,0,"{'knowledge': 0.0352074956753434, 'professiona...","{'left': 0.04591727101771179, 'took': 0.034729...",0


In [None]:
summary.to_pickle('/content/drive/MyDrive/Yelp/results2.pkl')


## Rounding and summarizing results

In [None]:
summary1 = pd.read_pickle('/content/drive/MyDrive/Yelp/results1.pkl')
summary1.head(2)

Unnamed: 0,index,text,label,categories,clean_text,category,length,bert_pred,positive_c,negative_c,rf_pred,relevant_pos_c,relevant_neg_c
0,1135,Good: \n1. Elegant \n2. The rooms have a nice ...,0,"Wedding Planning, Event Planning & Services, B...",good elegant rooms nice style loved kitchen be...,Hotel Industry,86,0,"{'loved': 0.033431089996246784, 'nice': 0.0137...","{'asked': 0.05015832844026933, 'dirty': 0.0291...",0,"[(loved, 0.033431089996246784), (nice, 0.01378...","[(asked, 0.05015832844026933), (dirty, 0.02918..."
1,1816,"These guys look like they do good work, too ba...",0,"Home Services, Flooring, Contractors",guys look like good work bad return calls emai...,Service Industry,24,0,"{'good': 0.019764806897246724, '[PAD]': 0.0004...","{'poor': 0.13657416317548124, 'bad': 0.0555959...",0,"[(good, 0.019764806897246724)]","[(poor, 0.13657416317548124), (bad, 0.05559598..."


In [None]:
print('Negative contributions: ')
# show position and (word, value) pair of every entry above the 0.01 cut-off
for position, entry in enumerate(summary.negative_c[0].items()):
  if entry[1] > 0.01:
    print(position)
    print(entry)

Negative contributions: 
0
('asked', 0.051271851229870706)
1
('said', 0.04467408898365459)
2
('phone', 0.039025407637245435)
3
('minutes', 0.03611555506152559)
4
('called', 0.028856732600068533)
5
('customer', 0.019433742487794163)
6
('inc', 0.01749209060524957)
7
('got', 0.01528525316690704)
8
('lady', 0.013060704011726959)
9
('went', 0.011922317700750078)


In [None]:
exp2.r_neg_c[0]

[('asked', 0.05),
 ('dirty', 0.029),
 ('bad', 0.027),
 ('pay', 0.025),
 ('called', 0.024),
 ('finally', 0.02),
 ('maybe', 0.013),
 ('hours', 0.011)]

In [None]:
def roundingVals(d):
    """Round every value of the dictionary ``d`` to 3 decimals, in place.

    Only existing keys are reassigned, so the dictionary keeps its size and
    order. Returns nothing.
    """
    for k, v in d.items():
        d[k] = round(v, 3)

In [None]:
def _relevant_as_text(contribs, threshold, decimals):
  """Return 'word: value' strings for the entries of ``contribs`` above ``threshold``."""
  return [str(word) + ': ' + str(round(value, decimals))
          for word, value in contribs.items()
          if value > threshold]


def rounded(df, threshold=0.01, decimals=3):
  """Add the columns 'r_pos_c' and 'r_neg_c' with the relevant contributions.

  For every row of ``df`` the dictionaries in 'positive_c' and 'negative_c'
  are reduced to the entries greater than ``threshold`` and rendered as a
  list of 'word: value' strings, the value rounded to ``decimals`` places.

  Args:
    df: DataFrame whose 'positive_c' and 'negative_c' cells are dictionaries
      word -> contribution; it is modified in place. Any number of rows and
      any index is accepted.
    threshold: contributions must be strictly greater than this to be kept.
    decimals: number of decimals kept in the rendered value.

  Returns nothing.
  """
  # Iterate over the cell values themselves instead of a fixed range(20) of
  # labels, and assign each column in one step instead of cell by cell.
  df['r_pos_c'] = pd.Series(
      [_relevant_as_text(contribs, threshold, decimals) for contribs in df['positive_c']],
      index=df.index, dtype=object)
  df['r_neg_c'] = pd.Series(
      [_relevant_as_text(contribs, threshold, decimals) for contribs in df['negative_c']],
      index=df.index, dtype=object)

In [None]:
summary.positive_c[0]

dict_values([0.0014292788633557055, 0.0010853489332953675, 0.0010713508977031163, 0.0009914480867356942, 0.0005578970898222899, 0.0005537389339354949, 0.0005088235294117646, 0.00043017465331893386, 0.0003509921089443151, 0.00012165163894078986, 9.116008418833422e-05])

In [None]:
pos = summary.positive_c[1].copy()
# one "\n word: value" fragment per contribution above 0.01, joined into a single string
res = ''.join('\n ' + str(word) + ': ' + str(round(value, 3))
              for word, value in pos.items()
              if value > 0.01)
print(res)


 definitely: 0.075
 good: 0.024


In [None]:
exp2 = summary1.copy()
exp2.shape

(20, 13)

In [None]:
rounded(exp2)
exp2.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,index,text,label,categories,clean_text,category,length,bert_pred,positive_c,negative_c,rf_pred,relevant_pos_c,relevant_neg_c,r_pos_c,r_neg_c
0,1135,Good: \n1. Elegant \n2. The rooms have a nice ...,0,"Wedding Planning, Event Planning & Services, B...",good elegant rooms nice style loved kitchen be...,Hotel Industry,86,0,"{'loved': 0.033431089996246784, 'nice': 0.0137...","{'asked': 0.05015832844026933, 'dirty': 0.0291...",0,"[(loved, 0.033431089996246784), (nice, 0.01378...","[(asked, 0.05015832844026933), (dirty, 0.02918...","[loved: 0.033, nice: 0.014, clean: 0.013, good...","[asked: 0.05, dirty: 0.029, bad: 0.027, pay: 0..."
1,1816,"These guys look like they do good work, too ba...",0,"Home Services, Flooring, Contractors",guys look like good work bad return calls emai...,Service Industry,24,0,"{'good': 0.019764806897246724, '[PAD]': 0.0004...","{'poor': 0.13657416317548124, 'bad': 0.0555959...",0,"[(good, 0.019764806897246724)]","[(poor, 0.13657416317548124), (bad, 0.05559598...",[good: 0.02],"[poor: 0.137, bad: 0.056, customer: 0.034, for..."


In [None]:
results = exp2[['index', 'label', 'bert_pred', 'rf_pred', 'category', 'length', 'r_pos_c', 'r_neg_c']]
results.head()

Unnamed: 0,index,label,bert_pred,rf_pred,category,length,r_pos_c,r_neg_c
0,1135,0,0,0,Hotel Industry,86,"[loved: 0.033, nice: 0.014, clean: 0.013, good...","[asked: 0.05, dirty: 0.029, bad: 0.027, pay: 0..."
1,1816,0,0,0,Service Industry,24,[good: 0.02],"[poor: 0.137, bad: 0.056, customer: 0.034, for..."
2,2751,0,0,0,Hotel Industry,57,[definitely: 0.055],"[terrible: 0.085, told: 0.062, clearly: 0.025,..."
3,2896,0,0,0,Beauty Industry,27,[love: 0.063],"[worst: 0.115, horrible: 0.114, ignored: 0.038..."
4,3207,0,0,0,Restaurant Industry,10,[recommend: 0.049],"[bad: 0.085, crap: 0.048, gave: 0.016, expensi..."


In [None]:
results.to_csv('/content/drive/MyDrive/Yelp/results1_.csv')

In [None]:
exp3 = summary.copy()
exp3.shape

(20, 11)

In [None]:
rounded(exp3)
exp3.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,index,text,label,categories,clean_text,category,length,bert_pred,positive_c,negative_c,rf_pred,r_pos_c,r_neg_c
0,555,I went to the Fragrance department to pick up ...,0,"Department Stores, Fashion, Men's Clothing, Wo...",went fragrance department pick bottle versace ...,Other Industries,54,0,"{'mall': 0.0014292788633557055, 'er': 0.001085...","{'asked': 0.051271851229870706, 'said': 0.0446...",0,,\n asked: 0.051 \n said: 0.045 \n phone: 0.039...
1,855,My friend found a hair in his food. Definitely...,0,"Arts & Entertainment, Nightlife, Breakfast & B...",friend hair food definitely lost appetite pull...,Restaurant Industry,19,0,"{'definitely': 0.07466639315626142, 'good': 0....","{'wasn': 0.043493839409041064, 'lost': 0.02648...",0,\n definitely: 0.075 \n good: 0.024,\n wasn: 0.043 \n lost: 0.026 \n probably: 0.0...


In [None]:
results2 = exp3[['index', 'label', 'bert_pred', 'rf_pred', 'category', 'length', 'r_pos_c', 'r_neg_c']]
results2.head()

Unnamed: 0,index,label,bert_pred,rf_pred,category,length,r_pos_c,r_neg_c
0,555,0,0,0,Other Industries,54,,\n asked: 0.051 \n said: 0.045 \n phone: 0.039...
1,855,0,0,0,Restaurant Industry,19,\n definitely: 0.075 \n good: 0.024,\n wasn: 0.043 \n lost: 0.026 \n probably: 0.0...
2,1158,0,0,0,Hotel Industry,21,\n great: 0.069 \n good: 0.018,\n unfortunately: 0.087 \n wasn: 0.049 \n edib...
3,1404,0,0,0,Service Industry,75,\n clean: 0.012,\n said: 0.047 \n dirty: 0.039 \n dry: 0.028 \...
4,3825,0,0,0,Other Industries,33,\n knowledge: 0.035 \n professional: 0.022,\n left: 0.046 \n took: 0.035 \n wouldn: 0.035...


In [None]:
results2.to_csv('/content/drive/MyDrive/Yelp/results2.csv')

In [None]:
results2['index'].unique()

array([  555,   855,  1158,  1404,  3825,  3954,  4210,  4281,  5013,
        5231,  5285,  5468,  5601,  5833,  6274,  7737,  9048, 11020,
       11251, 11952])

In [None]:
results['index'].unique()

array([ 1135,  1816,  2751,  2896,  3207,  4580,  5076,  5813,  6003,
        6829,  6853,  7850,  8050,  8183,  8957,  9520, 10871, 11251,
       11900, 12683])

In [None]:
uni_ids =[  555,   855,  1158,  1404,  3825,  3954,  4210,  4281,  5013,
        5231,  5285,  5468,  5601,  5833,  6274,  7737,  9048, 11020,
       11251, 11952] + [ 1135,  1816,  2751,  2896,  3207,  4580,  5076,  5813,  6003,
        6829,  6853,  7850,  8050,  8183,  8957,  9520, 10871, 11251,
       11900, 12683]

In [None]:
np.unique(uni_ids)

array([  555,   855,  1135,  1158,  1404,  1816,  2751,  2896,  3207,
        3825,  3954,  4210,  4281,  4580,  5013,  5076,  5231,  5285,
        5468,  5601,  5813,  5833,  6003,  6274,  6829,  6853,  7737,
        7850,  8050,  8183,  8957,  9048,  9520, 10871, 11020, 11251,
       11900, 11952, 12683])

In [None]:
len(np.unique(uni_ids))

39

In [None]:
# Take an explicit copy: the boolean mask returns a slice of all_reviews, and
# adding columns to a slice triggers SettingWithCopyWarning.
special_rev = all_reviews[all_reviews.index.isin(special)].copy()
special_rev['positive_c'] = ''
special_rev['negative_c'] = ''
special_rev['rf_pred'] = ''
for r in special:
  add_contributions(special_rev, all_contribs, r, ft_imp, all_ids)
# move the review id into an 'index' column; rows become 0..n-1
special_rev = special_rev.reset_index()
rounded(special_rev)
res_special = special_rev[['index', 'label', 'bert_pred', 'rf_pred', 'category', 'length', 'r_pos_c', 'r_neg_c']]
res_special.head()

In [None]:
res_special = special_rev[['index', 'label', 'bert_pred', 'rf_pred', 'category', 'length', 'r_pos_c', 'r_neg_c']]

In [None]:
res_special

Unnamed: 0,index,label,bert_pred,rf_pred,category,length,r_pos_c,r_neg_c
0,5685,1,1,1,Restaurant Industry,27,"[helpful: 0.069, good: 0.026, reasonably: 0.02...","[company: 0.017, going: 0.016]"
1,6654,1,1,1,Hotel Industry,27,"[fantastic: 0.106, spot: 0.041, lovely: 0.018,...",[know: 0.019]
2,8455,1,1,1,Beauty Industry,28,"[definitely: 0.069, best: 0.063, perfectly: 0....",[like: 0.012]
3,8662,1,1,1,Other Industries,32,"[amazing: 0.06, awesome: 0.06, great: 0.049, d...",[minutes: 0.068]
4,8973,1,1,1,Service Industry,26,"[nice: 0.067, fresh: 0.059]",[]


In [None]:
res_special.to_csv('/content/drive/MyDrive/Yelp/results3.csv')

In [None]:
# Take an explicit copy: the boolean mask returns a slice of all_reviews, and
# adding columns to a slice triggers SettingWithCopyWarning.
rev8749 = all_reviews[all_reviews.index ==8749].copy()
rev8749['positive_c'] = ''
rev8749['negative_c'] = ''
rev8749['rf_pred'] = ''

add_contributions(rev8749, all_contribs, 8749, ft_imp, all_ids)
# move the review id into an 'index' column; the single row becomes row 0
rev8749 = rev8749.reset_index()


In [None]:
rev8749

Unnamed: 0,index,text,label,categories,clean_text,category,length,bert_pred,positive_c,negative_c,rf_pred
0,8749,A friend took me here for the first time. I wa...,1,"Nightlife, Bars, Persian/Iranian, Lounges, Foo...",friend took time persian food knew place rarel...,Restaurant Industry,60,1,"{'great': 0.06734059322579866, 'glad': 0.02766...","{'took': 0.035617427447430884, 'people': 0.011...",1


In [None]:
rounded(rev8749)

In [None]:

res_special2 = rev8749[['index', 'label', 'bert_pred', 'rf_pred', 'category', 'length', 'r_pos_c', 'r_neg_c']]
res_special2.head()

Unnamed: 0,index,label,bert_pred,rf_pred,category,length,r_pos_c,r_neg_c
0,8749,1,1,1,Restaurant Industry,60,"[great: 0.067, glad: 0.028, quick: 0.022, good...","[took: 0.036, people: 0.012, waiter: 0.012]"


In [None]:
print(rev8749['r_neg_c'][0])

['took: 0.036', 'people: 0.012', 'waiter: 0.012']


## Manual analysis of other reviews

In [None]:
def rev_analysis(rev_i):
  """Print the relevant contributions and the texts of one analyzed review.

  Reads the row labelled ``rev_i`` of the notebook-level ``analyzed_rev``
  frame and prints, in this order: the positive contributions above 0.01
  (word and value on separate lines), the negative contributions above 0.01,
  the row's label / rf_pred / length / category, the cleaned text and the
  original text. Returns nothing.
  """
  def _print_relevant(contribs):
    # word on one line, its contribution on the next
    for word, weight in contribs.items():
      if weight > 0.01:
        print(word)
        print(weight)

  print('Positive contributions: ')
  _print_relevant(analyzed_rev.positive_c[rev_i])
  print('')
  print('Negative contributions: ')
  _print_relevant(analyzed_rev.negative_c[rev_i])
  print('')
  row_mask = analyzed_rev.index == rev_i
  print(analyzed_rev[['label', 'rf_pred', 'length', 'category']][row_mask])
  print('')
  print(analyzed_rev.clean_text[rev_i])
  print('\nOriginal Text: ')
  print(analyzed_rev.text[rev_i])