# Predict app rating with sentiment analysis

In [1]:
from transformers import pipeline
import torch
import torch.nn.functional as F 
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
# Build the Hugging Face sentiment pipeline once; every later cell reuses `classifier`.
# Later cells parse the leading integer of the returned label as the star rating.
classifier = pipeline(
    task="sentiment-analysis",
    model='nlptown/bert-base-multilingual-uncased-sentiment',
)

## Test the sentiment model on a review dataset

In [3]:
# Read the labelled review dataset from the local ./data directory and preview its first rows.
data = pd.read_csv('./data/reviews.csv')
data.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId
0,gp:AOqpTOEhZuqSqqWnaKRgv-9ABYdajFUB0WugPGh-SG-...,Eric Tie,https://play-lh.googleusercontent.com/a-/AOh14...,I cannot open the app anymore,1,0,5.4.0.6,2020-10-27 21:24:41,,,newest,com.anydo
1,gp:AOqpTOH0WP4IQKBZ2LrdNmFy_YmpPCVrV3diEU9KGm3...,john alpha,https://play-lh.googleusercontent.com/a-/AOh14...,I have been begging for a refund from this app...,1,0,,2020-10-27 14:03:28,"Please note that from checking our records, yo...",2020-10-27 15:05:52,newest,com.anydo
2,gp:AOqpTOEMCkJB8Iq1p-r9dPwnSYadA5BkPWTf32Z1azu...,Sudhakar .S,https://play-lh.googleusercontent.com/a-/AOh14...,Very costly for the premium version (approx In...,1,0,,2020-10-27 08:18:40,,,newest,com.anydo
3,gp:AOqpTOGFrUWuKGycpje8kszj3uwHN6tU_fd4gLVFy9z...,SKGflorida@bellsouth.net DAVID S,https://play-lh.googleusercontent.com/-75aK0WF...,"Used to keep me organized, but all the 2020 UP...",1,0,,2020-10-26 13:28:07,What do you find troublesome about the update?...,2020-10-26 14:58:29,newest,com.anydo
4,gp:AOqpTOHls7DW8wmDFzTkHwxuqFkdNQtKHmO6Pt9jhZE...,Louann Stoker,https://play-lh.googleusercontent.com/-pBcY_Z-...,Dan Birthday Oct 28,1,0,5.6.0.7,2020-10-26 06:10:50,,,newest,com.anydo


In [4]:
# (rows, columns) of the review table as loaded.
data.shape

(12495, 12)

In [5]:
# List the available column labels before deciding which ones to keep.
data.columns

Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'sortOrder', 'appId'],
      dtype='object')

In [6]:
# Metadata columns that are not needed for sentiment scoring; after the drop only
# the review text (`content`), the star rating (`score`) and `appId` remain.
columns_to_drop = ['reviewId', 'userName', 'userImage',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'sortOrder']

# Rebind instead of using `inplace=True`, and ignore labels that are already gone,
# so re-running this cell on the trimmed frame does not raise a KeyError.
data = data.drop(columns=columns_to_drop, errors='ignore')
data.head()

Unnamed: 0,content,score,appId
0,I cannot open the app anymore,1,com.anydo
1,I have been begging for a refund from this app...,1,com.anydo
2,Very costly for the premium version (approx In...,1,com.anydo
3,"Used to keep me organized, but all the 2020 UP...",1,com.anydo
4,Dan Birthday Oct 28,1,com.anydo


In [7]:
# Score every review with the sentiment model and store the predicted star count.
# - The text is still cut to 512 characters as before; `truncation=True` additionally
#   lets the tokenizer clip the input to the model's maximum length, because a
#   character limit alone does not bound the number of tokens.
# - The leading token of the returned label is parsed as the integer rating.
# - `Series.map` runs over the text column only instead of building a Series per row.
data['sentiment_score'] = data['content'].map(
    lambda text: int(classifier(text[:512], truncation=True)[0]['label'].split()[0])
)

### Top 1 Accuracy

In [8]:
# Exact-match accuracy: share of reviews whose predicted star count equals the user's rating.
accuracy_score(y_true=data['score'], y_pred=data['sentiment_score'])

0.5224489795918368

### Top 3 Accuracy
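A prediction counts as a success when it is within one star of the true rating: sentiment_score = score - 1 OR score OR score + 1, i.e. |sentiment_score - score| <= 1. Note that this is a ±1-star tolerance ("off-by-one" accuracy), not a top-3 ranking over the model's class probabilities.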

In [9]:
# A prediction is a success when it is at most one star away from the true rating.
# Vectorised column arithmetic replaces the row-by-row `apply(axis=1)`; the
# resulting boolean column is the same.
data['success'] = (data['score'] - data['sentiment_score']).abs() <= 1

In [10]:
# Summary statistics of the success flag column.
data['success'].describe()

count     12495
unique        2
top        True
freq      11081
Name: success, dtype: object

In [11]:
# Number of rows per distinct value of the success flag.
data['success'].value_counts()

True     11081
False     1414
Name: success, dtype: int64

In [12]:
# Share of predictions within one star of the true rating.
# The mean of a boolean column is the fraction of True values. This avoids
# `value_counts()[1]`, whose integer key is resolved by label or by position
# depending on the pandas version and can therefore pick up the False count.
data['success'].mean()

0.8868347338935574

## Predict app rating
Predict the [Count Masters Game](https://play.google.com/store/apps/details?id=freeplay.crowdrun.com&gl=US) rating from the text of its reviews.

In [13]:
# Load ./data/count_masters_reviews.csv and label its two columns `text` and `score`,
# then display the frame.
data_app = pd.read_csv('./data/count_masters_reviews.csv')
data_app.columns = ['text', 'score']
data_app

Unnamed: 0,text,score
0,I love this game it's the best game ever,5
1,NNnnnnnicef,5
2,Idk why,5
3,dj,5
4,Naais,5
...,...,...
23614,"Very baaaaad, too many ad, made me nervous",1
23615,I understand the need for ads but when your ad...,1
23616,"Believe the reviews. Game itself is decent, bu...",1
23617,"Exactly as others say. 10 secs of gameplay, 30...",1


In [14]:
# Score every app review with the sentiment model and store the predicted star count.
# - The text is still cut to 512 characters as before; `truncation=True` additionally
#   lets the tokenizer clip the input to the model's maximum length, because a
#   character limit alone does not bound the number of tokens.
# - The leading token of the returned label is parsed as the integer rating.
# - `Series.map` runs over the text column only instead of building a Series per row.
data_app['sentiment_score'] = data_app['text'].map(
    lambda text: int(classifier(text[:512], truncation=True)[0]['label'].split()[0])
)

In [15]:
# Show the predicted `sentiment_score` next to the user-given `score`.
data_app

Unnamed: 0,text,score,sentiment_score
0,I love this game it's the best game ever,5,5
1,NNnnnnnicef,5,1
2,Idk why,5,1
3,dj,5,5
4,Naais,5,3
...,...,...,...
23614,"Very baaaaad, too many ad, made me nervous",1,2
23615,I understand the need for ads but when your ad...,1,3
23616,"Believe the reviews. Game itself is decent, bu...",1,2
23617,"Exactly as others say. 10 secs of gameplay, 30...",1,1


### Top 1 Accuracy

In [16]:
# Exact-match accuracy on the app reviews: predicted star count equals the user's rating.
accuracy_score(y_true=data_app['score'], y_pred=data_app['sentiment_score'])

0.4668275540878107

### Top 3 Accuracy

In [17]:
# A prediction is a success when it is at most one star away from the true rating.
# Vectorised column arithmetic replaces the row-by-row `apply(axis=1)`; the
# resulting boolean column is the same.
data_app['success'] = (data_app['score'] - data_app['sentiment_score']).abs() <= 1

In [18]:
# Summary statistics of the success flag column for the app reviews.
data_app['success'].describe()

count     23619
unique        2
top        True
freq      18215
Name: success, dtype: object

In [19]:
# Number of rows per distinct value of the success flag for the app reviews.
data_app['success'].value_counts()

True     18215
False     5404
Name: success, dtype: int64

In [20]:
# Share of app-review predictions within one star of the true rating.
# The mean of a boolean column is the fraction of True values. This avoids
# `value_counts()[1]`, whose integer key is resolved by label or by position
# depending on the pandas version and can therefore pick up the False count.
data_app['success'].mean()

0.7712011516152251

### Score mean VS Sentiment Score mean

In [21]:
# Mean of the user-given star ratings.
# NOTE(review): this reads `data` (the first review dataset) although the cell sits in the
# "Predict app rating" section — confirm whether `data_app` was intended.
data['score'].mean()

3.0941976790716286

In [22]:
# Mean of the model-predicted star ratings.
# NOTE(review): this reads `data` (the first review dataset) although the cell sits in the
# "Predict app rating" section — confirm whether `data_app` was intended.
data['sentiment_score'].mean()

3.1575830332132853