<a href="https://colab.research.google.com/github/lyqht/googleplaystore-analytics/blob/master/notebooks/predict_rating_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("https://raw.githubusercontent.com/lyqht/googleplaystore-analytics/master/data/reviews_joined.csv", index_col=0)
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.drop("index", axis=1,inplace=True)
df.drop_duplicates(subset=["Preprocessed_Review","App"],inplace=True, keep="first")
df['Price'] = df['Price'].str.replace('$', '')
df['Price'] = df['Price'].astype(float)
df.columns

Index(['App', 'Review', 'Preprocessed_Review', 'Category', 'Average_Rating',
       'Rating_Label', 'Num_Reviews', 'Size', 'Installs', 'Type', 'Price',
       'Content Rating', 'Genres', 'Tokens', 'Sentiment', 'Neutral Proportion',
       'Positive Proportion', 'Negative Proportion', 'Compound Score',
       'Sentiment_Rating'],
      dtype='object')

In [2]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

columns_to_normalize = ["Average_Rating", "Installs", "Price", "Size", "Sentiment_Rating"]
for col in columns_to_normalize:
  data_scaled = min_max_scaler.fit_transform(df[col].to_numpy().reshape(-1,1))
  df[col] = data_scaled

columns_to_encode = ["Category", "Genres"]
for col in columns_to_encode:
  df[col] = LabelEncoder().fit_transform(df[col])

df.head(2)

Unnamed: 0,App,Review,Preprocessed_Review,Category,Average_Rating,Rating_Label,Num_Reviews,Size,Installs,Type,Price,Content Rating,Genres,Tokens,Sentiment,Neutral Proportion,Positive Proportion,Negative Proportion,Compound Score,Sentiment_Rating
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,like delicious food cooking food case best foo...,15,0.608696,5,2490,0.038301,0.000499,Free,0.0,Everyone,29,"['i', 'like', 'eat', 'delicious', 'food', 'tha...",Positive,0.466,0.534,0.0,0.9531,0.978082
1,10 Best Foods for You,This help eating healthy exercise regular basis,help eating healthy exercise regular basis,15,0.608696,4,2490,0.038301,0.000499,Free,0.0,Everyone,29,"['this', 'help', 'eating', 'healthy', 'exercis...",Positive,0.481,0.519,0.0,0.6597,0.829908


In [3]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

columns_to_normalize = ["Average_Rating", "Installs", "Price", "Size", "Sentiment_Rating"]
for col in columns_to_normalize:
  data_scaled = min_max_scaler.fit_transform(df[col].to_numpy().reshape(-1,1))
  df[col] = data_scaled

columns_to_encode = ["Category", "Genres"]
for col in columns_to_encode:
  df[col] = LabelEncoder().fit_transform(df[col])

df.head(2)

Unnamed: 0,App,Review,Preprocessed_Review,Category,Average_Rating,Rating_Label,Num_Reviews,Size,Installs,Type,Price,Content Rating,Genres,Tokens,Sentiment,Neutral Proportion,Positive Proportion,Negative Proportion,Compound Score,Sentiment_Rating
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,like delicious food cooking food case best foo...,15,0.608696,5,2490,0.038301,0.000499,Free,0.0,Everyone,29,"['i', 'like', 'eat', 'delicious', 'food', 'tha...",Positive,0.466,0.534,0.0,0.9531,0.978082
1,10 Best Foods for You,This help eating healthy exercise regular basis,help eating healthy exercise regular basis,15,0.608696,4,2490,0.038301,0.000499,Free,0.0,Everyone,29,"['this', 'help', 'eating', 'healthy', 'exercis...",Positive,0.481,0.519,0.0,0.6597,0.829908


In [4]:
print("Number of apps that have reviews")
df["App"].nunique()

Number of apps that have reviews


531

In [5]:
num_reviews_per_app = list(df.groupby(["App","Average_Rating"]).size())
min_num_reviews = int(np.percentile(num_reviews_per_app,25))
#min_num_reviews = 0 # temporirily edited for debugging 
print(f"25th percentile of the count of reviews :", min_num_reviews)

25th percentile of the count of reviews : 26


In [7]:
num_reviews_per_app = 28
to_keep = df.App.value_counts()[df.App.value_counts() >= num_reviews_per_app].index
print("Number of Apps that have at least ",num_reviews_per_app, "reviews :", len(to_keep))

df = df[df.App.isin(to_keep)]

Number of Apps that have at least  28 reviews : 380


In [8]:
unique_apps = to_keep

print("Creating an array containing arrays of reviews of different apps")
reviews_by_app = [df[df["App"] == unique_apps[i]]["Preprocessed_Review"].to_numpy() for i in range(len(to_keep))]
reviews_by_app

print("Creating an array containing the actual average rating of different apps")
avr_rating_per_app = [df[df["App"] == unique_apps[i]]["Average_Rating"].to_numpy() for i in range(len(to_keep))]

Creating an array containing arrays of reviews of different apps
Creating an array containing the actual average rating of different apps


In [9]:
from tqdm import tqdm
from gensim.models import doc2vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument 
import re

def get_vectors(model,corpus_size,vectors_size,index):
    vectors = np.zeros((corpus_size,vectors_size))
    for i in range(0,corpus_size):
        prefix = str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors[index] 

def label_sentences(corpus):
    labeled = []
    for i, v in enumerate(corpus):
        label = str(i)
        labeled.append(doc2vec.TaggedDocument(v,[label]))
    return labeled

vector_size = 300
all_data = label_sentences(df.Tokens)
model_dbow = gensim.models.Doc2Vec(dm = 0,vector_size = vector_size, negative = 5,min_count =1,alpha = 0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]),total_examples = len(all_data),epochs = 1)
    model_dbow.alpha -=0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 15457/15457 [00:00<00:00, 2313340.12it/s]
100%|██████████| 15457/15457 [00:00<00:00, 2538921.36it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3205505.90it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3368212.64it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3397335.69it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3326390.81it/s]
100%|██████████| 15457/15457 [00:00<00:00, 1829120.78it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3348727.11it/s]
100%|██████████| 15457/15457 [00:00<00:00, 2265721.57it/s]
100%|██████████| 15457/15457 [00:00<00:00, 2170450.52it/s]
100%|██████████| 15457/15457 [00:00<00:00, 2456012.31it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3248715.02it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3262775.89it/s]
100%|██████████| 15457/15457 [00:00<00:00, 2349983.94it/s]
100%|██████████| 15457/15457 [00:00<00:00, 2097559.11it/s]
100%|██████████| 15457/15457 [00:00<00:00, 2152221.12it/s]
100%|██████████| 15457/15457 [00:00<00:00, 3399295.14it/

In [11]:
df['vectors'] = list(model_dbow.docvecs.vectors_docs)
for i in range(0, len(df)):
  df['vectors'][i] = pd.Series(model_dbow.docvecs.vectors_docs[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
len(df['vectors'])

21429

In [14]:
df['Tagged_Document'] = all_data
print("Creating an array containing arrays of vectors of different apps")
df["vectors"].dropna(inplace=True)
vectors_by_app = [df[df["App"] == unique_apps[i]]["vectors"].to_numpy() for i in range(len(to_keep))]

Creating an array containing arrays of vectors of different apps


In [0]:
class TextVectorDataset(Dataset):
  global num_reviews_per_app
  def __init__(self, x, y):
    self.samples = x
    self.labels = y
  
  def __len__(self):
    return len(self.samples)
  
  def __getitem__(self,idx):
    item = self.samples[idx]
    item = np.random.choice(item, size=num_reviews_per_app) # sampling 28 reviews from each app
    print(type(item))
    label = self.labels[idx][0]
    print(label)
    return tensor(item, dtype=torch.float), tensor(label, dtype=torch.float)

x = vectors_by_app # vectors_per_app
y = avr_rating_per_app

train_size = int(0.7*len(x))
val_size = len(x) - train_size

data = TextVectorDataset(x, y)
trainset, valset = random_split(data, [train_size, val_size])

BATCH_SIZE = 50
train_dataloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=True)

print("Training dataloader has ", len(train_dataloader), "batches of ", BATCH_SIZE)
print("Validation dataloader has ", len(val_dataloader), "batches of ", BATCH_SIZE)
for batch_idx, samples in enumerate(train_dataloader):
  # print(batch_idx)
  print(samples)

In [0]:
INPUT_SIZE = 1 # currently vector shape not consistent as indicated
EMBED_SIZE = vector_size
OUTPUT_SIZE = 1 # regression to reach average rating
HIDDEN_SIZE = 100 # arbitrary
learning_rate = 0.2

class VectorNet(nn.Module):
  def __init__(self, input_size, embed_dim, output_size):
        super(VectorNet, self).__init__()
        self.embedding = nn.EmbeddingBag(input_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, output_size)
        self.init_weights()

  def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

  def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

net = VectorNet(input_size=INPUT_SIZE, embed_dim = vector_size, output_size=OUTPUT_SIZE)
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
loss_func = nn.MSELoss()

num_epochs = 100
losses = []
for i in range(num_epochs):
  for batch_idx, samples in enumerate(train_dataloader):
    print(samples)
    x,y = samples

    prediction = net(x)
    loss = loss_func(prediction, y)
    optimizer.zero_grad()   # clear gradients for next train
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

    # for plotting
  if i % 10 == 0:
      print("Epoch ", i, ", Loss: ", loss)
  losses.append(loss)
  prediction = 0

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.plot(range(len(losses)),losses)