# Predict app rating with linear regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv('./data/Google-Playstore-32K.csv')
data.head()

Unnamed: 0,App Name,Category,Rating,Reviews,Installs,Size,Price,Content Rating,Last Updated,Minimum Version,Latest Version
0,DoorDash - Food Delivery,FOOD_AND_DRINK,4.548561573,305034.0,"5,000,000+",Varies with device,0,Everyone,"March 29, 2019",Varies with device,Varies with device
1,TripAdvisor Hotels Flights Restaurants Attract...,TRAVEL_AND_LOCAL,4.400671482,1207922.0,"100,000,000+",Varies with device,0,Everyone,"March 29, 2019",Varies with device,Varies with device
2,Peapod,SHOPPING,3.656329393,1967.0,"100,000+",1.4M,0,Everyone,"September 20, 2018",5.0 and up,2.2.0
3,foodpanda - Local Food Delivery,FOOD_AND_DRINK,4.107232571,389154.0,"10,000,000+",16M,0,Everyone,"March 22, 2019",4.2 and up,4.18.2
4,My CookBook Pro (Ad Free),FOOD_AND_DRINK,4.647752285,2291.0,"10,000+",Varies with device,$5.99,Everyone,"April 1, 2019",Varies with device,Varies with device


In [3]:
data.shape

(32000, 11)

## Data Cleaning

### Convert types

In [4]:
data = data[data.Size.str.contains('\d')]
data.Size[data.Size.str.contains('k')] = "0."+data.Size[data.Size.str.contains('k')].str.replace('.','')
data.Size = data.Size.str.replace('k','')
data.Size = data.Size.str.replace('M','')
data.Size = data.Size.str.replace(',','')
data.Size = data.Size.str.replace('+','')
data.Size = data.Size.astype(float)

data = data[data.Installs.str.contains('\+')]
data.Installs = data.Installs.str.replace('+','')
data.Installs = data.Installs.str.replace(',','')
data.Installs.astype(int)

data.Price = data.Price.str.contains('1|2|3|4|5|7|8|9').replace(False, 0)

data = data[data.applymap(np.isreal).Reviews]
data.Reviews = data.Reviews.astype(float)

data = data[data.Rating.str.contains('\d') == True]
data.Rating = data.Rating.astype(float)

  data.Size[data.Size.str.contains('k')] = "0."+data.Size[data.Size.str.contains('k')].str.replace('.','')
  data.Size = data.Size.str.replace('+','')
  data.Installs = data.Installs.str.replace('+','')


### Rename columns

In [5]:
data.rename(lambda x: x.lower().strip().replace(' ', '_'), 
            axis='columns', inplace=True)

In [6]:
data.columns

Index(['app_name', 'category', 'rating', 'reviews', 'installs', 'size',
       'price', 'content_rating', 'last_updated', 'minimum_version',
       'latest_version'],
      dtype='object')

## Linear Regression

In [7]:
data_dum = pd.get_dummies(data, columns=['category','content_rating','price'])

In [8]:
y = data_dum.rating
X = data_dum.drop(columns=['rating', 'app_name', 'last_updated', 'minimum_version', 'latest_version'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (21601, 60) (21601,)
Test (7201, 60) (7201,)


In [9]:
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression()

In [10]:
data['rating_pred'] = reg.predict(X)

### Accuracy

In [11]:
reg.score(X, y)*100

5.877411819346634