## Importing Libraries

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Data Cleaning

In [8]:
data = pd.read_csv('googleplaystore.csv')

In [9]:
data.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19M,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,29M,"1,000,000+",Free,0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33M,"1,000,000+",Free,0,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3.1M,"10,000+",Free,0,Everyone,Art & Design;Creativity,"July 3, 2018",2.8,4.0.3 and up


In [10]:
# Removing irrelevant features
data = data.drop(['App','Last Updated','Current Ver','Android Ver'],axis='columns')

In [11]:
data.head(10)

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres
0,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design
1,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play
2,ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design
3,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design
4,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity
5,ART_AND_DESIGN,4.4,167,5.6M,"50,000+",Free,0,Everyone,Art & Design
6,ART_AND_DESIGN,3.8,178,19M,"50,000+",Free,0,Everyone,Art & Design
7,ART_AND_DESIGN,4.1,36815,29M,"1,000,000+",Free,0,Everyone,Art & Design
8,ART_AND_DESIGN,4.4,13791,33M,"1,000,000+",Free,0,Everyone,Art & Design
9,ART_AND_DESIGN,4.7,121,3.1M,"10,000+",Free,0,Everyone,Art & Design;Creativity


In [12]:
# checking for null values
data.isna().sum()

Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
dtype: int64

In [7]:
# drop the entire record if null value is present in 'any' of the feature
data.dropna(how='any',inplace=True)

In [14]:
data.shape

(10841, 9)

In [15]:
data.isna().sum()

Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
dtype: int64

In [10]:
data.dtypes

Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
dtype: object

In [11]:
# changing the datatype of Review column from integer from object
data = data.astype({'Reviews':'int'})

In [16]:
data.Size.value_counts().head()

Size
Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
Name: count, dtype: int64

In [13]:
data.Size.value_counts().tail()

600k    1
585k    1
226k    1
444k    1
437k    1
Name: Size, dtype: int64

In [18]:
# Replacing 'Varies with device' value with Nan values
data['Size'].replace('Varies with device', np.nan, inplace = True ) 

In [20]:
# Removing the suffixes (k and M) and representing all the data as bytes 
# (i.e)for k, value is multiplied by 100 and for M, the value is multiplied by 1000000 
data.Size = (data.Size.replace(r'[kM]+$', '', regex=True).astype(float) * \
             data.Size.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1)
            .replace(['k','M'], [10**3, 10**6]).astype(int))

ValueError: could not convert string to float: '1,000+'

In [16]:
# filling "Varies with device" with mean of size in each category
data['Size'].fillna(data.groupby('Category')['Size'].transform('mean'),inplace = True)

In [17]:
# Removing comma(,) and plus(+) signs
data.Installs = data.Installs.apply(lambda x: x.replace(',',''))
data.Installs = data.Installs.apply(lambda x: x.replace('+',''))

In [18]:
# changing the datatype from object to integer
data = data.astype({'Installs':'int'})

In [19]:
data.Price.value_counts()

0          8719
$2.99       114
$0.99       107
$4.99        70
$1.99        59
$3.99        58
$1.49        31
$2.49        21
$5.99        18
$9.99        16
$6.99        13
$399.99      11
$14.99       10
$4.49         9
$7.99         7
$3.49         7
$29.99        6
$19.99        5
$11.99        5
$24.99        5
$12.99        5
$8.99         4
$16.99        3
$5.49         3
$10.00        3
$17.99        2
$33.99        2
$1.70         2
$9.00         2
$79.99        2
           ... 
$1.50         1
$3.88         1
$2.56         1
$18.99        1
$4.60         1
$19.40        1
$1.20         1
$4.84         1
$4.77         1
$379.99       1
$6.49         1
$1.97         1
$1.29         1
$39.99        1
$13.99        1
$3.90         1
$2.95         1
$15.99        1
$1.75         1
$37.99        1
$1.59         1
$299.99       1
$3.04         1
$3.28         1
$1.61         1
$2.00         1
$14.00        1
$2.50         1
$2.59         1
$1.76         1
Name: Price, Length: 73,

In [20]:
# Removing dollar($) sign and changing the type to float
data.Price = data.Price.apply(lambda x: x.replace('$',''))
data['Price'] = data['Price'].apply(lambda x: float(x))

In [21]:
data.Genres.value_counts().tail()

Puzzle;Education              1
Strategy;Education            1
Entertainment;Education       1
Health & Fitness;Education    1
Board;Pretend Play            1
Name: Genres, dtype: int64

Many genre contain only few record, it may make a bias.
Then, I decide to group it to bigger genre by ignore sub-genre (after " ; " sign)

In [22]:
data['Genres'] = data.Genres.str.split(';').str[0]

In [23]:
data.Genres.value_counts()

Tools                      734
Entertainment              577
Education                  563
Action                     375
Productivity               351
Medical                    350
Sports                     337
Communication              329
Finance                    323
Photography                317
Lifestyle                  315
Personalization            314
Business                   303
Health & Fitness           299
Casual                     262
Social                     259
Shopping                   238
News & Magazines           233
Travel & Local             226
Arcade                     223
Simulation                 212
Dating                     195
Books & Reference          180
Video Players & Editors    163
Puzzle                     148
Maps & Navigation          124
Role Playing               119
Racing                     114
Food & Drink               109
Strategy                   107
Educational                103
Adventure                   89
House & 

In [24]:
# Group Music & Audio as Music
data['Genres'].replace('Music & Audio', 'Music',inplace = True)

In [25]:
data['Content Rating'].value_counts()

Everyone           7420
Teen               1084
Mature 17+          461
Everyone 10+        397
Adults only 18+       3
Unrated               1
Name: Content Rating, dtype: int64

In [26]:
# Removing the entire row from the data where content rating is unrated as there is only one row
data = data[data['Content Rating'] != 'Unrated']

In [27]:
data.dtypes

Category           object
Rating            float64
Reviews             int64
Size              float64
Installs            int64
Type               object
Price             float64
Content Rating     object
Genres             object
dtype: object

## Data Preprocessing

In [28]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
column_trans = make_column_transformer(
                (OneHotEncoder(),['Category','Installs','Type','Content Rating','Genres']),
                (StandardScaler(),['Reviews','Size','Price']),
                remainder = 'passthrough')

## Train Test Split

In [29]:
# Choosing X and y value
X = data.drop('Rating',axis='columns')
y = data.Rating

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [31]:
column_trans.fit_transform(X_train)

<6274x109 sparse matrix of type '<class 'numpy.float64'>'
	with 50192 stored elements in Compressed Sparse Row format>

## Regression Models

### 1. Linear Regression

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
linreg = LinearRegression()
pipe = make_pipeline(column_trans,linreg)

In [33]:
from sklearn.model_selection import cross_validate
linreg_score = cross_validate(pipe, X_train, y_train, cv=10, scoring=['neg_mean_squared_error','neg_mean_absolute_error'],return_train_score=False)
print('Mean Absolute Error: {}'.format(linreg_score['test_neg_mean_absolute_error'].mean()))
print('Mean Squared Error: {}'.format(linreg_score['test_neg_mean_squared_error'].mean()))
print('Root Mean Squared Error: {}'.format(np.sqrt(-linreg_score['test_neg_mean_squared_error'].mean())))

Mean Absolute Error: -0.34055057338797057
Mean Squared Error: -0.23851495003655176
Root Mean Squared Error: 0.4883799238672202


### 2. Support Vector Regressor (SVR)

In [34]:
from sklearn.svm import SVR
svr = SVR()
pipe = make_pipeline(column_trans,svr)
svr_score = cross_validate(pipe, X_train, y_train, cv=10, scoring=['neg_mean_squared_error','neg_mean_absolute_error'],return_train_score=False)
print('Mean Absolute Error: {}'.format(svr_score['test_neg_mean_absolute_error'].mean()))
print('Mean Squared Error: {}'.format(svr_score['test_neg_mean_squared_error'].mean()))
print('Root Mean Squared Error: {}'.format(np.sqrt(-svr_score['test_neg_mean_squared_error'].mean())))

Mean Absolute Error: -0.32677041531139217
Mean Squared Error: -0.24786811249759094
Root Mean Squared Error: 0.4978635480707449


### 3. Random Forest Regressor

In [35]:
from sklearn.ensemble import RandomForestRegressor
forest_model = RandomForestRegressor(n_estimators=100, max_features=3, min_samples_leaf=10)
pipe = make_pipeline(column_trans,forest_model)
rfr_score = cross_validate(pipe, X_train, y_train, cv=10, scoring=['neg_mean_squared_error','neg_mean_absolute_error'],return_train_score=False)
print('Mean Absolute Error: {}'.format(rfr_score['test_neg_mean_absolute_error'].mean()))
print('Mean Squared Error: {}'.format(rfr_score['test_neg_mean_squared_error'].mean()))
print('Root Mean Squared Error: {}'.format(np.sqrt(-rfr_score['test_neg_mean_squared_error'].mean())))

Mean Absolute Error: -0.3396496574366965
Mean Squared Error: -0.2378915842353062
Root Mean Squared Error: 0.4877413087234935


## Testing on Test Set

### 1. Linear Regression

In [36]:
pipe = make_pipeline(column_trans,linreg)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [37]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
print('Mean Absolute Error: {}'.format(mean_absolute_error(y_pred,y_test)))
print('Mean Squared Error: {}'.format(mean_squared_error(y_pred,y_test)))
print('Root Mean Squared Error: {}'.format(np.sqrt(mean_absolute_error(y_pred,y_test))))

Mean Absolute Error: 0.3489443281752163
Mean Squared Error: 0.25837127713467606
Root Mean Squared Error: 0.5907150989903815


### 2. Support Vector Regressor

In [38]:
pipe = make_pipeline(column_trans,svr)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [39]:
print('Mean Absolute Error: {}'.format(mean_absolute_error(y_pred,y_test)))
print('Mean Squared Error: {}'.format(mean_squared_error(y_pred,y_test)))
print('Root Mean Squared Error: {}'.format(np.sqrt(mean_absolute_error(y_pred,y_test))))

Mean Absolute Error: 0.33534403036396315
Mean Squared Error: 0.2711355594863636
Root Mean Squared Error: 0.579088965845459


### 3. Random Forest Regressor

In [40]:
pipe = make_pipeline(column_trans,forest_model)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [41]:
print('Mean Absolute Error: {}'.format(mean_absolute_error(y_pred,y_test)))
print('Mean Squared Error: {}'.format(mean_squared_error(y_pred,y_test)))
print('Root Mean Squared Error: {}'.format(np.sqrt(mean_absolute_error(y_pred,y_test))))

Mean Absolute Error: 0.3472564853666874
Mean Squared Error: 0.2566080142048638
Root Mean Squared Error: 0.589284723513759
