In [92]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

#packages
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [93]:
# reading data from csv and creating a dataframe
df = pd.read_csv('Google-Playstore.csv')

# dataframe dimensions
print(f"This dataframe has {df.shape[0]} rows and {df.shape[1]} columns.")

This dataframe has 603047 rows and 23 columns.


# OLD DATASET

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267034 entries, 0 to 267033
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       267034 non-null  int64  
 1   App_Name         267033 non-null  object 
 2   Category         267034 non-null  object 
 3   Rating           267034 non-null  float64
 4   Reviews          267034 non-null  int64  
 5   Installs         267034 non-null  float64
 6   Size             267034 non-null  object 
 7   Price            267034 non-null  float64
 8   Content_Rating   267034 non-null  object 
 9   Last_Updated     267034 non-null  object 
 10  Minimum_Version  267033 non-null  object 
 11  Latest_Version   267031 non-null  object 
dtypes: float64(3), int64(2), object(7)
memory usage: 24.4+ MB


In [4]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,App_Name,Category,Rating,Reviews,Installs,Size,Price,Content_Rating,Last_Updated,Minimum_Version,Latest_Version
85702,85703,Speechkeys Smart Voice Typing,TOOLS,4.05,660,100000.0,4.9M,0.0,Everyone,28-Oct-18,4.3 and up,1.13
20848,20849,Gurmukhi Keyboard,PRODUCTIVITY,4.39,14533,1000000.0,5.4M,0.0,Everyone,18-Dec-18,4.0 and up,1.7.3
64495,64496,Cute Dog Sticker WAStickerApps ??,ENTERTAINMENT,4.5,8,5000.0,5.4M,0.0,Everyone,2-Dec-18,4.0.3 and up,1.0.0.0
56113,56114,Le Fil Business,BUSINESS,4.71,7,1000.0,19M,0.0,Everyone,19-Feb-19,5.0 and up,4.0.2
118547,118548,?????? ???? ??????? ???? ??,MUSIC_AND_AUDIO,4.82,540,50000.0,79M,0.0,Everyone,13-Dec-18,4.1 and up,3
236203,236204,Best Galaxy S9 Ringtones 2019,ENTERTAINMENT,5.0,1,100.0,19M,0.0,Everyone,25-Jan-19,4.0.3 and up,1
133195,133196,UGC Net Commerce Solved Paper 2-3 10 papers,EDUCATION,5.0,3,1000.0,3.3M,0.0,Everyone,3-Nov-18,4.2 and up,1
17154,17155,Brahma Kumaris - Om Shanti,LIFESTYLE,4.67,190,10000.0,10M,0.0,Everyone,16-Feb-19,4.0.3 and up,2.3.0
57021,57022,MapCam - Geo Camera & Collages,PHOTOGRAPHY,4.16,3203,100000.0,2.4M,0.0,Everyone,3-Jun-18,4.0.3 and up,4.5.5
6747,6748,Super-Bright LED Flashlight,PRODUCTIVITY,4.6,9123436,500000000.0,Varies with device,0.0,Everyone,27-Mar-19,Varies with device,Varies with device


In [5]:
# missing values summary
pd.DataFrame(zip(df.columns,
                 df.isna().any(),
                 df.isna().sum()\
                     / df.shape[0]\
                     * 100),
            columns=['Column', 'Has Missing Values?', '% Missing Values'])\
    .sort_values('% Missing Values', ascending=False)

Unnamed: 0,Column,Has Missing Values?,% Missing Values
11,Latest_Version,True,0.001123
1,App_Name,True,0.000374
10,Minimum_Version,True,0.000374
0,Unnamed: 0,False,0.0
2,Category,False,0.0
3,Rating,False,0.0
4,Reviews,False,0.0
5,Installs,False,0.0
6,Size,False,0.0
7,Price,False,0.0


## Target Variable: Rating

- Predictor Variable: 

-- Category(dummies)
-- Rating
-- Reviews
-- Installs
-- Size
-- Prize(0 or 1),

### Recategorizing the Category variable

In [6]:
#CATEGORY 1
lifestyle=['BEAUTY','FOOD_AND_DRINK','DATING','HOUSE_AND_HOME','LIFESTYLE',
           'MUSIC_AND_AUDIO','PARENTING','PERSONALIZATION','PHOTOGRAPHY',
           'PRODUCTIVITY','SHOPPING','SOCIAL','SPORTS','VIDEO_PLAYERS']

#CATEGORY 2
education=['BOOKS_AND_REFERENCE','EDUCATION','LIBRARIES_AND_DEMO','ART_AND_DESIGN']

#CATEGORY 3
health=['HEALTH_AND_FITNESS','MEDICAL']

#CATEGORY 4
travel=['AUTO_AND_VEHICLES','MAPS_AND_NAVIGATION','TRAVEL','TRAVEL_AND_LOCAL']

#CATEGORY 5
communication=['COMMUNICATION']

#CATEGORY 6
news=['NEWS_AND_MAGAZINES']

#CATEGORY 7
tools=['TOOLS']

#CATEGORY 8
weather=['WEATHER']

#CATEGORY 9
business=['BUSINESS','FINANCE']

#CATEGORY 10
entertainment = ['COMICS','ENTERTAINMENT','EVENTS','GAME_ACTION','GAME_ADVENTURE',
                 'GAME_ARCADE','GAME_BOARD','GAME_CARD','GAME_CASINO','GAME_CASUAL',
                 'GAME_EDUCATIONAL','GAME_MUSIC','GAME_PUZZLE','GAME_RACING','GAME_ROLE_PLAYING',
                 'GAME_SIMULATION','GAME_SPORTS','GAME_STRATEGY','GAME_TRIVIA',
                 'GAME_WORD']


In [7]:
df.loc[df['Category'].isin(lifestyle), 'c_lifestyle'] = np.int64(1)
df['c_lifestyle'] = df['c_lifestyle'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(education), 'c_education'] = np.int64(1)
df['c_education'] = df['c_education'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(health), 'c_health'] = np.int64(1)
df['c_health'] = df['c_health'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(travel), 'c_travel'] = np.int64(1)
df['c_travel'] = df['c_travel'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(communication), 'c_communication'] = np.int64(1)
df['c_communication'] = df['c_communication'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(news), 'c_news'] = np.int64(1)
df['c_news'] = df['c_news'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(tools), 'c_tools'] = np.int64(1)
df['c_tools'] = df['c_tools'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(weather), 'c_weather'] = np.int64(1)
df['c_weather'] = df['c_weather'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(business), 'c_business'] = np.int64(1)
df['c_business'] = df['c_business'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(entertainment), 'c_entertainment'] = np.int64(1)
df['c_entertainment'] = df['c_entertainment'].replace(np.nan, np.int64(0))

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,App_Name,Category,Rating,Reviews,Installs,Size,Price,Content_Rating,Last_Updated,...,c_lifestyle,c_education,c_health,c_travel,c_communication,c_news,c_tools,c_weather,c_business,c_entertainment
0,1,DoorDash - Food Delivery,FOOD_AND_DRINK,4.55,305034,5000000.0,Varies with device,0.0,Everyone,29-Mar-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,TripAdvisor Hotels Flights Restaurants Attract...,TRAVEL_AND_LOCAL,4.4,1207922,100000000.0,Varies with device,0.0,Everyone,29-Mar-19,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Peapod,SHOPPING,3.66,1967,100000.0,1.4M,0.0,Everyone,20-Sep-18,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,foodpanda - Local Food Delivery,FOOD_AND_DRINK,4.11,389154,10000000.0,16M,0.0,Everyone,22-Mar-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,My CookBook Pro (Ad Free),FOOD_AND_DRINK,4.65,2291,10000.0,Varies with device,5.99,Everyone,1-Apr-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df.Size.value_counts()

11M     7312
12M     6362
13M     5569
14M     5266
15M     5157
        ... 
124M       1
120M       1
334M       1
3.1k       1
770k       1
Name: Size, Length: 1235, dtype: int64

In [18]:
#Remove Varies with Device from Size and put in the mean value
df['Size'] = df['Size'].replace('Varies with device', 
    np.mean(pd.to_numeric(df['Size'], errors='coerce')))

#Remove M from the size variable
df['Size'] = df['Size'].str.replace('M','')
df['Size'] = df['Size'].str.replace('k','')
df['Size'] = df['Size'].str.replace(',','')

In [22]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)


In [23]:
#Convert the other columns to factor

df['Size'] = df['Size'].astype('float64')
df['c_lifestyle'] = df['c_lifestyle'].astype('category')
df['c_education'] = df['c_education'].astype('category')
df['c_health'] = df['c_health'].astype('category')
df['c_travel'] = df['c_travel'].astype('category')
df['c_communication'] = df['c_communication'].astype('category')
df['c_news'] = df['c_news'].astype('category')
df['c_tools'] = df['c_tools'].astype('category')
df['c_weather'] = df['c_weather'].astype('category')
df['c_business'] = df['c_business'].astype('category')
df['c_entertainment'] = df['c_entertainment'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267034 entries, 0 to 267033
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   Unnamed: 0       267034 non-null  int64   
 1   App_Name         267033 non-null  object  
 2   Category         267034 non-null  object  
 3   Rating           267034 non-null  float64 
 4   Reviews          267034 non-null  int64   
 5   Installs         267034 non-null  float64 
 6   Size             255308 non-null  float64 
 7   Price            267034 non-null  float64 
 8   Content_Rating   267034 non-null  object  
 9   Last_Updated     267034 non-null  object  
 10  Minimum_Version  267033 non-null  object  
 11  Latest_Version   267031 non-null  object  
 12  c_lifestyle      267034 non-null  category
 13  c_education      267034 non-null  category
 14  c_health         267034 non-null  category
 15  c_travel         267034 non-null  category
 16  c_communication  267

In [24]:
X =df[['Reviews','Installs','Price','Size','c_lifestyle','c_education','c_health',
       'c_travel','c_communication','c_news','c_tools','c_weather','c_business','c_entertainment']]
y = df[['Rating']]

# NEW DATASET

In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 603047 entries, 0 to 603046
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   App Name           603046 non-null  object 
 1   App Id             603047 non-null  object 
 2   Category           603047 non-null  object 
 3   Rating             599287 non-null  float64
 4   Rating Count       599287 non-null  float64
 5   Installs           602875 non-null  object 
 6   Minimum Installs   602875 non-null  float64
 7   Maximum Installs   603047 non-null  int64  
 8   Free               603047 non-null  bool   
 9   Price              603047 non-null  float64
 10  Currency           602866 non-null  object 
 11  Size               603047 non-null  object 
 12  Minimum Android    602556 non-null  object 
 13  Developer Id       603045 non-null  object 
 14  Developer Website  423783 non-null  object 
 15  Developer Email    603010 non-null  object 
 16  Re

In [95]:
#Drop unnecessary variables: Deveopler Website, Developer Id, Developer Email, Privacy Policy
df = df.drop(['Developer Id', 'Developer Website', 'Developer Email', 'Privacy Policy'], axis = 1)

In [96]:
# missing values summary
pd.DataFrame(zip(df.columns,
                 df.isna().any(),
                 df.isna().sum()\
                     / df.shape[0]\
                     * 100),
            columns=['Column', 'Has Missing Values?', '% Missing Values'])\
    .sort_values('% Missing Values', ascending=False)

Unnamed: 0,Column,Has Missing Values?,% Missing Values
13,Released,True,5.352485
3,Rating,True,0.6235
4,Rating Count,True,0.6235
12,Minimum Android,True,0.08142
10,Currency,True,0.030014
5,Installs,True,0.028522
6,Minimum Installs,True,0.028522
0,App Name,True,0.000166
17,In App Purchases,False,0.0
16,Ad Supported,False,0.0


In [97]:
#CATEGORY 1
lifestyle=['Beauty','Dating','Food&Drink','House&Home','Lifestyle','Music','Music&Audio',
           'Personalization','Photography','Parenting','Shopping','Social']
#CATEGORY 2
education=['Art&Design','Books&Reference','Education','Educational','Libraries&Demo']
#CATEGORY 3
health=['Health&Fitness','Medical']
#CATEGORY 4
travel=['Auto&Vehicles','Maps&Navigation','Travel&Local']
#CATEGORY 5
communication=['Communication']
#CATEGORY 6
news=['News&Magazines']
#CATEGORY 7
tools=['Tools','VideoPlayers&Editors','Word','Productivity']
#CATEGORY 8
weather=['Weather']
#CATEGORY 9
business=['Business','Finance']
#CATEGORY 10
entertainment = ['Entertainment','Action','Adventure','Arcade','Board','Comics','Card','Casino','Casual','Events',
                 'Puzzle','Racing','Role Playing','Simulation','Sports','Strategy','Trivia']

In [98]:
df.loc[df['Category'].isin(lifestyle), 'c_lifestyle'] = np.int64(1)
df['c_lifestyle'] = df['c_lifestyle'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(education), 'c_education'] = np.int64(1)
df['c_education'] = df['c_education'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(health), 'c_health'] = np.int64(1)
df['c_health'] = df['c_health'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(travel), 'c_travel'] = np.int64(1)
df['c_travel'] = df['c_travel'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(communication), 'c_communication'] = np.int64(1)
df['c_communication'] = df['c_communication'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(news), 'c_news'] = np.int64(1)
df['c_news'] = df['c_news'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(tools), 'c_tools'] = np.int64(1)
df['c_tools'] = df['c_tools'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(weather), 'c_weather'] = np.int64(1)
df['c_weather'] = df['c_weather'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(business), 'c_business'] = np.int64(1)
df['c_business'] = df['c_business'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(entertainment), 'c_entertainment'] = np.int64(1)
df['c_entertainment'] = df['c_entertainment'].replace(np.nan, np.int64(0))

In [99]:
df.head()

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,c_lifestyle,c_education,c_health,c_travel,c_communication,c_news,c_tools,c_weather,c_business,c_entertainment
0,foodpanda - Local Food & Grocery Delivery,com.global.foodpanda.android,Food & Drink,4.4,1258493.0,"10,000,000+",10000000.0,48828081,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Tripadvisor Hotel, Flight & Restaurant Bookings",com.tripadvisor.tripadvisor,Travel & Local,4.4,1360172.0,"100,000,000+",100000000.0,371284181,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Safeway Delivery & Pick Up,com.safeway.shop,Shopping,4.2,5532.0,"500,000+",500000.0,816102,True,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,My CookBook Ad-Free,fr.cookbookpro,Food & Drink,4.7,2531.0,"10,000+",10000.0,16171,False,5.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Talabat: Food & Grocery Delivery,com.talabat,Food & Drink,4.4,197649.0,"5,000,000+",5000000.0,9218590,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
print(df.c_lifestyle.value_counts())
print(df.c_health.value_counts())
print(df.c_travel.value_counts())
print(df.c_communication.value_counts())
print(df.c_news.value_counts())
print(df.c_tools.value_counts())
print(df.c_weather.value_counts())
print(df.c_business.value_counts())
print(df.c_entertainment.value_counts())

0.0    500232
1.0    102815
Name: c_lifestyle, dtype: int64
0.0    594682
1.0      8365
Name: c_health, dtype: int64
0.0    603047
Name: c_travel, dtype: int64
0.0    591278
1.0     11769
Name: c_communication, dtype: int64
0.0    603047
Name: c_news, dtype: int64
0.0    541275
1.0     61772
Name: c_tools, dtype: int64
0.0    600492
1.0      2555
Name: c_weather, dtype: int64
0.0    556005
1.0     47042
Name: c_business, dtype: int64
0.0    492704
1.0    110343
Name: c_entertainment, dtype: int64


In [101]:
df.c_education.value_counts()

0.0    520983
1.0     82064
Name: c_education, dtype: int64

## Size variable

In [102]:
#Remove Varies with Device from Size
df = df[df['Size'] != 'Varies with device']

In [103]:
kb_index = df.Size[df.Size.str.contains('k')].index
mb_index = df.Size[(df.Size.str.contains('M'))].index

In [104]:
#Remove M,k,, from the size variable
#df['Size'] = df['Size'].str.replace('.','')
df['Size'] = df['Size'].str.replace(',','')
df.Size = df.Size.str.replace('M','')
df.Size = df.Size.str.replace('k','')

In [105]:
df.Size = pd.to_numeric(df.Size,errors='coerce')
df.Size.loc[mb_index] = df.Size.loc[mb_index] * 1000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [106]:
df.Size.value_counts()

11000.0     16960
12000.0     14280
13000.0     12196
14000.0     11249
16000.0     10784
            ...  
198000.0        1
347000.0        1
318000.0        1
225000.0        1
233000.0        1
Name: Size, Length: 1386, dtype: int64

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 576945 entries, 0 to 603046
Data columns (total 30 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   App Name          576944 non-null  object        
 1   App Id            576945 non-null  object        
 2   Category          576945 non-null  object        
 3   Rating            576781 non-null  float64       
 4   Rating Count      576781 non-null  float64       
 5   Installs          576945 non-null  float64       
 6   Minimum Installs  576945 non-null  float64       
 7   Maximum Installs  576945 non-null  int64         
 8   Free              576945 non-null  bool          
 9   Price             576945 non-null  float64       
 10  Currency          576945 non-null  object        
 11  Size              576945 non-null  float64       
 12  Minimum Android   576471 non-null  object        
 13  Released          549436 non-null  datetime64[ns]
 14  Last

In [118]:
df.head()

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,c_education,c_health,c_travel,c_communication,c_news,c_tools,c_weather,c_business,c_entertainment,InstallCategory
0,foodpanda - Local Food & Grocery Delivery,com.global.foodpanda.android,Food & Drink,4.4,1258493.0,10000000.0,10000000.0,48828081,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Very High
2,Safeway Delivery & Pick Up,com.safeway.shop,Shopping,4.2,5532.0,500000.0,500000.0,816102,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,High
5,DoorDash - Food Delivery,com.dd.doordash,Food & Drink,4.5,784802.0,10000000.0,10000000.0,24426156,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Very High
6,Peapod Delivery,com.peapod.app.mobile.droid,Shopping,2.8,2487.0,500000.0,500000.0,571969,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,High
8,FreshDirect,com.freshdirect.android,Shopping,2.1,893.0,100000.0,100000.0,184884,True,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,High


## Installs variable

In [109]:
df['Installs'] = df['Installs'].str.replace('+','')
df['Installs'] = df['Installs'].str.replace(',','')
df.Installs = pd.to_numeric(df.Installs,errors='coerce')

In [117]:
def cat(x):
    if x <1000:
        return "Very Low"
    elif (x>=1000 and x<10000):
        return "Low"
    elif (x>=10000 and x<100000):
        return "Medium"
    elif (x>=100000 and x<1000000):
        return "High"
    else:
        return "Very High"

for col in df.columns:
    df['InstallCategory'] = df['Installs'].apply(lambda x: cat(x))

In [120]:
df['InstallCategory'].value_counts()

Very Low     188763
Low          152087
Medium       140669
High          69467
Very High     26120
Name: InstallCategory, dtype: int64

## Currency Variable

In [128]:
df.Currency.value_counts()

USD    576945
Name: Currency, dtype: int64

In [123]:
# remove apps other than US
df = df[df['Currency'] == 'USD']

In [127]:
df.Released=pd.to_datetime(df.Released)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
