In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

#packages
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# reading data from csv and creating a dataframe
df = pd.read_csv('PlaystoreData.csv')

# dataframe dimensions
print(f"This dataframe has {df.shape[0]} rows and {df.shape[1]} columns.")

This dataframe has 267034 rows and 12 columns.


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267034 entries, 0 to 267033
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       267034 non-null  int64  
 1   App_Name         267033 non-null  object 
 2   Category         267034 non-null  object 
 3   Rating           267034 non-null  float64
 4   Reviews          267034 non-null  int64  
 5   Installs         267034 non-null  float64
 6   Size             267034 non-null  object 
 7   Price            267034 non-null  float64
 8   Content_Rating   267034 non-null  object 
 9   Last_Updated     267034 non-null  object 
 10  Minimum_Version  267033 non-null  object 
 11  Latest_Version   267031 non-null  object 
dtypes: float64(3), int64(2), object(7)
memory usage: 24.4+ MB


In [4]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,App_Name,Category,Rating,Reviews,Installs,Size,Price,Content_Rating,Last_Updated,Minimum_Version,Latest_Version
85702,85703,Speechkeys Smart Voice Typing,TOOLS,4.05,660,100000.0,4.9M,0.0,Everyone,28-Oct-18,4.3 and up,1.13
20848,20849,Gurmukhi Keyboard,PRODUCTIVITY,4.39,14533,1000000.0,5.4M,0.0,Everyone,18-Dec-18,4.0 and up,1.7.3
64495,64496,Cute Dog Sticker WAStickerApps ??,ENTERTAINMENT,4.5,8,5000.0,5.4M,0.0,Everyone,2-Dec-18,4.0.3 and up,1.0.0.0
56113,56114,Le Fil Business,BUSINESS,4.71,7,1000.0,19M,0.0,Everyone,19-Feb-19,5.0 and up,4.0.2
118547,118548,?????? ???? ??????? ???? ??,MUSIC_AND_AUDIO,4.82,540,50000.0,79M,0.0,Everyone,13-Dec-18,4.1 and up,3
236203,236204,Best Galaxy S9 Ringtones 2019,ENTERTAINMENT,5.0,1,100.0,19M,0.0,Everyone,25-Jan-19,4.0.3 and up,1
133195,133196,UGC Net Commerce Solved Paper 2-3 10 papers,EDUCATION,5.0,3,1000.0,3.3M,0.0,Everyone,3-Nov-18,4.2 and up,1
17154,17155,Brahma Kumaris - Om Shanti,LIFESTYLE,4.67,190,10000.0,10M,0.0,Everyone,16-Feb-19,4.0.3 and up,2.3.0
57021,57022,MapCam - Geo Camera & Collages,PHOTOGRAPHY,4.16,3203,100000.0,2.4M,0.0,Everyone,3-Jun-18,4.0.3 and up,4.5.5
6747,6748,Super-Bright LED Flashlight,PRODUCTIVITY,4.6,9123436,500000000.0,Varies with device,0.0,Everyone,27-Mar-19,Varies with device,Varies with device


In [5]:
# missing values summary
pd.DataFrame(zip(df.columns,
                 df.isna().any(),
                 df.isna().sum()\
                     / df.shape[0]\
                     * 100),
            columns=['Column', 'Has Missing Values?', '% Missing Values'])\
    .sort_values('% Missing Values', ascending=False)

Unnamed: 0,Column,Has Missing Values?,% Missing Values
11,Latest_Version,True,0.001123
1,App_Name,True,0.000374
10,Minimum_Version,True,0.000374
0,Unnamed: 0,False,0.0
2,Category,False,0.0
3,Rating,False,0.0
4,Reviews,False,0.0
5,Installs,False,0.0
6,Size,False,0.0
7,Price,False,0.0


## Target Variable: Rating

- Predictor Variable: 

-- Category(dummies)
-- Rating
-- Reviews
-- Installs
-- Size
-- Prize(0 or 1),

### Recategorizing the Category variable

In [6]:
#CATEGORY 1
lifestyle=['BEAUTY','FOOD_AND_DRINK','DATING','HOUSE_AND_HOME','LIFESTYLE',
           'MUSIC_AND_AUDIO','PARENTING','PERSONALIZATION','PHOTOGRAPHY',
           'PRODUCTIVITY','SHOPPING','SOCIAL','SPORTS','VIDEO_PLAYERS']

#CATEGORY 2
education=['BOOKS_AND_REFERENCE','EDUCATION','LIBRARIES_AND_DEMO','ART_AND_DESIGN']

#CATEGORY 3
health=['HEALTH_AND_FITNESS','MEDICAL']

#CATEGORY 4
travel=['AUTO_AND_VEHICLES','MAPS_AND_NAVIGATION','TRAVEL','TRAVEL_AND_LOCAL']

#CATEGORY 5
communication=['COMMUNICATION']

#CATEGORY 6
news=['NEWS_AND_MAGAZINES']

#CATEGORY 7
tools=['TOOLS']

#CATEGORY 8
weather=['WEATHER']

#CATEGORY 9
business=['BUSINESS','FINANCE']

#CATEGORY 10
entertainment = ['COMICS','ENTERTAINMENT','EVENTS','GAME_ACTION','GAME_ADVENTURE',
                 'GAME_ARCADE','GAME_BOARD','GAME_CARD','GAME_CASINO','GAME_CASUAL',
                 'GAME_EDUCATIONAL','GAME_MUSIC','GAME_PUZZLE','GAME_RACING','GAME_ROLE_PLAYING',
                 'GAME_SIMULATION','GAME_SPORTS','GAME_STRATEGY','GAME_TRIVIA',
                 'GAME_WORD']


In [7]:
df.loc[df['Category'].isin(lifestyle), 'c_lifestyle'] = np.int64(1)
df['c_lifestyle'] = df['c_lifestyle'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(education), 'c_education'] = np.int64(1)
df['c_education'] = df['c_education'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(health), 'c_health'] = np.int64(1)
df['c_health'] = df['c_health'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(travel), 'c_travel'] = np.int64(1)
df['c_travel'] = df['c_travel'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(communication), 'c_communication'] = np.int64(1)
df['c_communication'] = df['c_communication'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(news), 'c_news'] = np.int64(1)
df['c_news'] = df['c_news'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(tools), 'c_tools'] = np.int64(1)
df['c_tools'] = df['c_tools'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(weather), 'c_weather'] = np.int64(1)
df['c_weather'] = df['c_weather'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(business), 'c_business'] = np.int64(1)
df['c_business'] = df['c_business'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(entertainment), 'c_entertainment'] = np.int64(1)
df['c_entertainment'] = df['c_entertainment'].replace(np.nan, np.int64(0))

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,App_Name,Category,Rating,Reviews,Installs,Size,Price,Content_Rating,Last_Updated,...,c_lifestyle,c_education,c_health,c_travel,c_communication,c_news,c_tools,c_weather,c_business,c_entertainment
0,1,DoorDash - Food Delivery,FOOD_AND_DRINK,4.55,305034,5000000.0,Varies with device,0.0,Everyone,29-Mar-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,TripAdvisor Hotels Flights Restaurants Attract...,TRAVEL_AND_LOCAL,4.4,1207922,100000000.0,Varies with device,0.0,Everyone,29-Mar-19,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Peapod,SHOPPING,3.66,1967,100000.0,1.4M,0.0,Everyone,20-Sep-18,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,foodpanda - Local Food Delivery,FOOD_AND_DRINK,4.11,389154,10000000.0,16M,0.0,Everyone,22-Mar-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,My CookBook Pro (Ad Free),FOOD_AND_DRINK,4.65,2291,10000.0,Varies with device,5.99,Everyone,1-Apr-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df.Size.value_counts()

11M     7312
12M     6362
13M     5569
14M     5266
15M     5157
        ... 
124M       1
120M       1
334M       1
3.1k       1
770k       1
Name: Size, Length: 1235, dtype: int64

In [18]:
#Remove Varies with Device from Size and put in the mean value
df['Size'] = df['Size'].replace('Varies with device', 
    np.mean(pd.to_numeric(df['Size'], errors='coerce')))

#Remove M from the size variable
df['Size'] = df['Size'].str.replace('M','')
df['Size'] = df['Size'].str.replace('k','')
df['Size'] = df['Size'].str.replace(',','')

In [22]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)


In [23]:
#Convert the other columns to factor

df['Size'] = df['Size'].astype('float64')
df['c_lifestyle'] = df['c_lifestyle'].astype('category')
df['c_education'] = df['c_education'].astype('category')
df['c_health'] = df['c_health'].astype('category')
df['c_travel'] = df['c_travel'].astype('category')
df['c_communication'] = df['c_communication'].astype('category')
df['c_news'] = df['c_news'].astype('category')
df['c_tools'] = df['c_tools'].astype('category')
df['c_weather'] = df['c_weather'].astype('category')
df['c_business'] = df['c_business'].astype('category')
df['c_entertainment'] = df['c_entertainment'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267034 entries, 0 to 267033
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   Unnamed: 0       267034 non-null  int64   
 1   App_Name         267033 non-null  object  
 2   Category         267034 non-null  object  
 3   Rating           267034 non-null  float64 
 4   Reviews          267034 non-null  int64   
 5   Installs         267034 non-null  float64 
 6   Size             255308 non-null  float64 
 7   Price            267034 non-null  float64 
 8   Content_Rating   267034 non-null  object  
 9   Last_Updated     267034 non-null  object  
 10  Minimum_Version  267033 non-null  object  
 11  Latest_Version   267031 non-null  object  
 12  c_lifestyle      267034 non-null  category
 13  c_education      267034 non-null  category
 14  c_health         267034 non-null  category
 15  c_travel         267034 non-null  category
 16  c_communication  267

In [24]:
X =df[['Reviews','Installs','Price','Size','c_lifestyle','c_education','c_health',
       'c_travel','c_communication','c_news','c_tools','c_weather','c_business','c_entertainment']]
y = df[['Rating']]

In [25]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=1)

#run linear regression
from regressors import stats
ols = linear_model.LinearRegression()
ols.fit(X, y)

# to print summary table:
print("\n=========== SUMMARY ===========")
#xlabels = boston.feature_names[which_betas]
stats.summary(ols, X, y)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').