In [9]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
#packages
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# reading data from csv and creating a dataframe
df = pd.read_csv('PlaystoreData.csv')

# dataframe dimensions
print(f"This dataframe has {df.shape[0]} rows and {df.shape[1]} columns.")

This dataframe has 267034 rows and 12 columns.


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267034 entries, 0 to 267033
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       267034 non-null  int64  
 1   App_Name         267033 non-null  object 
 2   Category         267034 non-null  object 
 3   Rating           267034 non-null  float64
 4   Reviews          267034 non-null  int64  
 5   Installs         267034 non-null  float64
 6   Size             267034 non-null  object 
 7   Price            267034 non-null  float64
 8   Content_Rating   267034 non-null  object 
 9   Last_Updated     267034 non-null  object 
 10  Minimum_Version  267033 non-null  object 
 11  Latest_Version   267031 non-null  object 
dtypes: float64(3), int64(2), object(7)
memory usage: 24.4+ MB


In [4]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,App_Name,Category,Rating,Reviews,Installs,Size,Price,Content_Rating,Last_Updated,Minimum_Version,Latest_Version
200814,200815,Lakhmir Singh & Manjit Kaur Solutions,EDUCATION,4.24,669,50000.0,6.1M,0.0,Everyone,14-Jan-19,4.1 and up,1.6
196342,196343,???????,HEALTH_AND_FITNESS,4.94,17,500.0,7.0M,0.0,Teen,9-Mar-19,4.4 and up,0.0.1
123676,123677,Online CMR,BUSINESS,4.4,168,10000.0,4.5M,0.0,Everyone,25-Jun-18,4.0 and up,0.3.10
66878,66879,Disable Increasing Ring,TOOLS,4.25,24286,100000.0,337k,0.0,Everyone,19-Jan-17,2.3 and up,1.7.2
143896,143897,Intruder GO Keyboard Animated Theme,PERSONALIZATION,4.7,57,10000.0,16M,0.0,Everyone,11-Jan-18,4.0 and up,4.5
149842,149843,Compare AEW - Compare prices before you buy,SHOPPING,4.43,7,50.0,3.7M,0.0,Everyone,19-Jan-19,4.0.3 and up,1.0.3
168059,168060,Holy Bible,LIFESTYLE,4.72,844,50000.0,17M,0.0,Everyone 10+,27-Aug-18,4.0 and up,1.7.4
101452,101453,ValueBit - Live Bitcoin Tracker,FINANCE,5.0,4,10.0,1.9M,0.0,Everyone,15-Jul-18,4.1 and up,1
133077,133078,iNatal � App de embarazo,MEDICAL,4.3,20,5000.0,49M,0.0,Mature 17+,2-Aug-18,4.4 and up,1.0.7
74122,74123,Music Editor Pro,MUSIC_AND_AUDIO,4.57,56,100.0,21M,0.99,Everyone,26-Feb-19,4.3 and up,1.0.8


In [5]:
# missing values summary
pd.DataFrame(zip(df.columns,
                 df.isna().any(),
                 df.isna().sum()\
                     / df.shape[0]\
                     * 100),
            columns=['Column', 'Has Missing Values?', '% Missing Values'])\
    .sort_values('% Missing Values', ascending=False)

Unnamed: 0,Column,Has Missing Values?,% Missing Values
11,Latest_Version,True,0.001123
1,App_Name,True,0.000374
10,Minimum_Version,True,0.000374
0,Unnamed: 0,False,0.0
2,Category,False,0.0
3,Rating,False,0.0
4,Reviews,False,0.0
5,Installs,False,0.0
6,Size,False,0.0
7,Price,False,0.0


## Target Variable: Rating

- Predictor Variable: 

-- Category(dummies)
-- Rating
-- Reviews
-- Installs
-- Size
-- Prize(0 or 1),

### Recategorizing the Category variable

In [6]:
#CATEGORY 1
lifestyle=['BEAUTY','FOOD_AND_DRINK','DATING','HOUSE_AND_HOME','LIFESTYLE',
           'MUSIC_AND_AUDIO','PARENTING','PERSONALIZATION','PHOTOGRAPHY',
           'PRODUCTIVITY','SHOPPING','SOCIAL','SPORTS','VIDEO_PLAYERS']

#CATEGORY 2
education=['BOOKS_AND_REFERENCE','EDUCATION','LIBRARIES_AND_DEMO','ART_AND_DESIGN']

#CATEGORY 3
health=['HEALTH_AND_FITNESS','MEDICAL']

#CATEGORY 4
travel=['AUTO_AND_VEHICLES','MAPS_AND_NAVIGATION','TRAVEL','TRAVEL_AND_LOCAL']

#CATEGORY 5
communication=['COMMUNICATION']

#CATEGORY 6
news=['NEWS_AND_MAGAZINES']

#CATEGORY 7
tools=['TOOLS']

#CATEGORY 8
weather=['WEATHER']

#CATEGORY 9
business=['BUSINESS','FINANCE']

#CATEGORY 10
entertainment = ['COMICS','ENTERTAINMENT','EVENTS','GAME_ACTION','GAME_ADVENTURE',
                 'GAME_ARCADE','GAME_BOARD','GAME_CARD','GAME_CASINO','GAME_CASUAL',
                 'GAME_EDUCATIONAL','GAME_MUSIC','GAME_PUZZLE','GAME_RACING','GAME_ROLE_PLAYING',
                 'GAME_SIMULATION','GAME_SPORTS','GAME_STRATEGY','GAME_TRIVIA',
                 'GAME_WORD']


In [7]:
df.loc[df['Category'].isin(lifestyle), 'c_lifestyle'] = np.int64(1)
df['c_lifestyle'] = df['c_lifestyle'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(education), 'c_education'] = np.int64(1)
df['c_education'] = df['c_education'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(health), 'c_health'] = np.int64(1)
df['c_health'] = df['c_health'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(travel), 'c_travel'] = np.int64(1)
df['c_travel'] = df['c_travel'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(communication), 'c_communication'] = np.int64(1)
df['c_communication'] = df['c_communication'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(news), 'c_news'] = np.int64(1)
df['c_news'] = df['c_news'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(tools), 'c_tools'] = np.int64(1)
df['c_tools'] = df['c_tools'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(weather), 'c_weather'] = np.int64(1)
df['c_weather'] = df['c_weather'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(business), 'c_business'] = np.int64(1)
df['c_business'] = df['c_business'].replace(np.nan, np.int64(0))

df.loc[df['Category'].isin(entertainment), 'c_entertainment'] = np.int64(1)
df['c_entertainment'] = df['c_entertainment'].replace(np.nan, np.int64(0))

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,App_Name,Category,Rating,Reviews,Installs,Size,Price,Content_Rating,Last_Updated,...,c_lifestyle,c_education,c_health,c_travel,c_communication,c_news,c_tools,c_weather,c_business,c_entertainment
0,1,DoorDash - Food Delivery,FOOD_AND_DRINK,4.55,305034,5000000.0,Varies with device,0.0,Everyone,29-Mar-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,TripAdvisor Hotels Flights Restaurants Attract...,TRAVEL_AND_LOCAL,4.4,1207922,100000000.0,Varies with device,0.0,Everyone,29-Mar-19,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Peapod,SHOPPING,3.66,1967,100000.0,1.4M,0.0,Everyone,20-Sep-18,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,foodpanda - Local Food Delivery,FOOD_AND_DRINK,4.11,389154,10000000.0,16M,0.0,Everyone,22-Mar-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,My CookBook Pro (Ad Free),FOOD_AND_DRINK,4.65,2291,10000.0,Varies with device,5.99,Everyone,1-Apr-19,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
df.Size.value_counts()

Varies with device    11726
11M                    7312
12M                    6362
13M                    5569
14M                    5266
                      ...  
426k                      1
705k                      1
286k                      1
650k                      1
531k                      1
Name: Size, Length: 1236, dtype: int64

In [None]:
#Remove Varies with Device from Size and put in the mean value

In [None]:
#Convert the other columns to factor

In [33]:
X =df[['Reviews','Installs','Price','c_lifestyle','c_education','c_health',
       'c_travel','c_communication','c_news','c_tools','c_weather','c_business','c_entertainment']]
y = df[['Rating']]

In [34]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=1)

#run linear regression
from regressors import stats
ols = linear_model.LinearRegression()
ols.fit(X, y)

# to print summary table:
print("\n=========== SUMMARY ===========")
#xlabels = boston.feature_names[which_betas]
stats.summary(ols, X, y)





ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1 and the array at index 1 has size 13