# Google Play Store Apps
In this notebook, we embark on a thorough analysis of the Android app market, delving into the vast ecosystem of apps available on the Google Play store. 

**Data Source:** <br>
Google Play Store Apps by Lavanya Gupta (2018). Web-scraped data of 10k Play Store apps for analysing the Android market. Original files are listed [here](https://www.kaggle.com/lava18/google-play-store-apps).


# Import Statements

In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Render floats with a thousands separator and two decimals, e.g. 2.15
pd.set_option('display.float_format', '{:,.2f}'.format)

# Read the Dataset

In [3]:
# Load the Play Store dataset; 'apps.csv' is resolved relative to the working directory
data = pd.read_csv('apps.csv')

# Data Cleaning and Preparation

### Getting an overview of the data

In [4]:
# (rows, columns) of the raw dataset
data.shape

(10841, 12)

In [5]:
# Column labels of the raw dataset
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MBs', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres', 'Last_Updated', 'Android_Ver'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes, used to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size_MBs        10841 non-null  float64
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content_Rating  10841 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last_Updated    10841 non-null  object 
 11  Android_Ver     10839 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 1016.5+ KB


In [7]:
# Peek at two random rows; the fixed seed makes the preview reproducible
# across "Restart Kernel -> Run All" executions.
data.sample(2, random_state=42)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Android_Ver
8890,DB Navigator,MAPS_AND_NAVIGATION,4.0,119685,20.0,10000000,Free,0,Everyone,Maps & Navigation,"July 2, 2018",4.0.3 and up
10506,Microsoft OneDrive,PRODUCTIVITY,4.4,1038306,4.0,100000000,Free,0,Everyone,Productivity,"August 1, 2018",Varies with device


### Drop Unused Columns

In [8]:
# Drop the columns this analysis never uses. Rebinding `data` (instead of
# inplace=True) together with errors='ignore' keeps the cell safe to re-run:
# a second execution no longer raises KeyError for columns already removed.
data = data.drop(columns=['Last_Updated', 'Android_Ver'], errors='ignore')
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,Ak Parti Yardım Toplama,SOCIAL,,0,8.7,0,Paid,$13.99,Teen,Social
1,Ain Arabic Kids Alif Ba ta,FAMILY,,0,33.0,0,Paid,$2.99,Everyone,Education
2,Popsicle Launcher for Android P 9.0 launcher,PERSONALIZATION,,0,5.5,0,Paid,$1.49,Everyone,Personalization
3,Command & Conquer: Rivals,FAMILY,,0,19.0,0,,0,Everyone 10+,Strategy
4,CX Network,BUSINESS,,0,10.0,0,Free,0,Everyone,Business


### Removing NaN values in Ratings

In [9]:
# Rows whose Rating is missing, kept aside for inspection
N_value = data.loc[data['Rating'].isna()]
N_value.head()

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,Ak Parti Yardım Toplama,SOCIAL,,0,8.7,0,Paid,$13.99,Teen,Social
1,Ain Arabic Kids Alif Ba ta,FAMILY,,0,33.0,0,Paid,$2.99,Everyone,Education
2,Popsicle Launcher for Android P 9.0 launcher,PERSONALIZATION,,0,5.5,0,Paid,$1.49,Everyone,Personalization
3,Command & Conquer: Rivals,FAMILY,,0,19.0,0,,0,Everyone 10+,Strategy
4,CX Network,BUSINESS,,0,10.0,0,Free,0,Everyone,Business


In [10]:
# Keep only the rows that have no missing value in any column,
# then report the resulting (rows, columns)
clean_data = data.dropna(axis='index', how='any')
clean_data.shape

(9367, 10)

### Remove Duplicates



In [11]:
# Rows that are exact copies of an earlier row (the first occurrence is not flagged)
duplicate_items = clean_data.loc[clean_data.duplicated(keep='first')]
print(duplicate_items.shape)
duplicate_items.head(5)

(476, 10)


Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
946,420 BZ Budeze Delivery,MEDICAL,5.0,2,11.0,100,Free,0,Mature 17+,Medical
1133,MouseMingle,DATING,2.7,3,3.9,100,Free,0,Mature 17+,Dating
1196,"Cardiac diagnosis (heart rate, arrhythmia)",MEDICAL,4.4,8,6.5,100,Paid,$12.99,Everyone,Medical
1231,Sway Medical,MEDICAL,5.0,3,22.0,100,Free,0,Everyone,Medical
1247,Chat Kids - Chat Room For Kids,DATING,4.7,6,4.9,100,Free,0,Mature 17+,Dating


In [12]:
# Remove exact duplicate rows, then collapse entries that repeat the same
# app name, type and price (the first occurrence is kept each time)
clean_data = (
    clean_data
    .drop_duplicates(keep='first')
    .drop_duplicates(subset=['App', 'Type', 'Price'], keep='first')
)
clean_data.shape

(8199, 10)

### Changing price and install column data to numeric 


In [13]:
# Convert the install counts to numbers after stripping thousands separators.
# astype(str) keeps the cell re-runnable once the column is already numeric.
installs_numeric = pd.to_numeric(
    clean_data['Installs'].astype(str).str.replace(',', '', regex=False)
)

# Convert prices to numbers after stripping the currency symbol.
# regex=False is spelled out on purpose: '$' is the regex end-of-string anchor
# and pandas releases before 2.0 defaulted to regex=True, so relying on the
# default makes the result depend on the installed pandas version.
price_numeric = pd.to_numeric(
    clean_data['Price'].astype(str).str.replace('$', '', regex=False)
)

# assign() builds a new frame instead of writing columns into a frame that was
# itself derived by filtering (which can raise SettingWithCopyWarning).
clean_data = clean_data.assign(Installs=installs_numeric, Price=price_numeric)

# Treat prices of 250 and above as outliers and keep only cheaper apps;
# .copy() makes the filtered frame independent so later cells can add columns.
clean_data = clean_data[clean_data['Price'] < 250].copy()
clean_data.sort_values('Price', ascending=False).head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
2281,Vargo Anesthesia Mega App,MEDICAL,4.6,92,32.0,1000,Paid,79.99,Everyone,Medical
1407,LTC AS Legal,MEDICAL,4.0,6,1.3,100,Paid,39.99,Everyone,Medical
2629,I am Rich Person,LIFESTYLE,4.2,134,1.8,1000,Paid,37.99,Everyone,Lifestyle
2481,A Manual of Acupuncture,MEDICAL,3.5,214,68.0,1000,Paid,33.99,Everyone,Medical
2463,PTA Content Master,MEDICAL,4.2,64,41.0,1000,Paid,29.99,Everyone,Medical


# Data Analysis

## Basic overview of the data

In [14]:
# Non-null counts and dtypes of the cleaned dataset
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8184 entries, 21 to 10835
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8184 non-null   object 
 1   Category        8184 non-null   object 
 2   Rating          8184 non-null   float64
 3   Reviews         8184 non-null   int64  
 4   Size_MBs        8184 non-null   float64
 5   Installs        8184 non-null   int64  
 6   Type            8184 non-null   object 
 7   Price           8184 non-null   float64
 8   Content_Rating  8184 non-null   object 
 9   Genres          8184 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 703.3+ KB


In [15]:
# Summary statistics of the numeric columns
clean_data.describe()

Unnamed: 0,Rating,Reviews,Size_MBs,Installs,Price
count,8184.0,8184.0,8184.0,8184.0,8184.0
mean,4.17,255539.38,20.11,9186103.03,0.32
std,0.54,1987030.19,21.66,58301373.66,1.93
min,1.0,1.0,0.01,1.0,0.0
25%,4.0,126.0,4.9,10000.0,0.0
50%,4.3,3027.0,11.0,100000.0,0.0
75%,4.5,43813.0,28.0,1000000.0,0.0
max,5.0,78158306.0,100.0,1000000000.0,79.99


In [16]:
# The five apps with the most reviews
clean_data.sort_values(by='Reviews', ascending=False).head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
10805,Facebook,SOCIAL,4.1,78158306,5.3,1000000000,Free,0.0,Teen,Social
10785,WhatsApp Messenger,COMMUNICATION,4.4,69119316,3.5,1000000000,Free,0.0,Everyone,Communication
10806,Instagram,SOCIAL,4.5,66577313,5.3,1000000000,Free,0.0,Teen,Social
10784,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56642847,3.5,1000000000,Free,0.0,Everyone,Communication
10650,Clash of Clans,GAME,4.6,44891723,98.0,100000000,Free,0.0,Everyone 10+,Strategy


## Working with content-rating categories

In [17]:
# Number of apps per content rating, most common first
content_rating = clean_data['Content_Rating'].value_counts()
content_rating

Content_Rating
Everyone           6607
Teen                911
Mature 17+          357
Everyone 10+        305
Adults only 18+       3
Unrated               1
Name: count, dtype: int64

In [18]:
# Pie chart of the content-rating distribution.
# Sector labels come from `names`; plotly express' `labels` parameter is a dict
# of display-name overrides, so it is not passed here.
fig = px.pie(
    names=content_rating.index,
    values=content_rating.values,
    title="Content Rating",
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# Show the percentage and the label outside each sector
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()

# Analysing App Categories

In [19]:
# Count of distinct app categories
clean_data['Category'].nunique()

33

In [20]:
# The ten categories containing the most apps
catagory_top = clean_data['Category'].value_counts().head(10)
catagory_top

Category
FAMILY             1606
GAME                910
TOOLS               719
PRODUCTIVITY        301
PERSONALIZATION     298
LIFESTYLE           297
FINANCE             296
MEDICAL             292
PHOTOGRAPHY         263
BUSINESS            262
Name: count, dtype: int64

### Categories with the highest number of applications

In [21]:
# One bar per top category, coloured by category name
bar_chat = px.bar(
    x=catagory_top.index,
    y=catagory_top.values,
    color=catagory_top.index,
    title='Application in each Category',
)
bar_chat.update_layout(
    xaxis_title='Category',
    yaxis_title='Number of application',
)
bar_chat.show()

### Categories with the highest downloads

In [22]:
# Total installs per category, sorted in ascending order.
# The 'sum' alias selects pandas' built-in groupby reduction (rather than
# handing the unbound Series.sum method to agg), and the sorted frame is
# rebound instead of being mutated with inplace=True.
category_installs = (
    clean_data
    .groupby('Category')
    .agg({'Installs': 'sum'})
    .sort_values('Installs', ascending=True)
)

# Horizontal bars: one per category, coloured by the install total
horizontal_bar = px.bar(
    x=category_installs.Installs,
    y=category_installs.index,
    orientation='h',
    title='Category Popularity',
    color=category_installs.Installs,
)

# The colour scale duplicates the x axis, so it is hidden
horizontal_bar.update_layout(
    xaxis_title='Number of Downloads',
    yaxis_title='Category',
    coloraxis_showscale=False,
)
horizontal_bar.show()

### Category Concentration - Downloads vs. Competition

In [23]:
# Number of apps per category, used as a measure of competition.
# The 'count' alias selects pandas' built-in groupby reduction rather than
# handing the unbound Series.count method to agg.
catagory_no = clean_data.groupby('Category').agg({'App': 'count'})

# Combine app counts and install totals, keeping categories present in both frames
merged_df = pd.merge(catagory_no, category_installs, on='Category', how="inner")
print(f'The dimensions of the DataFrame are: {merged_df.shape}')
merged_df.sort_values('Installs', ascending=False)

The dimensions of the DataFrame are: (33, 2)


Unnamed: 0_level_0,App,Installs
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
GAME,910,13858762717
COMMUNICATION,257,11039241530
TOOLS,719,8099724500
PRODUCTIVITY,301,5788070180
SOCIAL,203,5487841475
PHOTOGRAPHY,263,4649143130
FAMILY,1606,4437554490
VIDEO_PLAYERS,148,3916897200
TRAVEL_AND_LOCAL,187,2894859300
NEWS_AND_MAGAZINES,204,2369110650


In [24]:
# Apps per category (x) against total installs (y); marker size follows the
# app count and marker colour follows the install total
scatter = px.scatter(
    merged_df,
    x='App',
    y='Installs',
    size='App',
    color='Installs',
    hover_name=merged_df.index,
    title='Category Concentration',
)

# Installs are drawn on a logarithmic axis
scatter.update_layout(
    xaxis_title="Number of Apps (Lower=More Concentrated)",
    yaxis_title="Installs",
    yaxis={'type': 'log'},
)

scatter.show()

# Genres analysis


In [25]:
# Number of distinct values in the Genres column
clean_data['Genres'].unique().size

114

In [26]:

# The five least frequent values of the Genres column
clean_data['Genres'].value_counts().sort_values().head(5)

Genres
Lifestyle;Pretend Play      1
Strategy;Education          1
Adventure;Education         1
Role Playing;Brain Games    1
Tools;Education             1
Name: count, dtype: int64

In [27]:

# Split multi-genre entries on ';' and stack the pieces into one long Series
stack = clean_data['Genres'].str.split(pat=';', expand=True).stack()
print(f'We now have a single column with shape: {stack.shape}')

# Frequency of each individual genre
genres = stack.value_counts()
print(f'Number of genres: {len(genres)}')

We now have a single column with shape: (8564,)
Number of genres: 53


## Bar chart showing competition in each genre

In [28]:
# Bar chart of the ten most frequent genres, shaded by app count
bar = px.bar(
    x=genres.index[:10],
    y=genres.values[:10],
    color=genres.values[:10],
    color_continuous_scale='Agsunset',
    hover_name=genres.index[:10],
    title='Top Genres',
)

# The colour scale repeats the y axis, so it is hidden
bar.update_layout(
    xaxis_title='Genre',
    yaxis_title='Number of Apps',
    coloraxis_showscale=False,
)

bar.show()

# Free vs. Paid Apps

In [29]:
# Number of apps per pricing type (Free / Paid).
# NOTE(review): the name `type` shadows Python's built-in type() for the rest of
# the session. It is kept because the pie-chart cell that follows reads it;
# consider renaming both together (e.g. to `type_counts`).
type = clean_data.Type.value_counts()

In [30]:
# Pie chart of the Free vs. Paid split.
# Sector labels come from `names`; plotly express' `labels` parameter is a dict
# of display-name overrides, so it is not passed here.
fig = px.pie(
    names=type.index,
    values=type.values,
    title="Paid and Free apps",
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# Show the percentage and the label outside each sector
fig.update_traces(textposition='outside', textinfo='percent+label')

fig.show()

##  Sales Revenue Estimate



In [31]:
# Rough revenue estimate: number of installs multiplied by the price.
# assign() returns a new frame, so the column is not written into a frame that
# was produced by filtering another one (the usual SettingWithCopyWarning
# trigger), and re-running the cell gives the same result.
clean_data = clean_data.assign(
    Revenue_Estimate=clean_data['Installs'].mul(clean_data['Price'])
)
clean_data.sort_values('Revenue_Estimate', ascending=False).head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Revenue_Estimate
9220,Minecraft,FAMILY,4.5,2376564,19.0,10000000,Paid,6.99,Everyone 10+,Arcade;Action & Adventure,69900000.0
8825,Hitman Sniper,GAME,4.6,408292,29.0,10000000,Paid,0.99,Mature 17+,Action,9900000.0
7151,Grand Theft Auto: San Andreas,GAME,4.4,348962,26.0,1000000,Paid,6.99,Mature 17+,Action,6990000.0
7477,Facetune - For Free,PHOTOGRAPHY,4.4,49553,48.0,1000000,Paid,5.99,Everyone,Photography,5990000.0
7977,Sleep as Android Unlock,LIFESTYLE,4.5,23966,0.85,1000000,Paid,5.99,Everyone,Lifestyle,5990000.0


## Lost Downloads for Paid Apps


In [32]:

# Distribution of installs for Free vs. Paid apps, with every app drawn as a point
box = px.box(
    clean_data,
    x='Type',
    y='Installs',
    color='Type',
    points='all',
    notched=True,
    title='How Many Downloads are Paid Apps Giving Up?',
)

# Installs are drawn on a logarithmic axis; the figure size is fixed
box.update_layout(
    height=600,
    width=700,
    yaxis={'type': 'log'},
)

box.show()


In [33]:
# Number of apps for every (Category, Type) pair, kept as regular columns.
# The 'count' alias selects pandas' built-in groupby reduction rather than
# handing the unbound Series.count method to agg.
df_free_vs_paid = (
    clean_data
    .groupby(["Category", "Type"], as_index=False)
    .agg({'App': 'count'})
)
df_free_vs_paid.sort_values('App')

Unnamed: 0,Category,Type,App
3,AUTO_AND_VEHICLES,Paid,1
24,FOOD_AND_DRINK,Paid,2
38,NEWS_AND_MAGAZINES,Paid,2
40,PARENTING,Paid,2
17,ENTERTAINMENT,Paid,2
...,...,...,...
31,LIFESTYLE,Free,284
21,FINANCE,Free,289
53,TOOLS,Free,656
25,GAME,Free,834


In [34]:
# Grouped bars: Free and Paid app counts side by side for each category
bar_chat = px.bar(
    df_free_vs_paid,
    x='Category',
    y='App',
    color='Type',
    barmode='group',
    title='Free vs Paid Apps by Category',
)

# Categories are ordered by total app count; counts use a logarithmic axis
bar_chat.update_layout(
    xaxis_title='Category',
    yaxis_title='Number of Apps',
    xaxis={'categoryorder': 'total descending'},
    yaxis={'type': 'log'},
)
bar_chat.show()

## Revenue per App Category

In [35]:
# Subset containing only the paid apps
df_paid_apps = clean_data.loc[clean_data['Type'].eq('Paid')]

In [36]:
# Distribution of the estimated revenue of paid apps within each category
box = px.box(
    df_paid_apps,
    y='Revenue_Estimate',
    x='Category',
    title='How Much Can Paid Apps Earn?',
)

# Categories are ordered by their minimum value; revenue uses a logarithmic axis
box.update_layout(
    xaxis_title='Category',
    yaxis_title='Paid App Ballpark Revenue',
    xaxis={'categoryorder': 'min ascending'},
    yaxis={'type': 'log'},
)

box.show()

## Examine Paid App Pricing Strategies by Category


In [37]:
# Median price across all paid apps
df_paid_apps['Price'].median()

2.99

In [38]:
# Distribution of paid-app prices within each category
box = px.box(
    df_paid_apps,
    y="Price",
    x='Category',
    title='Price per Category',
)

# Categories are ordered by their maximum value; prices use a logarithmic axis
box.update_layout(
    xaxis_title='Category',
    yaxis_title='Paid App Price',
    xaxis={'categoryorder': 'max descending'},
    yaxis={'type': 'log'},
)

box.show()

# Correlation


In [39]:
# Correlation coefficient between an app's price and its install count
price_installs_corr = clean_data.Price.corr(clean_data.Installs)
print("Correlation between Price and Installs:", price_installs_corr)

Correlation between Price and Installs: -0.026069813937200034


In [40]:

# Pairwise correlations between rating, review count and app size
correlation_matrix = clean_data.loc[:, ['Rating', 'Reviews', 'Size_MBs']].corr()

print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
          Rating  Reviews  Size_MBs
Rating      1.00     0.05      0.05
Reviews     0.05     1.00      0.05
Size_MBs    0.05     0.05      1.00
