In [136]:
# Analyse the Android App Store Data

import pandas as pd

# Show numeric output in decimal format e.g., 2.15
pd.options.display.float_format = '{:,.2f}'.format

In [137]:
df_apps = pd.read_csv('apps.csv')

In [3]:
df_apps.shape

(10841, 12)

In [4]:
df_apps.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MBs', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres', 'Last_Updated', 'Android_Ver'],
      dtype='object')

In [5]:
df_apps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Android_Ver
0,Ak Parti Yardım Toplama,SOCIAL,,0,8.7,0,Paid,$13.99,Teen,Social,"July 28, 2017",4.1 and up
1,Ain Arabic Kids Alif Ba ta,FAMILY,,0,33.0,0,Paid,$2.99,Everyone,Education,"April 15, 2016",3.0 and up
2,Popsicle Launcher for Android P 9.0 launcher,PERSONALIZATION,,0,5.5,0,Paid,$1.49,Everyone,Personalization,"July 11, 2018",4.2 and up
3,Command & Conquer: Rivals,FAMILY,,0,19.0,0,,0,Everyone 10+,Strategy,"June 28, 2018",Varies with device
4,CX Network,BUSINESS,,0,10.0,0,Free,0,Everyone,Business,"August 6, 2018",4.1 and up


In [6]:
# randomly select 5 rows from Pandas DataFrame
df_apps.sample(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Android_Ver
1802,Dialer theme G Black Gold,PERSONALIZATION,4.2,33,0.84,500,Paid,$1.49,Everyone,Personalization,"January 1, 2016",4.0 and up
7949,Learn to code with el Chavo,FAMILY,3.9,5933,60.0,1000000,Free,0,Everyone,Educational,"March 27, 2018",4.1 and up
10201,Hotspot Shield Free VPN Proxy & Wi-Fi Security,TOOLS,4.2,1116393,8.2,50000000,Free,0,Everyone,Tools,"August 6, 2018",4.1 and up
4231,Bixby Button Remapper - bxActions Pro / Coffee,TOOLS,3.9,319,0.02,10000,Paid,$2.99,Everyone,Tools,"April 28, 2017",7.0 and up
3401,Learn Artificial Intelligence,FAMILY,4.6,27,4.2,10000,Free,0,Everyone,Education,"July 14, 2018",4.1 and up


In [7]:
# By specifying the column axis (axis='columns'), the drop() method removes the specified column
# By specifying the row axis (axis='index'), the drop() method removes the specified row.
# drop columns Last_updated and Android_Version
updated_df = df_apps.drop(['Last_Updated','Android_Ver'], axis='columns')
updated_df.shape

(10841, 10)

In [8]:
updated_df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size_MBs             0
Installs             0
Type                 1
Price                0
Content_Rating       0
Genres               0
dtype: int64

In [9]:
# dropping the rows having NaN values
df_apps_clean = updated_df.dropna()
 
# To reset the indices
df_apps_clean = df_apps_clean.reset_index(drop=True)

In [10]:
df_apps_clean

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,KBA-EZ Health Guide,MEDICAL,5.00,4,25.00,1,Free,0,Everyone,Medical
1,Ra Ga Ba,GAME,5.00,2,20.00,1,Paid,$1.49,Everyone,Arcade
2,Mu.F.O.,GAME,5.00,2,16.00,1,Paid,$0.99,Everyone,Arcade
3,Brick Breaker BR,GAME,5.00,7,19.00,5,Free,0,Everyone,Arcade
4,Anatomy & Physiology Vocabulary Exam Review App,MEDICAL,5.00,1,4.60,5,Free,0,Everyone,Medical
...,...,...,...,...,...,...,...,...,...,...
9362,Subway Surfers,GAME,4.50,27723193,76.00,1000000000,Free,0,Everyone 10+,Arcade
9363,Subway Surfers,GAME,4.50,27724094,76.00,1000000000,Free,0,Everyone 10+,Arcade
9364,Subway Surfers,GAME,4.50,27725352,76.00,1000000000,Free,0,Everyone 10+,Arcade
9365,Subway Surfers,GAME,4.50,27725352,76.00,1000000000,Free,0,Everyone 10+,Arcade


In [11]:
df_apps_clean.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size_MBs          0
Installs          0
Type              0
Price             0
Content_Rating    0
Genres            0
dtype: int64

In [12]:
#Check for duplicate
df_apps_clean.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9362    False
9363    False
9364    False
9365     True
9366    False
Length: 9367, dtype: bool

In [13]:
# Check if any duplicate values
duplicated_rows = df_apps_clean[df_apps_clean.duplicated()]
print(duplicated_rows)
duplicated_rows.head()

                                             App            Category  Rating  \
212                       420 BZ Budeze Delivery             MEDICAL    5.00   
281                                  MouseMingle              DATING    2.70   
308   Cardiac diagnosis (heart rate, arrhythmia)             MEDICAL    4.40   
327                                 Sway Medical             MEDICAL    5.00   
335               Chat Kids - Chat Room For Kids              DATING    4.70   
...                                          ...                 ...     ...   
9328               Skype - free IM & video calls       COMMUNICATION    4.10   
9335                                   Instagram              SOCIAL    4.50   
9352                                Google Drive        PRODUCTIVITY    4.40   
9358                                 Google News  NEWS_AND_MAGAZINES    3.90   
9365                              Subway Surfers                GAME    4.50   

       Reviews  Size_MBs       Installs

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
212,420 BZ Budeze Delivery,MEDICAL,5.0,2,11.0,100,Free,0,Mature 17+,Medical
281,MouseMingle,DATING,2.7,3,3.9,100,Free,0,Mature 17+,Dating
308,"Cardiac diagnosis (heart rate, arrhythmia)",MEDICAL,4.4,8,6.5,100,Paid,$12.99,Everyone,Medical
327,Sway Medical,MEDICAL,5.0,3,22.0,100,Free,0,Everyone,Medical
335,Chat Kids - Chat Room For Kids,DATING,4.7,6,4.9,100,Free,0,Mature 17+,Dating


In [14]:
# Find the duplicate value
#  check for an individual app like ‘Instagram’ by looking up all the entries with that name in the App column.
df_apps_clean[df_apps_clean.App == 'Instagram']

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
9332,Instagram,SOCIAL,4.5,66577313,5.3,1000000000,Free,0,Teen,Social
9334,Instagram,SOCIAL,4.5,66577446,5.3,1000000000,Free,0,Teen,Social
9335,Instagram,SOCIAL,4.5,66577313,5.3,1000000000,Free,0,Teen,Social
9336,Instagram,SOCIAL,4.5,66509917,5.3,1000000000,Free,0,Teen,Social


In [15]:
# Remove the duplicate 
# need to specify the subset for identifying duplciates
df_apps_clean = df_apps_clean.drop_duplicates(subset=['App', 'Type', 'Price'])
df_apps_clean[df_apps_clean.App == 'Instagram']

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
9332,Instagram,SOCIAL,4.5,66577313,5.3,1000000000,Free,0,Teen,Social


In [16]:
df_apps_clean.shape

(8199, 10)

In [17]:
# Find the highest rated app
df_apps_clean.sort_values('Rating',ascending=False).head()

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,KBA-EZ Health Guide,MEDICAL,5.0,4,25.0,1,Free,0,Everyone,Medical
326,Sway Medical,MEDICAL,5.0,3,22.0,100,Free,0,Everyone,Medical
323,AJ Men's Grooming,LIFESTYLE,5.0,2,22.0,100,Free,0,Everyone,Lifestyle
321,FK Dedinje BGD,SPORTS,5.0,36,2.6,100,Free,0,Everyone,Sports
320,CB VIDEO VISION,PHOTOGRAPHY,5.0,13,2.6,100,Free,0,Everyone,Photography


In [18]:
# Find the app with largest size in MB
df_apps_clean.sort_values('Size_MBs',ascending=False).head(n=10)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
8468,Talking Babsy Baby: Baby Games,LIFESTYLE,4.0,140995,100.0,10000000,Free,0,Everyone,Lifestyle;Pretend Play
9213,Hungry Shark Evolution,GAME,4.5,6074334,100.0,100000000,Free,0,Teen,Arcade
8469,Miami crime simulator,GAME,4.0,254518,100.0,10000000,Free,0,Mature 17+,Action
8470,Gangster Town: Vice District,FAMILY,4.3,65146,100.0,10000000,Free,0,Mature 17+,Simulation
1749,Vi Trainer,HEALTH_AND_FITNESS,3.6,124,100.0,5000,Free,0,Everyone,Health & Fitness
8471,Ultimate Tennis,SPORTS,4.3,183004,100.0,10000000,Free,0,Everyone,Sports
6452,Post Bank,FINANCE,4.5,60449,100.0,1000000,Free,0,Everyone,Finance
6453,The Walking Dead: Our World,GAME,4.0,22435,100.0,1000000,Free,0,Teen,Action
6454,Stickman Legends: Shadow Wars,GAME,4.4,38419,100.0,1000000,Paid,$0.99,Everyone 10+,Action
2740,Car Crash III Beam DH Real Damage Simulator 2018,GAME,3.6,151,100.0,10000,Free,0,Everyone,Racing


In [19]:
# Find the app with highest review and check if any paid apps are included in top 50
df_apps_clean.sort_values('Reviews',ascending=False).head(n=50)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
9331,Facebook,SOCIAL,4.1,78158306,5.3,1000000000,Free,0,Teen,Social
9311,WhatsApp Messenger,COMMUNICATION,4.4,69119316,3.5,1000000000,Free,0,Everyone,Communication
9332,Instagram,SOCIAL,4.5,66577313,5.3,1000000000,Free,0,Teen,Social
9310,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56642847,3.5,1000000000,Free,0,Everyone,Communication
9176,Clash of Clans,GAME,4.6,44891723,98.0,100000000,Free,0,Everyone 10+,Strategy
9270,Clean Master- Space Cleaner & Antivirus,TOOLS,4.7,42916526,3.4,500000000,Free,0,Everyone,Tools
9361,Subway Surfers,GAME,4.5,27722264,76.0,1000000000,Free,0,Everyone 10+,Arcade
9354,YouTube,VIDEO_PLAYERS,4.3,25655305,4.65,1000000000,Free,0,Teen,Video Players & Editors
9272,"Security Master - Antivirus, VPN, AppLock, Boo...",TOOLS,4.7,24900999,3.4,500000000,Free,0,Everyone,Tools
9110,Clash Royale,GAME,4.6,23133508,97.0,100000000,Free,0,Everyone 10+,Strategy


In [20]:
# number of occurrences of each content rating
ratings = df_apps_clean.Content_Rating.value_counts()
ratings

Content_Rating
Everyone           6621
Teen                912
Mature 17+          357
Everyone 10+        305
Adults only 18+       3
Unrated               1
Name: count, dtype: int64

In [21]:
import plotly.express as px

#create a graph object
pie_chart = px.pie(title='Ratings Pie Chart', labels = ratings.index, values=ratings.values, names=ratings.index)

pie_chart.show()



In [22]:
# to configure other aspect than the parameters of .pie(), we can use .update_traces()

pie_chart = px.pie(title='Content Ratings', labels = ratings.index, values=ratings.values, names=ratings.index)

pie_chart.update_traces(textposition='outside', textinfo='percent+label')

pie_chart.show()

In [23]:
# create a donut chart
pie_chart = px.pie(title='Content Ratings', labels = ratings.index, values=ratings.values, names=ratings.index, hole = 0.5)

pie_chart.update_traces(textposition='inside', textfont_size=15, textinfo='percent')

pie_chart.show()

In [24]:
df_apps_clean.columns


Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MBs', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres'],
      dtype='object')

In [25]:
# Check the datatype of the Installs column
df_apps_clean.Installs.describe()

count          8199
unique           19
top       1,000,000
freq           1417
Name: Installs, dtype: object

In [26]:
df_apps_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8199 entries, 0 to 9361
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8199 non-null   object 
 1   Category        8199 non-null   object 
 2   Rating          8199 non-null   float64
 3   Reviews         8199 non-null   int64  
 4   Size_MBs        8199 non-null   float64
 5   Installs        8199 non-null   object 
 6   Type            8199 non-null   object 
 7   Price           8199 non-null   object 
 8   Content_Rating  8199 non-null   object 
 9   Genres          8199 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 704.6+ KB


In [27]:
# number of apps at each level of installations
df_apps_clean.groupby('Installs').count().App

Installs
1                   3
1,000             698
1,000,000        1417
1,000,000,000      20
10                 69
10,000            988
10,000,000        933
100               303
100,000          1096
100,000,000       189
5                   9
5,000             425
5,000,000         607
50                 56
50,000            457
50,000,000        202
500               199
500,000           504
500,000,000        24
Name: App, dtype: int64

In [28]:
# number of apps at each level of installations by removing comma
df_apps_clean.Installs = df_apps_clean.Installs.astype(str).str.replace(',', "")

# convert the Installs column data to numeric data
df_apps_clean.Installs = pd.to_numeric(df_apps_clean.Installs)

# count the number of apps at each level
df_apps_clean[['App', 'Installs']].groupby('Installs').count()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,App
Installs,Unnamed: 1_level_1
1,3
5,9
10,69
50,56
100,303
500,199
1000,698
5000,425
10000,988
50000,457


In [29]:
# Remove the $ character from price column
df_apps_clean.Price = df_apps_clean.Price.astype(str).str.replace('$', "")

# convert the price column to numeric
df_apps_clean.Price = pd.to_numeric(df_apps_clean.Price)

df_apps_clean[['App','Price']].sort_values('Price', ascending=False).head(n=20)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,App,Price
2515,I'm Rich - Trump Edition,400.0
1163,I AM RICH PRO PLUS,399.99
3155,I Am Rich Premium,399.99
1750,I am rich(premium),399.99
2143,💎 I'm rich,399.99
4298,I am rich,399.99
765,I am rich (Most expensive app),399.99
1420,I Am Rich Pro,399.99
1824,I am Rich Plus,399.99
1719,I am Rich,399.99


In [30]:
# Remove all App the cost more than $250
df_apps_clean = df_apps_clean[df_apps_clean['Price'] < 250]
df_apps_clean.sort_values('Price', ascending=False).head()

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
1027,Vargo Anesthesia Mega App,MEDICAL,4.6,92,32.0,1000,Paid,79.99,Everyone,Medical
414,LTC AS Legal,MEDICAL,4.0,6,1.3,100,Paid,39.99,Everyone,Medical
1291,I am Rich Person,LIFESTYLE,4.2,134,1.8,1000,Paid,37.99,Everyone,Lifestyle
1179,A Manual of Acupuncture,MEDICAL,3.5,214,68.0,1000,Paid,33.99,Everyone,Medical
1165,PTA Content Master,MEDICAL,4.2,64,41.0,1000,Paid,29.99,Everyone,Medical


In [31]:
# Add new column 'Revenue Estimate' and Get the highest grossing paid app
df_apps_clean['Revenue Estimate'] = df_apps_clean.Installs.mul(df_apps_clean.Price)
df_apps_clean.sort_values('Revenue Estimate',ascending=False)[:10]

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Revenue Estimate
7746,Minecraft,FAMILY,4.5,2376564,19.0,10000000,Paid,6.99,Everyone 10+,Arcade;Action & Adventure,69900000.0
7351,Hitman Sniper,GAME,4.6,408292,29.0,10000000,Paid,0.99,Mature 17+,Action,9900000.0
5678,Grand Theft Auto: San Andreas,GAME,4.4,348962,26.0,1000000,Paid,6.99,Mature 17+,Action,6990000.0
6003,Facetune - For Free,PHOTOGRAPHY,4.4,49553,48.0,1000000,Paid,5.99,Everyone,Photography,5990000.0
6503,Sleep as Android Unlock,LIFESTYLE,4.5,23966,0.85,1000000,Paid,5.99,Everyone,Lifestyle,5990000.0
5122,DraStic DS Emulator,GAME,4.6,87766,12.0,1000000,Paid,4.99,Everyone,Action,4990000.0
4611,Weather Live,WEATHER,4.5,76593,4.75,500000,Paid,5.99,Everyone,Weather,2995000.0
6480,Bloons TD 5,FAMILY,4.6,190086,94.0,1000000,Paid,2.99,Everyone,Strategy,2990000.0
6159,Five Nights at Freddy's,GAME,4.6,100805,50.0,1000000,Paid,2.99,Teen,Action,2990000.0
5274,Card Wars - Adventure Time,FAMILY,4.3,129603,23.0,1000000,Paid,2.99,Everyone 10+,Card;Action & Adventure,2990000.0


In [32]:
# find the number of different categories
df_apps_clean.Category.nunique()

33

In [33]:
# Get the top 10 category based on number of apps in each category
top10_category = df_apps_clean.Category.value_counts()[:10]
top10_category

Category
FAMILY             1606
GAME                910
TOOLS               719
PRODUCTIVITY        301
PERSONALIZATION     298
LIFESTYLE           297
FINANCE             296
MEDICAL             292
PHOTOGRAPHY         263
BUSINESS            262
Name: count, dtype: int64

In [34]:
bar = px.bar(x = top10_category.index, y = top10_category.values)

bar.show()

In [35]:
# group all our apps by category and sum the number of installations
category_installs = df_apps_clean.groupby('Category').agg({'Installs' : pd.Series.sum})
category_installs.sort_values('Installs', ascending=True, inplace=True)

In [36]:
# horizontal bar
h_bar = px.bar(x = category_installs.Installs, y = category_installs.index, orientation='h', title='Category Popularity')

h_bar.update_layout(xaxis_title='Number of Downloads', yaxis_title='Category')

h_bar.show()

In [37]:
# Dataframe that has number of app and number of installs in the columns
cat_number = df_apps_clean.groupby('Category').agg({'App' : pd.Series.count})

# combine two dataframe
cat_merged_df = pd.merge(cat_number, category_installs, on='Category', how='inner')

cat_merged_df.sort_values('Installs',ascending=False)

Unnamed: 0_level_0,App,Installs
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
GAME,910,13858762717
COMMUNICATION,257,11039241530
TOOLS,719,8099724500
PRODUCTIVITY,301,5788070180
SOCIAL,203,5487841475
PHOTOGRAPHY,263,4649143130
FAMILY,1606,4437554490
VIDEO_PLAYERS,148,3916897200
TRAVEL_AND_LOCAL,187,2894859300
NEWS_AND_MAGAZINES,204,2369110650


In [38]:
#Scatter Chart
scatter = px.scatter(cat_merged_df , x='App', y='Installs', title='Category COncentration', size='App', hover_name=cat_merged_df.index, color='Installs')

# y-axis should be on a log-scale like so: yaxis=dict(type='log')
scatter.update_layout(xaxis_title = 'Number of Apps (Lower=More Concentrated)', yaxis_title='Installs', yaxis=dict(type='log'))

scatter.show()

In [39]:
# how many genre types are there
df_type = df_apps_clean.Genres
print(len(df_type.unique()))



114


In [40]:
df_apps_clean.Genres.value_counts().sort_values(ascending=True)

Genres
Lifestyle;Pretend Play        1
Strategy;Education            1
Adventure;Education           1
Role Playing;Brain Games      1
Tools;Education               1
                           ... 
Personalization             298
Productivity                301
Education                   429
Entertainment               467
Tools                       718
Name: count, Length: 114, dtype: int64

In [41]:
# Split the strings on the semi-colon and then .stack them.
stack = df_apps_clean.Genres.str.split(';',expand=True).stack()

print(f'We now have a single column with shape: {stack.shape}')

num_genres = stack.value_counts()
print(f'Number of genres: {len(num_genres)}')



We now have a single column with shape: (8564,)
Number of genres: 53


In [42]:
bar_Chart = px.bar(x=num_genres.index[:15], y = num_genres.values[:15], title='Top Genres', hover_name=num_genres.index[:15], color = num_genres.values[:15], color_continuous_scale='Agsunset')

bar_Chart.update_layout(xaxis_title='Genre', yaxis_title='Number of Apps', coloraxis_showscale=False)
bar_Chart.show()

In [46]:
df_apps_clean.Type.value_counts()

#df_apps_clean.columns

Type
Free    7595
Paid     589
Name: count, dtype: int64

In [54]:
df_free_vs_paid = df_apps_clean.groupby(['Category', 'Type'], as_index=False).agg({'App':pd.Series.count})

df_free_vs_paid.head()

Unnamed: 0,Category,Type,App
0,ART_AND_DESIGN,Free,58
1,ART_AND_DESIGN,Paid,3
2,AUTO_AND_VEHICLES,Free,72
3,AUTO_AND_VEHICLES,Paid,1
4,BEAUTY,Free,42


In [114]:
df_apps_clean.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MBs', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres', 'Revenue Estimate'],
      dtype='object')

In [56]:
df_free_vs_paid.sort_values(by='App')

Unnamed: 0,Category,Type,App
3,AUTO_AND_VEHICLES,Paid,1
24,FOOD_AND_DRINK,Paid,2
38,NEWS_AND_MAGAZINES,Paid,2
40,PARENTING,Paid,2
17,ENTERTAINMENT,Paid,2
...,...,...,...
31,LIFESTYLE,Free,284
21,FINANCE,Free,289
53,TOOLS,Free,656
25,GAME,Free,834


In [98]:
grouped_bar = px.bar(df_free_vs_paid,
                     x='Category',
                     y='App',
                     title='Free vs Paid Apps by Category',
                     color='Type',
                     barmode='group')

grouped_bar.update_layout(xaxis_title='Category',
                          yaxis_title='Number of Apps',
                          xaxis={'categoryorder':'total descending'},
                          yaxis=dict(type='log'))

grouped_bar.show()






In [113]:
#Box plot
box_fig = px.box(df_apps_clean, x='Type', y='Installs',points='all', color='Type',title='How Many Downloads are Paid Apps Giving Up?',notched=True)
box_fig.update_layout(yaxis=dict(type='log'))

box_fig.show()

In [128]:
df_paid_apps = df_apps_clean[df_apps_clean['Type'] == 'Paid']

box_fig_category = px.box(df_paid_apps,
                     x='Category',
                     y='Revenue Estimate',
                     title='How Much Can Paid Apps Earn?'
                    )

box_fig_category.update_layout(xaxis_title='Category',
                         yaxis_title='Paid App Ballpark Revenue',
                          xaxis={'categoryorder':'min ascending'},
                          yaxis=dict(type='log'))


box_fig_category.show()

In [132]:
# Compare Pricing by Category

box_fig_category = px.box(df_paid_apps,
                     x='Category',
                     y='Price',
                     title='Price Per Category'
                    )

box_fig_category.update_layout(xaxis_title='Category',
                         yaxis_title='Paid App Price',
                          xaxis={'categoryorder':'max descending'},
                          yaxis=dict(type='log'))


box_fig_category.show()


In [133]:
#Median price of paid android app
df_paid_apps.Price.median()

2.99

In [135]:
# pull a random 5 samples from dataframe
df_paid_apps.sample(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Revenue Estimate
1581,Tetrobot and Co.,FAMILY,4.5,709,18.0,5000,Paid,2.99,Everyone,Puzzle,14950.0
3430,Sonic 4™ Episode I,GAME,3.7,8014,37.0,100000,Paid,2.99,Everyone,Arcade,299000.0
5571,Tasker,TOOLS,4.6,43045,3.4,1000000,Paid,2.99,Everyone,Tools,2990000.0
3713,Kernel Manager for Franco Kernel ✨,TOOLS,4.8,12700,10.0,100000,Paid,3.49,Everyone,Tools,349000.0
1307,Ad Remove Plugin for App2SD,PRODUCTIVITY,4.1,66,0.02,1000,Paid,1.29,Everyone,Productivity,1290.0
