In [1]:
# Guided Project: Profitable App Profiles for the App Store and Google Play Markets
# from dataquest.io

# In this guided project, you’ll work as a data analyst for a company that builds mobile apps.
# You’ll use Python to provide value through practical data analysis.
# Our goal is to determine the kinds of apps that are likely to attract more users
# because the number of people using our apps affects our revenue.
# To minimize risks and overhead, our validation strategy for an app idea has three steps:

#   - Build a minimal Android version of the app, and add it to Google Play.
#   - If the app has a good response from users, we develop it further.
#   - If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

# Because our end goal is to add the app on both Google Play and the App Store,
# we need to find app profiles that are successful in both markets. 

In [2]:
import os
from pathlib import Path

import pandas as pd

In [3]:
# Open datasets.
# The folder holding the two CSV files can be overridden with the APP_DATA_DIR
# environment variable; when it is not set, the original local folder is used.
DATA_DIR = Path(os.environ.get('APP_DATA_DIR', 'C:\\Users\\pilar\\Documents'))
ios_store = pd.read_csv(DATA_DIR / 'AppleStore.csv')
play_store = pd.read_csv(DATA_DIR / 'googleplaystore.csv')

In [4]:
# First look at datasets: print a heading, then show the first rows of each frame
for heading, frame in (
    ('\n\n Apple Store Dataset', ios_store),
    ('\n\n Play Store Store Dataset', play_store),
):
    print(heading)
    display(frame.head())



 Apple Store Dataset


Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,284882215,Facebook,389879808,USD,0.0,2974676,212,3.5,3.5,95.0,4+,Social Networking,37,1,29,1
1,389801252,Instagram,113954816,USD,0.0,2161558,1289,4.5,4.0,10.23,12+,Photo & Video,37,0,29,1
2,529479190,Clash of Clans,116476928,USD,0.0,2130805,579,4.5,4.5,9.24.12,9+,Games,38,5,18,1
3,420009108,Temple Run,65921024,USD,0.0,1724546,3842,4.5,4.0,1.6.2,9+,Games,40,5,1,1
4,284035177,Pandora - Music & Radio,130242560,USD,0.0,1126879,3594,4.0,4.5,8.4.1,12+,Music,37,4,1,1




 Play Store Store Dataset


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
### Data Cleaning
# We only want to consider apps in English that are free

##### Faulty entries (NaN) and duplicate rows are removed
# Both steps relabel the index 0..n-1 (ignore_index=True).
play_store = (
    play_store
    .dropna(ignore_index=True)            # remove rows containing NaN
    .drop_duplicates(ignore_index=True)   # remove duplicate entries
)

# remove apps that are not in English
# The data does not have a column that indicates language, so apps are removed based on the characters in their names.
# The code points of the characters commonly used in English text are all in the ASCII range 0 to 127.
# Therefore, any app name with TWO OR MORE CHARACTERS outside that range is assumed to be non-English and is deleted from the data.
# The function is defined in the next cell.



In [6]:
# function to identify non-English app names
def check_char(string, max_non_ascii=1):
    """Return True when `string` is assumed to be an English name.

    A name is treated as English when it contains at most `max_non_ascii`
    characters outside the ASCII range (code points 0-127).

    Parameters
    ----------
    string : str
        The app name to inspect.
    max_non_ascii : int, default 1
        Largest number of non-ASCII characters still accepted. With the
        default, two or more non-ASCII characters mark the name as non-English.

    Returns
    -------
    bool
        True if the name is assumed to be English, False otherwise.
    """
    non_ascii_count = sum(1 for letter in string if not letter.isascii())
    return non_ascii_count <= max_non_ascii

In [7]:
# Deletion of non-English language app names.
# A boolean mask filters every row in one pass. Dropping rows one at a time
# copies the frame on every drop, and label lookups by position 0..len-1
# raise KeyError once rows are gone (e.g. when the cell is run a second time).
# The surviving rows keep their index labels.
is_english_name = play_store['App'].map(check_char).astype(bool)
play_store = play_store[is_english_name]

In [8]:
# Next, only free apps are kept, i.e. apps whose price is the string '0'
is_free = play_store['Price'] == '0'
play_store = play_store[is_free]

In [9]:
# The same data cleaning process is used for the iOS App Store
##### Faulty entries (NaN) and duplicate rows are removed
ios_store = (
    ios_store
    .dropna(ignore_index=True)            # remove rows containing NaN
    .drop_duplicates(ignore_index=True)   # remove duplicate entries
)

# Deletion of non-English language app names.
# A boolean mask filters every row in one pass. Dropping rows one at a time
# copies the frame on every drop, and label lookups by position 0..len-1
# raise KeyError once rows are gone (e.g. when the cell is run a second time).
is_english_name = ios_store['track_name'].map(check_char).astype(bool)
ios_store = ios_store[is_english_name]

# Next, only free apps are kept, i.e. apps whose price is zero
ios_store = ios_store[ios_store['price'] == 0]

In [10]:
# To find the most popular genres later on, the Play Store apps are ranked by their number of installs.
# 'Installs' is stored as text (e.g. "10,000+"), so sorting it compares strings instead of numbers.
# The '+' and ',' characters are therefore stripped and the column is converted to float.
# The exact number of installs is not provided, so an app with 10,000+ installs is counted as having 10,000 installs.
installs_numeric = (
    play_store['Installs']
    .astype(str)                            # lets the cell be re-run after the column is already numeric
    .str.replace(r'[+,]', '', regex=True)   # explicit regex: remove every '+' and ','
    .astype(float)
)
# assign() builds a new frame instead of writing a column into the filtered
# play_store slice, which avoids pandas' SettingWithCopyWarning.
play_store = play_store.assign(Installs=installs_numeric)

In [11]:
## Data Analysis
# The analysis starts with a frequency table of the apps per genre
# ("Category" in the Play Store data, "prime_genre" in the App Store data).
# normalize="all" expresses each count as a fraction of all apps in the store.

# Play Store
play_store_genre_share = pd.crosstab(index=play_store['Category'], columns='count', normalize="all")
print("\n Play Store \n", play_store_genre_share.sort_values(by=['count'], ascending=False))

# iOS Apple Store
ios_genre_share = pd.crosstab(index=ios_store['prime_genre'], columns='count', normalize="all")
print("\n IOS Apple Store \n", ios_genre_share.sort_values(by=['count'], ascending=False))

# The frequency tables show that gaming and entertainment apps are the most common in the Apple store,
# while the Play Store has a more balanced mix of gaming/entertainment and practical apps among its most common categories.



 Play Store 
 col_0                   count
Category                     
FAMILY               0.188103
GAME                 0.120191
TOOLS                0.081104
PRODUCTIVITY         0.038476
FINANCE              0.036888
PHOTOGRAPHY          0.035178
COMMUNICATION        0.034811
LIFESTYLE            0.034323
SPORTS               0.032002
BUSINESS             0.031758
HEALTH_AND_FITNESS   0.030658
SOCIAL               0.029559
PERSONALIZATION      0.029559
MEDICAL              0.028826
NEWS_AND_MAGAZINES   0.025284
SHOPPING             0.024307
TRAVEL_AND_LOCAL     0.023818
BOOKS_AND_REFERENCE  0.020398
VIDEO_PLAYERS        0.018932
DATING               0.018566
EDUCATION            0.015146
MAPS_AND_NAVIGATION  0.014169
ENTERTAINMENT        0.013192
FOOD_AND_DRINK       0.012459
AUTO_AND_VEHICLES    0.008672
HOUSE_AND_HOME       0.008062
WEATHER              0.008062
LIBRARIES_AND_DEMO   0.007695
ART_AND_DESIGN       0.006962
COMICS               0.006596
PARENTING            0.00

In [12]:
# The most popular apps within each genre (i.e. the ones with the most users) are determined next
# Popularity is measured by the number of installs
# https://www.statology.org/pandas-top-n-by-group/

# Play Store: new dataset with only the App, Category, and Installs columns
play_store_pop_apps = play_store[['App', 'Category', 'Installs']]

# Shows the top 10 most-installed apps of each category.
# To show only the first app of each category instead, replace the last two steps with
# .drop_duplicates(subset=['Category'], keep='first')
play_store_ranked = play_store_pop_apps.sort_values(by=['Category', 'Installs'], ascending=[True, False])
play_store_top_apps = play_store_ranked.drop_duplicates().groupby(['Category']).head(10)
play_store_top_apps.style.format(precision=1)  # displayed through Styler so all rows are shown

Unnamed: 0,App,Category,Installs
3,Sketch - Draw & Paint,ART_AND_DESIGN,50000000.0
12,Tattoo Name On My Photo Editor,ART_AND_DESIGN,10000000.0
18,ibis Paint X,ART_AND_DESIGN,10000000.0
40,Textgram - write on photos,ART_AND_DESIGN,10000000.0
43,"Canva: Poster, banner, card maker & graphic design",ART_AND_DESIGN,10000000.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,5000000.0
17,FlipaClip - Cartoon animation,ART_AND_DESIGN,5000000.0
35,Floor Plan Creator,ART_AND_DESIGN,5000000.0
7,Infinite Painter,ART_AND_DESIGN,1000000.0
8,Garden Coloring Book,ART_AND_DESIGN,1000000.0


In [13]:
# The most popular genres are determined by the average number of installs per app
(
    play_store_pop_apps
    .groupby('Category')[['Installs']]
    .mean()
    .sort_values(by='Installs', ascending=False)
)

# The most popular genres in the play store are Communication (Messenger, Whatsapp, etc), Social (Facebook, Instagram, ...),
# Video players (Youtube), Productivity (Google Drive, Word, ...), and Photography (Google Photos, ...)

Unnamed: 0_level_0,Installs
Category,Unnamed: 1_level_1
COMMUNICATION,84739940.0
SOCIAL,51710060.0
VIDEO_PLAYERS,40134360.0
PRODUCTIVITY,39560660.0
PHOTOGRAPHY,33747450.0
TRAVEL_AND_LOCAL,32623930.0
GAME,31818500.0
NEWS_AND_MAGAZINES,26048260.0
ENTERTAINMENT,22726480.0
TOOLS,17240850.0


In [14]:
# The iOS Apple store does not contain information about the number of installs
# Instead, the total number of user ratings is used as the popularity measure

# Apple Store: keep only the name, genre, and rating-count columns
ios_store_pop_apps = ios_store[['track_name', 'prime_genre', 'rating_count_tot']]

# Shows the top 10 most-rated apps of each genre
ios_store_ranked = ios_store_pop_apps.sort_values(by=['prime_genre', 'rating_count_tot'], ascending=[True, False])
ios_store_top_apps = ios_store_ranked.drop_duplicates().groupby(['prime_genre']).head(10)
ios_store_top_apps.style.format(precision=1)  # displayed through Styler so all rows are shown

Unnamed: 0,track_name,prime_genre,rating_count_tot
77,"Kindle – Read eBooks, Magazines & Textbooks",Book,252076
208,"Audible – audio books, original series & podcasts",Book,105274
244,Color Therapy Adult Coloring Book for Adults,Book,84062
285,OverDrive – Library eBooks and Audiobooks,Book,65450
364,HOOKED - Chat Stories,Book,47829
2704,BookShout: Read eBooks & Track Your Reading Goals,Book,879
3232,Dr. Seuss Treasury — 50 best kids books,Book,451
3369,Green Riding Hood,Book,392
3954,Weirdwood Manor,Book,197
5875,MangaZERO - comic reader,Book,9


In [15]:
# The most popular genres for the Apple store are determined by the average number of user ratings per app
(
    ios_store_pop_apps
    .groupby('prime_genre')[['rating_count_tot']]
    .mean()
    .sort_values(by='rating_count_tot', ascending=False)
)

# The most popular genres in the Apple store are Navigation (Waze, Google Maps, etc), Reference (Bible, Dictionary, ...),
# Social Networking (Facebook, Pinterest, ...), Music (Pandora, Spotify, ...), and Weather

# Although the genres in each dataset have different names, it's possible to observe that social media apps and music and video apps
# are very popular in both datasets. Weather and navigation apps seem to be much more popular in the Apple Store.
# It's also important to note that although the games category accounts for 58 % of the Apple store apps,
# they are not the ones with the most users. 
# The plan is to pitch an app idea that will first be released in the Play Store and, if successful, introduced
# later to Apple users. Building an app for the Social Networking/Social category would not be attractive
# because there are many apps already and it would be very difficult to compete with giants like Instagram, Facebook, WhatsApp, etc.
# An interesting app genre to invest in could be Tools (in Play Store)/Utilities (Apple Store). It's not the most popular genre,
# but it's somewhere in the middle, which could mean potential for growth because the market is possibly not saturated.
# The most popular apps in this category are apps such as Google Search, VPNs, calculators, antivirus, flashlights, and alarms. 
# An idea for an app could be a calendar or a task/to-do list app. The app could have some features to set it apart,
# such as synchronizing calendars from two or more accounts and organizing events and a to-do list in order of priority
# for the user. It could also suggest when to take breaks, what tasks to start with, and estimate how long a task takes. 


Unnamed: 0_level_0,rating_count_tot
prime_genre,Unnamed: 1_level_1
Navigation,86090.333333
Reference,79350.470588
Social Networking,72916.548077
Music,58205.030769
Weather,54215.296296
Book,46384.916667
Food & Drink,33333.923077
Finance,32367.028571
Travel,31358.5
Photo & Video,28441.54375
