# Profitable App Profiles for the App Store and Google Play Markets
Collecting data for over four million apps requires a significant amount of time and money, so we'll try to analyze a sample of the data instead. To avoid spending resources on collecting new data ourselves, we should first try to see whether we can find any relevant existing data at no cost. Luckily, these are two data sets that seem suitable for our goals:

A data set containing data about approximately ten thousand Android apps from Google Play — the data was collected in August 2018

A data set containing data about approximately seven thousand iOS apps from the App Store — the data was collected in July 2017
# Opening and Exploring the Data


In [1]:
open_file_apple=open('AppleStore.csv')
from csv import reader
read_file_apple=reader(open_file_apple)
data_apple=list(read_file_apple)
apple_header=data_apple[0]
data_apple=data_apple[1:]
open_file_play=open('googleplaystore.csv')
from csv import reader
read_file_play=reader(open_file_play)
data_play=list(read_file_play)
play_header=data_play[0]

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [3]:
explore_data(data_apple,0,6)
print(apple_header)
explore_data(data_play,0,6)
print(play_header)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


['429047995', 'Pinterest', '74778624', 'USD', '0.0', '1061624', '1814', '4.5', '4.0', '6.26', '12+', 'Social Networking', '37', '5', '27', '1']


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont

In [4]:
explore_data(data_apple,0,6,True)
explore_data(data_play,0,6,True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


['429047995', 'Pinterest', '74778624', 'USD', '0.0', '1061624', '1814', '4.5', '4.0', '6.26', '12+', 'Social Networking', '37', '5', '27', '1']


Number of rows: 7197
Number of columns: 16
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Ge

# Deleting Wrong Data 

In [5]:
print(data_play[10472])
print('\n')
print(play_header)
print('\n')
print(data_play[1])

['Xposed Wi-Fi-Pwd', 'PERSONALIZATION', '3.5', '1042', '404k', '100,000+', 'Free', '0', 'Everyone', 'Personalization', 'August 5, 2014', '3.0.0', '4.0.3 and up']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


In [6]:
print(len(data_play))
del data_play[10472]
print(len(data_play))
del data_play[0]

10842
10841


# Removing Duplicate Entries

In [7]:
duplicate_apps = []
unique_apps = []

for app in data_play:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)
    
print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])


Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


In [8]:
reviews_max = {}

for app in data_play:
    name = app[0]
    n_reviews = (app[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
        
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
print('Expected length:', len(data_play) - 1181)
print('Actual length:', len(reviews_max))

Expected length: 9659
Actual length: 9659


In [9]:
android_clean = []
already_added = []

for app in data_play:
    name = app[0]
    n_reviews = (app[3])
    
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name)

# Removing Non-English Apps

In [10]:

def is_english(string):
    non_ascii = 0
    
    for character in string:
        if ord(character) > 127:
            non_ascii += 1
    
    if non_ascii > 3:
        return False
    else:
        return True
android_english=[]
ios_english = []
for app in android_clean:
    name = app[0]
    if is_english(name):
        android_english.append(app)
for app in data_apple:
    name = app[1]
    if is_english(name):
        ios_english.append(app)
explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 

# Isolating the Free Apps

In [11]:
android_final = []
ios_final=[]

for app in android_english:
    price = app[7]
    if price == '0':
        android_final.append(app)
for app in ios_english:
    price = app[4]
    if price == '0.0':
        ios_final.append(app)
        
        

print(len(android_final))
print(len(ios_final))


8861
3222


In [12]:
print(play_header)
print("\n")
print(apple_header)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


# Most Common Apps by Genre

In [13]:
def freq_table(dataset,index):
    table={}
    total=0
    for a in dataset:
        if a[index] in table:
            table[a[index]]+=1
        else:
            table[a[index]]=1
        total+=1
    table_percentage={}
    for key in table:
        table_percentage[key]=(table[key]/total)*100
    return table_percentage
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
        

In [14]:
display_table(ios_final, -5)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


In [15]:
display_table(android_final,1)

FAMILY : 18.936914569461685
GAME : 9.694165444080802
TOOLS : 8.452770567656021
BUSINESS : 4.593161042771697
LIFESTYLE : 3.9047511567543167
PRODUCTIVITY : 3.8934657487868187
FINANCE : 3.7016138133393524
MEDICAL : 3.521047285859384
SPORTS : 3.3969077982169056
PERSONALIZATION : 3.3066245344769216
COMMUNICATION : 3.2389120866719328
HEALTH_AND_FITNESS : 3.080916375126961
PHOTOGRAPHY : 2.9454914795169844
NEWS_AND_MAGAZINES : 2.7987811759395105
SOCIAL : 2.663356280329534
TRAVEL_AND_LOCAL : 2.3360794492720913
SHOPPING : 2.245796185532107
BOOKS_AND_REFERENCE : 2.144227513824625
DATING : 1.8620923146371742
VIDEO_PLAYERS : 1.794379866832186
MAPS_AND_NAVIGATION : 1.3993905879697552
FOOD_AND_DRINK : 1.2413948764247829
EDUCATION : 1.1736824286197944
ENTERTAINMENT : 0.9592596772373322
LIBRARIES_AND_DEMO : 0.9366888613023362
AUTO_AND_VEHICLES : 0.9254034533348381
HOUSE_AND_HOME : 0.8238347816273558
WEATHER : 0.8012639656923597
EVENTS : 0.7109807019523756
PARENTING : 0.6545536621148855
ART_AND_DESIGN :

In [16]:
display_table(android_final,5)

1,000,000+ : 15.743144114659746
100,000+ : 11.544972350750479
10,000,000+ : 10.518000225708159
10,000+ : 10.202008802618215
1,000+ : 8.396343527818532
100+ : 6.91795508407629
5,000,000+ : 6.838957228303803
500,000+ : 5.574991535944025
50,000+ : 4.773727570251665
5,000+ : 4.51416318699921
10+ : 3.5436181017943795
500+ : 3.250197494639431
50,000,000+ : 2.290937817402099
100,000,000+ : 2.1216566978896285
50+ : 1.9185193544746644
5+ : 0.7899785577248618
1+ : 0.5078433585374111
500,000,000+ : 0.27084979121995256
1,000,000,000+ : 0.22570815934996052
0+ : 0.0451416318699921
0 : 0.011285407967498025


# Most Popular Apps by Genre on Google Play

In [18]:
cat= freq_table(android_final, 1)

for c in cat:
    total = 0
    len_category = 0
    for app in android_final:
        category_app = app[1]
        if category_app == c:            
            n_installs = app[5]
            n_installs = n_installs.replace(',', '')
            n_installs = n_installs.replace('+', '')
            total += float(n_installs)
            len_category += 1
    avg_n_installs = total / len_category
    print(c, ':', avg_n_installs)

ENTERTAINMENT : 11640705.88235294
COMICS : 817657.2727272727
SPORTS : 3638640.1428571427
GAME : 15560965.599534342
HOUSE_AND_HOME : 1331540.5616438356
EDUCATION : 1820673.076923077
MAPS_AND_NAVIGATION : 4056941.7741935486
LIFESTYLE : 1437816.2687861272
DATING : 854028.8303030303
PERSONALIZATION : 5218893.815699658
PARENTING : 542603.6206896552
HEALTH_AND_FITNESS : 4188821.9853479853
AUTO_AND_VEHICLES : 647317.8170731707
NEWS_AND_MAGAZINES : 9549178.467741935
MEDICAL : 120616.48717948717
COMMUNICATION : 38456119.167247385
PHOTOGRAPHY : 17805627.643678162
FINANCE : 1387692.475609756
FAMILY : 3694276.334922527
LIBRARIES_AND_DEMO : 638503.734939759
EVENTS : 253542.22222222222
VIDEO_PLAYERS : 24727872.452830188
SHOPPING : 7036877.311557789
BUSINESS : 1712290.1474201474
WEATHER : 5074486.197183099
SOCIAL : 23253652.127118643
TOOLS : 10682301.033377837
TRAVEL_AND_LOCAL : 13984077.710144928
BEAUTY : 513151.88679245283
FOOD_AND_DRINK : 1924897.7363636363
ART_AND_DESIGN : 1986335.0877192982
BOOK