# Analyzing AppStore Data

This project analyzes data fetched from app stores in order to find out most profitable option.

In [1]:
import csv
reader = csv.reader

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [3]:
applef = reader(open("AppleStore.csv"))
apple = list(applef)
googlef = reader(open("googleplaystore.csv"))
google = list(googlef)

In [32]:
explore_data(google,2,3, True)
explore_data(apple,2,3, True)


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 8863
Number of columns: 13
['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 3220
Number of columns: 16


We create headers below:

In [5]:
gheader = google[0]
print(gheader)
google = google[1:]
aheader = apple[0]
print(aheader)
apple = apple[1:]

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


## Clear Data
Check if there is any faulty entries

In [6]:
for i in range(len(google)-1):
    if len(google[i]) != len(gheader):
        print("Dropping from google:\n",google[i])
        del google[i]
for i in range(len(apple)):
    if len(apple[i]) != len(aheader):
        print("Dropping from apple:\n",apple[i])
        del apple[i]

Dropping from google:
 ['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


Check if there is  duplicate applications

In [7]:
g_unique_apps = []
g_dup_apps = []
for app in google:
    name = app[0]
    if name in g_unique_apps:
        g_dup_apps.append(name)
    else:
        g_unique_apps.append(name)
print(len(g_unique_apps))
print(len(g_dup_apps))
        
a_unique_apps = []
a_dup_apps = []
for app in apple:
    name = app[1]
    if name in a_unique_apps:
        a_dup_apps.append(name)
    else:
        a_unique_apps.append(name)
print(len(a_unique_apps))
print(len(a_dup_apps))
        

9659
1181
7195
2


Note that we won't drop duplicates randomly. Most recent entry is chosen by functipn below.

In [8]:
g_name_vs_review = {}
for i in google:
    name = i[0]
    cur_rev = float(i[3])
    if name not in g_name_vs_review:
        g_name_vs_review[name] = cur_rev
    else:
        if cur_rev > g_name_vs_review[name]:
            g_name_vs_review[name] = cur_rev
print(len(g_name_vs_review))

a_name_vs_review = {}
for i in apple:
    name = i[1]
    cur_rev = float(i[5])
    if name not in a_name_vs_review:
        a_name_vs_review[name] = cur_rev
    else:
        if cur_rev > a_name_vs_review[name]:
            a_name_vs_review[name] = cur_rev
print(len(a_name_vs_review))

9659
7195


Function below will create clean google list:

In [9]:
google_clean = []
added_apps = []
for app in google:
    name = app[0]
    rev = float(app[3])
    
    if (g_name_vs_review[name] == rev) and (name not in added_apps):
        google_clean.append(app)
        added_apps.append(name)
print(len(google_clean))

apple_clean = []
added_apps = []
for app in apple:
    name = app[1]
    rev = float(app[5])
    
    if (a_name_vs_review[name] == rev) and (name not in added_apps):
        apple_clean.append(app)
        added_apps.append(name)
print(len(apple_clean))

google = google_clean
apple  = apple_clean

9659
7195


Check for if english

In [11]:
def english(name):
    count = 0
    for char in name:
        if ord(char) > 127:
            count += 1
    if count > 3:
        return False
    else:
        return True

google_english = []
apple_english = []

for app in google:
    name = app[0]
    if english(name):
        google_english.append(app)
print(len(google_english))

for app in apple:
    name = app[1]
    if english(name):
        apple_english.append(app)
print(len(apple_english))

google = google_english
apple = apple_english

9614
6181


In [28]:
g_not_free = []
a_not_free = []
g_free = []
a_free = []
for i in range(len(google)):
    name=google[i][0]
    price=google[i][6]
    if price != "Free":
        g_not_free.append(name)
    else:
        g_free.append(google[i])

a_not_free = []
for i in range(len(apple)):
    name=apple[i][1]
    price=apple[i][4]
    if price != "0.0":
        a_not_free.append(name)
    else:
        a_free.append(apple[i])

print(len(g_free), len(a_free))

8863 3220


Reset the datasets:

In [29]:
google = g_free
apple = a_free

Since we want to build an app that succeds in both markets, we would like find out app profiles that are successfull in both markets. To achieve that, we may first look for most common genres:

In [36]:
def genre_freq(genre,dataset):
    if genre in genres[dataset]:
        genres[dataset][genre] += 1
    else:
        genres[dataset][genre] = 1

Function

In [50]:
google_genres = {}
apple_genres = {}
genres =  [google_genres,apple_genres]
for app in google:
    genre = app[1]
    genre_freq(genre,0)
print(google_genres)
for app in apple:
    genre = app[11]
    genre_freq(genre,1)
print(apple_genres)
    

{'ART_AND_DESIGN': 57, 'AUTO_AND_VEHICLES': 82, 'BEAUTY': 53, 'BOOKS_AND_REFERENCE': 190, 'BUSINESS': 407, 'COMICS': 55, 'COMMUNICATION': 287, 'DATING': 165, 'EDUCATION': 103, 'ENTERTAINMENT': 85, 'EVENTS': 63, 'FINANCE': 328, 'FOOD_AND_DRINK': 110, 'HEALTH_AND_FITNESS': 273, 'HOUSE_AND_HOME': 73, 'LIBRARIES_AND_DEMO': 83, 'LIFESTYLE': 346, 'GAME': 862, 'FAMILY': 1675, 'MEDICAL': 313, 'SOCIAL': 236, 'SHOPPING': 199, 'PHOTOGRAPHY': 261, 'SPORTS': 301, 'TRAVEL_AND_LOCAL': 207, 'TOOLS': 750, 'PERSONALIZATION': 294, 'PRODUCTIVITY': 345, 'PARENTING': 58, 'WEATHER': 71, 'VIDEO_PLAYERS': 159, 'NEWS_AND_MAGAZINES': 248, 'MAPS_AND_NAVIGATION': 124}
{'Social Networking': 106, 'Photo & Video': 160, 'Games': 1872, 'Music': 66, 'Reference': 18, 'Health & Fitness': 65, 'Weather': 28, 'Utilities': 81, 'Travel': 40, 'Shopping': 84, 'News': 43, 'Navigation': 6, 'Lifestyle': 51, 'Entertainment': 254, 'Food & Drink': 26, 'Sports': 69, 'Book': 14, 'Finance': 36, 'Education': 118, 'Productivity': 56, 'Busi

for google:

In [53]:
reversed_google_genres = {v: k for k, v in google_genres.items()}
g_max_list = []
for key in google_genres:
    g_max_list.append(google_genres[key])
g_max_list = sorted(g_max_list,reverse=True)

c = 0
google_genres = {}
for i in g_max_list:
    name = reversed_google_genres[i]
    value = i
    google_genres[name] = value
print(google_genres)

FAMILY 1675
GAME 862
TOOLS 750
BUSINESS 407
LIFESTYLE 346
PRODUCTIVITY 345
FINANCE 328
MEDICAL 313
SPORTS 301
PERSONALIZATION 294
COMMUNICATION 287
HEALTH_AND_FITNESS 273
PHOTOGRAPHY 261
NEWS_AND_MAGAZINES 248
SOCIAL 236
TRAVEL_AND_LOCAL 207
SHOPPING 199
BOOKS_AND_REFERENCE 190
DATING 165
VIDEO_PLAYERS 159
MAPS_AND_NAVIGATION 124
FOOD_AND_DRINK 110
EDUCATION 103
ENTERTAINMENT 85
LIBRARIES_AND_DEMO 83
AUTO_AND_VEHICLES 82
HOUSE_AND_HOME 73
WEATHER 71
EVENTS 63
PARENTING 58
ART_AND_DESIGN 57
COMICS 55
BEAUTY 53
{'FAMILY': 1675, 'GAME': 862, 'TOOLS': 750, 'BUSINESS': 407, 'LIFESTYLE': 346, 'PRODUCTIVITY': 345, 'FINANCE': 328, 'MEDICAL': 313, 'SPORTS': 301, 'PERSONALIZATION': 294, 'COMMUNICATION': 287, 'HEALTH_AND_FITNESS': 273, 'PHOTOGRAPHY': 261, 'NEWS_AND_MAGAZINES': 248, 'SOCIAL': 236, 'TRAVEL_AND_LOCAL': 207, 'SHOPPING': 199, 'BOOKS_AND_REFERENCE': 190, 'DATING': 165, 'VIDEO_PLAYERS': 159, 'MAPS_AND_NAVIGATION': 124, 'FOOD_AND_DRINK': 110, 'EDUCATION': 103, 'ENTERTAINMENT': 85, 'LIBRA

for apple:

In [54]:
reversed_apple_genres = {v: k for k, v in apple_genres.items()}
a_max_list = []
for key in apple_genres:
    a_max_list.append(apple_genres[key])
a_max_list = sorted(a_max_list,reverse=True)

c = 0
apple_genres = {}
for i in a_max_list:
    name = reversed_apple_genres[i]
    value = i
    apple_genres[name] = value
print(apple_genres)
    

{'Games': 1872, 'Entertainment': 254, 'Photo & Video': 160, 'Education': 118, 'Social Networking': 106, 'Shopping': 84, 'Utilities': 81, 'Sports': 69, 'Music': 66, 'Health & Fitness': 65, 'Productivity': 56, 'Lifestyle': 51, 'News': 43, 'Travel': 40, 'Finance': 36, 'Weather': 28, 'Food & Drink': 26, 'Reference': 18, 'Business': 17, 'Book': 14, 'Medical': 6, 'Catalogs': 4}


In [65]:
def freq_table(dataset, index):
    table = {}
    percentage = {}
    for i in range(len(dataset)):
        value = dataset[i][index]
        if value not in table:
            table[value] = 0
        table[value] += 1
    sum = 0
    for count in table:
        sum += table[count]
    for n in table:
        percentage[n] = (table[n]/sum)*100
    return [percentage]

print(freq_table(google,2))

[{'4.1': 6.6794539095114525, '4.7': 4.321335890781902, '4.5': 8.811914701568318, '4.3': 9.522734965587274, '4.4': 9.285794877580955, '3.8': 3.0125239760803337, '4.2': 8.473428861559292, '4.6': 6.8374139681823305, '3.2': 0.6882545413516867, '4.0': 5.517319192147128, '4.8': 2.06476362405506, '3.9': 3.870021437436534, '4.9': 0.8913460453571026, '3.6': 1.771409229380571, '3.7': 2.403249464064087, 'NaN': 14.633871149723571, '3.3': 1.0718718266952498, '3.4': 1.3088119147015682, '3.5': 1.6472977547105947, '3.1': 0.7333859866862237, '5.0': 2.7417353040731127, '3.0': 0.8236488773552973, '2.5': 0.21437436533904997, '2.8': 0.4174658693444658, '2.7': 0.2369400880063184, '1.0': 0.15796005867087895, '1.9': 0.1241114746699763, '2.9': 0.4400315920117342, '2.3': 0.20309150400541578, '2.6': 0.2482229493399526, '2.2': 0.15796005867087895, '1.7': 0.07898002933543948, '2.0': 0.1241114746699763, '2.4': 0.19180864267178158, '1.8': 0.07898002933543948, '1.6': 0.045131445334536835, '2.1': 0.09026289066907367, 

In [66]:
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])