# Profit mobile apps - data cleaning

### Opening datasets from different folder and defining open_dataset() function

In [25]:
import os
from csv import reader


def open_dataset(file_name):
    """Read a CSV file from the sibling ``00-datasets`` folder into a list of rows.

    Args:
        file_name: Name of the CSV file inside ``../00-datasets``. A leading
            path separator (as in ``'/AppleStore.csv'``) is accepted and ignored.

    Returns:
        A list with one entry per CSV row (header row included); every entry
        is the list of that row's field strings.

    Raises:
        OSError: If the file cannot be opened (for example FileNotFoundError).
    """
    # '__file__' is a quoted literal here, not the module attribute, so the
    # path is resolved against the current working directory.
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    data_dir = os.path.abspath(os.path.realpath(os.path.join(base_dir, '../00-datasets')))

    # Strip leading separators so os.path.join cannot discard data_dir.
    file_path = os.path.join(data_dir, file_name.lstrip('/\\'))

    # The context manager closes the handle even if parsing fails; newline=''
    # lets the csv module handle line endings inside quoted fields itself.
    with open(file_path, encoding="utf8", newline='') as opened_file:
        return list(reader(opened_file))

### Open both datasets and save in 'google_store' and 'apple_store'

In [26]:
# Each call returns the whole CSV as a list of rows (lists of strings).
# The leading '/' is required because open_dataset() appends the name to the folder path.
google_store = open_dataset('/googleplaystore.csv')
apple_store = open_dataset('/AppleStore.csv')

### `explore_data()` function for displaying datasets in a readable format

In [27]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset's size.

    Args:
        dataset: Sequence of rows (each row a sequence of fields).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns (taken from the first row).

    Returns:
        None. Everything is written to standard output.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # print() adds its own newline, so rows are separated by two blank lines

    if rows_and_columns:
        n_rows = len(dataset)
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with IndexError.
        n_columns = len(dataset[0]) if n_rows > 0 else 0
        print('Number of rows:', n_rows)
        print('Number of columns:', n_columns)

In [28]:
# Row 0 of each dataset is the header; keep it in its own variable for display.
google_store_header = google_store[0]
apple_store_header = apple_store[0]

print('Google Store header and first 3 rows:\n')
print('Header:', google_store_header, '\n')
# The header is sliced off, so the reported row count covers data rows only.
explore_data(google_store[1:], 0, 3, True)

print('\nApple Store header and first 3 rows:\n')
print('Header:', apple_store_header, '\n')
explore_data(apple_store[1:], 0, 3, True)

Google Store header and first 3 rows:

Header: ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] 

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13

Apple Store header and first 3 rows:

Header: ['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rati

## Descriptions based on info from sources:
- https://www.kaggle.com/lava18/google-play-store-apps
- https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps


### For google_store cols:

| Column name | Description |
|:-:|:-:|
| 'App'      | Application name    |
| 'Category' | Category the app belongs to |
| 'Rating' | Overall user rating of the app (as when scraped)|
| 'Reviews'|Number of user reviews for the app (as when scraped)|
| 'Size'| Size of the app (as when scraped)|
| 'Installs'| Number of user downloads/installs for the app (as when scraped)|
| 'Type'| Paid or Free|
| 'Price'| Price of the app (as when scraped)|
| 'Content Rating'| Age group the app is targeted at - Children / Mature 21+ / Adult |
| 'Genres'| An app can belong to multiple genres (apart from its main category). For eg, a musical family game will belong to Music, Game, Family genres. |
| 'Last Updated'| Date when the app was last updated on Play Store (as when scraped)|
| 'Current Ver' |Current version of the app available on Play Store (as when scraped)|
| 'Android Ver'| Min required Android version (as when scraped)|

### For apple_store cols:


| Column name | Description |
|:-:|:-:|
| 'id'      | App ID 	| 
| 'track_name'   | App Name        |
| 'size_bytes' | Size (bytes)|
|"currency" | Currency Type|
|"price" | Price amount|
| "rating_count_tot" | User Rating counts (for all version)|
| "rating_count_ver" | User Rating counts (for current version)|
| "user_rating" | Average User Rating value (for all version) |
| "user_rating_ver" | Average User Rating value (for current version) |
| "ver" | Latest version code|
| "cont_rating" | Content Rating |
| "prime_genre" | Primary Genre |
| "sup_devices.num" | Number of supporting devices |
| "ipadSc_urls.num" | Number of screenshots showed for display |
| "lang.num" | Number of supported languages |
| "vpp_lic" | Vpp Device Based Licensing Enabled |


### Deleting row number 10473 from google_store because of missing data in Category column

In [29]:
# Show the header next to row 10473: the row has no 'Category' value, so every
# later field sits one position to the left of its header.
print(google_store_header,'\nmissing "Category"\n', google_store[10473])

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] 
missing "Category"
 ['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [30]:
# Row 10473 lacks its 'Category' value and therefore has fewer fields than the
# header (row 0). Checking the field count first keeps this cell idempotent:
# running it a second time must not delete the valid row that moved into index 10473.
if len(google_store[10473]) != len(google_store[0]):
    del google_store[10473]

### Deleting column 0 from `apple_store` to use find_duplicates() on both datasets

In [31]:
# Remove the leading 'id' column so that the app name is at index 0 in both
# datasets. The rows are modified in place, so the header check guards against
# a second run of this cell stripping 'track_name' as well.
if apple_store[0][0] == 'id':
    for record in apple_store:
        del record[:1]

explore_data(apple_store, 0, 3, True)

['track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 15


### Finding number of duplicates in both datasets

In [32]:
def find_duplicates(dataset):
    """Return the repeated first-column values of ``dataset``.

    The first row is treated as a header and skipped. A value is reported once
    for every occurrence after its first one, in the order encountered.

    Args:
        dataset: Sequence of rows; ``row[0]`` must be hashable (CSV fields are
            strings, which are).

    Returns:
        list: the duplicated first-column values (empty if there are none).
    """
    seen = set()  # set membership is O(1); a list made the scan quadratic
    duplicates = []
    for row in dataset[1:]:
        key = row[0]
        if key in seen:
            duplicates.append(key)
        else:
            seen.add(key)

    return duplicates

In [33]:
# find_duplicates() compares the first field of every row and skips row 0 (the header).
google_duplicates_check = find_duplicates(google_store)
apple_duplicates_check = find_duplicates(apple_store)

print('Number of duplicates in apple_store:\n' + str(len(apple_duplicates_check)))
print('Number of duplicates in google_store:\n' + str(len(google_duplicates_check)))

Number of duplicates in apple_store:
2
Number of duplicates in google_store:
1181


### Find all unique records with max_value of reviews in google_store

In [34]:
# Highest review count (column 3) recorded for each app name (column 0).
reviews_max = {}

for row in google_store[1:]:  # row 0 is the header
    name, n_reviews = row[0], int(row[3])
    # An unseen name starts at its own count; a known one keeps the larger value.
    reviews_max[name] = max(reviews_max.get(name, n_reviews), n_reviews)

# Sanity check: unique names must equal data rows minus the duplicates found earlier.
n_searched_records = len(google_store[1:]) - len(google_duplicates_check)
print(n_searched_records == len(reviews_max), n_searched_records, len(reviews_max))

True 9659 9659


### Delete duplicates in google_store - store new list of cleaned data 

In [35]:
# Keep exactly one record per app: the first one whose review count equals the
# maximum stored in reviews_max for that name.
google_store_clean = []
already_added = []
added_names = set()  # mirrors already_added; gives O(1) membership tests instead of a list scan

for record in google_store[1:]:  # row 0 is the header
    name = record[0]
    n_reviews = int(record[3])
    # The name check handles duplicates that share the same maximum review count.
    if n_reviews == reviews_max[name] and name not in added_names:
        google_store_clean.append(record)
        already_added.append(name)
        added_names.add(name)

explore_data(google_store_clean, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


### Remove non-english apps
`english_name()` returns `False` when an app name contains more than 3 characters whose code point is above 127 (i.e. outside the ASCII range), and `True` otherwise.

In [36]:
def english_name(name, max_non_ascii=3):
    """Tell whether ``name`` is likely to be an English app name.

    A name is accepted as long as it has at most ``max_non_ascii`` characters
    whose code point is above 127, so names with an emoji or a trademark sign
    still pass.

    Args:
        name: The app name to inspect.
        max_non_ascii: Highest number of non-ASCII characters still tolerated.

    Returns:
        bool: ``False`` once more than ``max_non_ascii`` non-ASCII characters
        have been found, ``True`` otherwise.
    """
    n_non_ascii = 0
    for character in name:
        if ord(character) > 127:
            n_non_ascii += 1
            # Stop at the first character over the limit; the rest cannot change the answer.
            if n_non_ascii > max_non_ascii:
                return False
    return True

# Quick checks; the expected result is noted next to each call.
print(english_name('Instagram')) # returns True
print(english_name('Surówka')) # returns True - a single non-ASCII character ('ó') is tolerated
print(english_name('Facebook')) # returns True
print(english_name('AO & AO')) # returns True
print(english_name('Docs To Go™ Free Office Suite')) # returns True
print(english_name('Instachat 😜')) # returns True
print(english_name('爱奇艺PPS -《欢乐颂2》电视剧热播')) # returns False - more than 3 non-ASCII characters

True
True
True
True
True
True
False


In [37]:
def english_dataset(previous_list):
    """Return the records of ``previous_list`` whose name passes ``english_name``.

    The name is taken from the first field of each record. The input list is
    left untouched; a new list is returned.
    """
    return [record for record in previous_list if english_name(record[0])]

# google_store_clean holds data rows only; apple_store still carries its header
# at index 0, so that row is sliced off before filtering.
google_store_cleaned_en = english_dataset(google_store_clean)
apple_store_cleaned_en = english_dataset(apple_store[1:])

print('google_store_cleaned_en:')
explore_data(google_store_cleaned_en, 0, 2, True)
print('\napple_store_cleaned_en:')
explore_data(apple_store_cleaned_en, 0, 2, True)

google_store_cleaned_en:
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 9614
Number of columns: 13

apple_store_cleaned_en:
['Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 6183
Number of columns: 15


### Choosing only free apps from both datasets

In [38]:
# Google Play: an app counts as free when its 'Price' field (index 7) is exactly '0'.
google_cleaned_free_apps = [
    app for app in google_store_cleaned_en if app[7] == '0'
]

# App Store: the price (index 3) is text such as '0.0', so it is compared as a number.
apple_cleaned_free_apps = [
    app for app in apple_store_cleaned_en if float(app[3]) == 0
]

explore_data(google_cleaned_free_apps, 0, 2, True)
explore_data(apple_cleaned_free_apps, 0, 2, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 8864
Number of columns: 13
['Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 3222
Number of columns: 15


# Profit mobile apps - data analysis

As we mentioned in the introduction, our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps.

To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:
1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful on both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

We'll use `google_cleaned_free_apps[1]` - Category and `[9]` - Genres for Genre investigation in google_store apps and `apple_cleaned_free_apps[10]` - Primary Genre in AppleStore apps.

### Creating frequency tables

In [39]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of ``dataset``.

    Args:
        dataset: Sequence of records (no header row expected).
        index: Position of the column to summarise.

    Returns:
        dict: maps each distinct value of the column to its share of all
        records, as a percentage rounded to 2 decimals. Keys appear in order
        of first occurrence.
    """
    counts = {}
    for record in dataset:
        value = record[index]
        counts[value] = counts.get(value, 0) + 1

    n_records = len(dataset)
    return {
        value: round(100 * (count / n_records), 2)
        for value, count in counts.items()
    }

def display_table(dataset, index):
    """Print the frequency table of a column, highest share first, and return it.

    Args:
        dataset: Sequence of records passed on to ``freq_table``.
        index: Position of the column to summarise.

    Returns:
        dict: the unsorted table exactly as produced by ``freq_table``.
    """
    table = freq_table(dataset, index)

    # Sorting (share, value) pairs orders by share first and by value on ties,
    # both descending.
    ranked = sorted(((share, value) for value, share in table.items()), reverse=True)
    for share, value in ranked:
        print(value, ':', share)
    return table

In [40]:
# display_table() prints each table sorted by share and returns the unsorted
# dict, which later cells iterate over.
print('\nGoogle by Category:\n')
google_store_by_category = display_table(google_cleaned_free_apps, 1)
print('\nGoogle by Genre:\n')
google_store_by_genre = display_table(google_cleaned_free_apps, 9)
print('\nApple by Genre:\n')
apple_store_by_genre = display_table(apple_cleaned_free_apps, 10)


Google by Category:

FAMILY : 18.91
GAME : 9.72
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.9
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.4
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.34
SHOPPING : 2.25
BOOKS_AND_REFERENCE : 2.14
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.93
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.71
PARENTING : 0.65
ART_AND_DESIGN : 0.64
COMICS : 0.62
BEAUTY : 0.6

Google by Genre:

Tools : 8.45
Entertainment : 6.07
Education : 5.35
Business : 4.59
Productivity : 3.89
Lifestyle : 3.89
Finance : 3.7
Medical : 3.53
Sports : 3.46
Personalization : 3.32
Communication : 3.24
Action : 3.1
Health & Fitness : 3.08
Photography : 2.94
News & Magazines : 2.8
Social : 2.66
Travel & Local : 2.32
Shopping : 2.25
Books & Reference : 

### Comments to tables:

__App Store__ is dominated by apps that are designed for fun (especially games). Still, the fact that fun apps are the most numerous doesn't imply that they also have the greatest number of users — the demand might not be the same as the offer.

On __Google Play__ there are not that many apps designed for fun, and it seems that a good number of apps are designed for practical purposes (family, tools, business, lifestyle, productivity, etc.).

The difference between the `Genres` and the `Category` columns is not clear so far, but one thing we can notice is that the `Genres` column has many more distinct values.

### Inspecting number of users
Exploring Google Play apps based on the `Installs` column (index `5`).

In [41]:
print('Google Apps by number on Installs:\n')

google_display = display_table(google_cleaned_free_apps, 5)
# Install counts are open-ended buckets; the 5M-50M band is the sum of the
# '5,000,000+' and '10,000,000+' buckets (values are percentages).
covering_range = round(google_display['5,000,000+'] + google_display['10,000,000+'],2)
print('\nChosen range for analysis: number of installs from 5M to 50M: ' + str(covering_range) + '%')

Google Apps by number on Installs:

1,000,000+ : 15.73
100,000+ : 11.55
10,000,000+ : 10.55
10,000+ : 10.2
1,000+ : 8.39
100+ : 6.92
5,000,000+ : 6.83
500,000+ : 5.56
50,000+ : 4.77
5,000+ : 4.51
10+ : 3.54
500+ : 3.25
50,000,000+ : 2.3
100,000,000+ : 2.13
50+ : 1.92
5+ : 0.79
1+ : 0.51
500,000,000+ : 0.27
1,000,000,000+ : 0.23
0+ : 0.05
0 : 0.01

Chosen range for analysis: number of installs from 5M to 50M: 17.38%


The most interesting apps are those with an `Installs` count in the range from 5 M+ to 50 M.

To perform computations, however, we'll need to convert each install number to float — this means that we need to remove the commas and the plus characters, otherwise the conversion will fail and raise an error. We'll do this directly in the loop below, where we also compute the average number of installs for each genre (category).

In [43]:
# Average number of installs per Google Play category, printed in millions.
for category in google_store_by_category:
    total = 0
    len_category = 0
    for app in google_cleaned_free_apps:
        if app[1] != category:
            continue
        # '1,000,000+' -> 1000000.0: drop the thousands separators and the plus sign.
        total += float(app[5].replace(',', '').replace('+', ''))
        len_category += 1
    avg_n_installs = total / len_category
    print(category, ':', round(avg_n_installs/1000000 ,2), ' in M of installs')


ART_AND_DESIGN : 1.99  in M of installs
AUTO_AND_VEHICLES : 0.65  in M of installs
BEAUTY : 0.51  in M of installs
BOOKS_AND_REFERENCE : 8.77  in M of installs
BUSINESS : 1.71  in M of installs
COMICS : 0.82  in M of installs
COMMUNICATION : 38.46  in M of installs
DATING : 0.85  in M of installs
EDUCATION : 1.83  in M of installs
ENTERTAINMENT : 11.64  in M of installs
EVENTS : 0.25  in M of installs
FINANCE : 1.39  in M of installs
FOOD_AND_DRINK : 1.92  in M of installs
HEALTH_AND_FITNESS : 4.19  in M of installs
HOUSE_AND_HOME : 1.33  in M of installs
LIBRARIES_AND_DEMO : 0.64  in M of installs
LIFESTYLE : 1.44  in M of installs
GAME : 15.59  in M of installs
FAMILY : 3.7  in M of installs
MEDICAL : 0.12  in M of installs
SOCIAL : 23.25  in M of installs
SHOPPING : 7.04  in M of installs
PHOTOGRAPHY : 17.84  in M of installs
SPORTS : 3.64  in M of installs
TRAVEL_AND_LOCAL : 13.98  in M of installs
TOOLS : 10.8  in M of installs
PERSONALIZATION : 5.2  in M of installs
PRODUCTIVITY 

### The most interesting Categories are `COMMUNICATION`, `PHOTOGRAPHY`, `PRODUCTIVITY`, `SOCIAL` ,`GAME`, `TRAVEL_AND_LOCAL`.

In [45]:
# (category, header) pairs for the three listings; one loop replaces three
# copy-pasted ones. The PHOTOGRAPHY header used to start with a stray 'N'.
sections = [
    ('COMMUNICATION', 'COMMUNICATION ON GOOGLE STORE FROM 5 M+ TO 50 M:\n'),
    ('PHOTOGRAPHY', '\nPHOTOGRAPHY ON GOOGLE STORE FROM 5 M+ TO 50 M:\n'),
    ('SOCIAL', '\nSOCIAL ON GOOGLE STORE FROM 5 M+ TO 50 M:\n'),
]

for category, header in sections:
    print(header)
    for app in google_cleaned_free_apps:
        # The 5M-50M band corresponds to exactly these two install buckets.
        if app[1] == category and app[5] in ('5,000,000+', '10,000,000+'):
            print(app[0], ':', app[5])

COMMUNICATION ON GOOGLE STORE FROM 5 M+ TO 50 M:

Messenger for SMS : 10,000,000+
My Tele2 : 5,000,000+
Call Free – Free Call : 5,000,000+
Web Browser & Explorer : 5,000,000+
Browser 4G : 10,000,000+
MegaFon Dashboard : 10,000,000+
ZenUI Dialer & Contacts : 10,000,000+
Cricket Visual Voicemail : 10,000,000+
Xperia Link™ : 10,000,000+
TouchPal Keyboard - Fun Emoji & Android Keyboard : 10,000,000+
Skype Lite - Free Video Call & Chat : 5,000,000+
AT&T Visual Voicemail : 10,000,000+
GMX Mail : 10,000,000+
Omlet Chat : 10,000,000+
My Vodacom SA : 5,000,000+
Microsoft Edge : 5,000,000+
Calls & Text by Mo+ : 5,000,000+
chomp SMS : 10,000,000+
Glide - Video Chat Messenger : 10,000,000+
Text SMS : 10,000,000+
Talkray - Free Calls & Texts : 10,000,000+
GroupMe : 10,000,000+
Contacts+ : 10,000,000+
ExDialer - Dialer & Contacts : 10,000,000+
Full Screen Caller ID : 5,000,000+
Hiya - Caller ID & Block : 10,000,000+
Mr. Number-Block calls & spam : 10,000,000+
CIA - Caller ID & Call Blocker : 5,000,0

Communication, Social and Photography are good places for big players, e.g. Facebook, Google, Instagram. However, they seem a little saturated in terms of rivalry.

Let's focus on `PRODUCTIVITY`, `GAME` and `TRAVEL_AND_LOCAL`.

In [47]:
# (category, header) pairs for the three listings printed below.
sections = [
    ('PRODUCTIVITY', 'PRODUCTIVITY ON GOOGLE STORE FROM 5 M+ TO 50 M:\n'),
    ('GAME', '\nGAME ON GOOGLE STORE FROM 5 M+ TO 50 M:\n'),
    ('TRAVEL_AND_LOCAL', '\nTRAVEL_AND_LOCAL ON GOOGLE STORE FROM 5 M+ TO 50 M:\n'),
]

for category, header in sections:
    print(header)
    for app in google_cleaned_free_apps:
        # The 5M-50M band corresponds to exactly these two install buckets.
        if app[1] == category and app[5] in ('5,000,000+', '10,000,000+'):
            print(app[0], ':', app[5])

PRODUCTIVITY ON GOOGLE STORE FROM 5 M+ TO 50 M:

All-In-One Toolbox: Cleaner, Booster, App Manager : 10,000,000+
AVG Cleaner – Speed, Battery & Memory Booster : 10,000,000+
QR Scanner & Barcode Scanner 2018 : 10,000,000+
Chrome Beta : 10,000,000+
Google PDF Viewer : 10,000,000+
My Claro Peru : 5,000,000+
Google Assistant : 10,000,000+
Metro name iD : 10,000,000+
Archos File Manager : 5,000,000+
ASUS SuperNote : 10,000,000+
HTC File Manager : 10,000,000+
ASUS Quick Memo : 10,000,000+
HTC Calendar : 10,000,000+
ASUS Calling Screen : 10,000,000+
lifebox : 5,000,000+
Yandex.Disk : 5,000,000+
Content Transfer : 5,000,000+
HTC Mail : 10,000,000+
MyVodafone (India) - Online Recharge & Pay Bills : 10,000,000+
Microsoft Translator : 5,000,000+
Keeper: Free Password Manager & Secure Vault : 10,000,000+
Wunderlist: To-Do List & Tasks : 10,000,000+
Todoist: To-do lists for task management & errands : 10,000,000+
Trello : 5,000,000+
Easy Voice Recorder : 10,000,000+
CM FILE MANAGER HD : 10,000,000+

Crazy Wheels : 10,000,000+
Cartoon Wars: Blade : 5,000,000+
Cytus : 5,000,000+
Dan the Man: Action Platformer : 10,000,000+
Geometry Dash SubZero : 10,000,000+
Metal Soldiers 2 : 10,000,000+
Run Sausage Run! : 10,000,000+
Knife Hit : 10,000,000+
The Visitor : 5,000,000+
Just Dance Now : 10,000,000+
DRAGON BALL LEGENDS : 5,000,000+
Injustice 2 : 5,000,000+
Injustice: Gods Among Us : 10,000,000+
MARVEL Avengers Academy : 10,000,000+
Power Rangers: Legacy Wars : 10,000,000+
Truck Driver Cargo : 10,000,000+
Checkers : 10,000,000+
Scratch Logo Quiz. Challenging brain puzzle : 10,000,000+
DH Texas Poker - Texas Hold'em : 10,000,000+
DEER HUNTER RELOADED : 5,000,000+
Call of Mini™ Dino Hunter : 10,000,000+
Bike Mayhem Free : 10,000,000+
DEER HUNTER CHALLENGE : 5,000,000+
Defender : 10,000,000+
Texas HoldEm Poker Deluxe : 10,000,000+
Deck Heroes: Legacy : 10,000,000+
Robbery Bob : 10,000,000+
Nyan Cat: Lost In Space : 10,000,000+
NARUTO X BORUTO NINJA VOLTAGE : 5,000,000+
Does not Commute : 5,

### Comments:
It seems that `GAME` and `TRAVEL_AND_LOCAL` are also quite overloaded in terms of the variety of apps, but `PRODUCTIVITY` looks promising for a start-up. A to-do list with special features can be easy to implement and cheap to maintain.

Since we want to start new app in Google Store first, now we should focus on similar To-Do-Apps in Apple Store. :)

The App Store data set has no install counts. As a workaround, we'll take the total number of user ratings as a proxy, which we can find in the `rating_count_tot` column.

In [48]:
# Genres present among the free App Store apps (column 10).
apple_store_genre = freq_table(apple_cleaned_free_apps, 10)

# Iterate over the table built in this cell rather than one left over from an
# earlier cell, so the cell does not depend on hidden notebook state.
for genre in apple_store_genre:
    total = 0
    len_genre = 0
    for app in apple_cleaned_free_apps:
        if app[10] == genre:
            # Column 4 is rating_count_tot, used here as a proxy for the number of users.
            total += float(app[4])
            len_genre += 1
    avg_n_reviews = round((total / len_genre)/ 1000, 2)
    print(genre, ':', avg_n_reviews, 'in thousands')

Social Networking : 71.55 in thousands
Photo & Video : 28.44 in thousands
Games : 22.79 in thousands
Music : 57.33 in thousands
Reference : 74.94 in thousands
Health & Fitness : 23.3 in thousands
Weather : 52.28 in thousands
Utilities : 18.68 in thousands
Travel : 28.24 in thousands
Shopping : 26.92 in thousands
News : 21.25 in thousands
Navigation : 86.09 in thousands
Lifestyle : 16.49 in thousands
Entertainment : 14.03 in thousands
Food & Drink : 33.33 in thousands
Sports : 23.01 in thousands
Book : 39.76 in thousands
Finance : 31.47 in thousands
Education : 7.0 in thousands
Productivity : 21.03 in thousands
Business : 7.49 in thousands
Catalogs : 4.0 in thousands
Medical : 0.61 in thousands


The most common app genres for To-Do-Apps are `Productivity` and `Utilities`. So we can check them now.

In [49]:
# (genre, header) pairs for the two listings printed below.
sections = [
    ('Productivity', 'Productivity app on Apple Store:\n'),
    ('Utilities', '\nUtilities app on Apple Store:\n'),
]

for genre, header in sections:
    print(header)
    for app in apple_cleaned_free_apps:
        # Only apps with more than 5000 total ratings (column 4) are listed.
        if app[10] == genre and float(app[4]) > 5000:
            print(app[0], ':', app[4], 'reviews')

Productivity app on Apple Store:

Evernote - stay organized : 161065 reviews
Gmail - email by Google: secure, fast & organized : 135962 reviews
iTranslate - Language Translator & Dictionary : 123215 reviews
Yahoo Mail - Keeps You Organized! : 113709 reviews
Google Docs : 64259 reviews
Google Drive - free online storage : 59255 reviews
Dropbox : 49578 reviews
Microsoft Word : 47999 reviews
Microsoft OneNote : 39638 reviews
Microsoft Outlook - email and calendar : 32807 reviews
Hotspot Shield Free VPN Proxy & Wi-Fi Privacy : 32499 reviews
Documents 6 - File manager, PDF reader and browser : 29110 reviews
Google Sheets : 24602 reviews
Microsoft Excel : 24430 reviews
Inbox by Gmail : 21561 reviews
T-Mobile : 19977 reviews
Paper by FiftyThree - Sketch, Diagram, Take Notes : 18219 reviews
MyScript Calculator - Handwriting calculator : 16555 reviews
VPN Proxy Master - Unlimited WiFi security VPN : 13674 reviews
Microsoft OneDrive – File & photo cloud storage : 12797 reviews
Ever - Capture You

### Conclusion:

In `Productivity` we have only one bigger To-Do-App, which is Evernote. So we can expect that we have found a niche for our app in the App Store as well.