# Exploratory Data Analysis of Profitable Apps in the App Store and Google Play Store

The purpose of this project is to help developers understand what type of apps are likely to attract more users on Google Play and the App Store.

In [1]:
#Helper function for exploring the data
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: A list of rows (each row being a list of values).
        start: Index of the first row to print (inclusive).
        end: Index one past the last row to print (exclusive).
        rows_and_columns: When true, also print the total number of rows
            and the number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        # Leaves two blank lines after each row.
        print('\n')

    if rows_and_columns:
        row_count = len(dataset)
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of failing with an IndexError.
        column_count = len(dataset[0]) if row_count else 0
        print('Number of rows:', row_count)
        print('Number of columns:', column_count)

Let us first open the datasets

In [2]:
from csv import reader

### Google Playstore Dataset ###
# The with-block closes the file as soon as every row has been read.
# newline='' lets the csv module handle line endings itself, as the csv
# documentation recommends for files passed to csv.reader.
with open('./data/googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
# The first row holds the column names; the remaining rows are the app data.
android_header = android[0]
android = android[1:]

### Apple store Dataset ###
with open('./data/AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

Let's check the Android data

In [3]:
# Show the Android column names followed by two blank lines, then preview
# the first five rows together with the overall row and column counts.
print(android_header, end='\n\n\n')
explore_data(android, start=0, end=5, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Eve

Checking the columns of the Google Play Store data, I think the useful features that can be helpful for our goal are the following: ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Android Ver']

Removing the following columns: ['Last Updated', 'Current Ver']

Now let's check the Apple data

In [4]:
# Show the iOS column names followed by two blank lines, then preview
# the first five rows together with the overall row and column counts.
print(ios_header, end='\n\n\n')
explore_data(ios, start=0, end=5, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 7197
Number of columns: 16


For the Apple data, I think we can use all columns except id

In [5]:
def header_indices(header):
    """Map each column name in ``header`` to its zero-based position.

    Args:
        header: An iterable of column names, e.g. the first row of a CSV file.

    Returns:
        A dict of ``{column_name: index}``. If a name occurs more than once,
        the index of its last occurrence is kept.
    """
    return {column: position for position, column in enumerate(header)}

In [6]:
# Map each Android column name to its position so that row values can be
# looked up by column name instead of by a hard-coded index.
an_hi = header_indices(android_header)

In [7]:
# Sanity check (Android): report the position and content of every row whose
# number of fields differs from the number of header columns.
column_len = len(android_header)
malformed_rows = [
    (position, record)
    for position, record in enumerate(android)
    if len(record) != column_len
]
for position, record in malformed_rows:
    print(position)
    print(record)

10472
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [8]:
# Number of fields in row 10472, the row reported by the length check.
len(android[10472])

12

One of the entries in the Android data has only 12 features instead of 13. Let's check which feature is missing, then decide whether it can be filled in or whether the entry should be deleted.

In [9]:
# Print the header, a well-formed row and row 10472 one after another so the
# missing field can be spotted by comparing them column by column.
print(android_header)
print(android[0])
print(android[10472])

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


It is missing the 'Category' feature which I think can be filled.

In [24]:
# Collect the distinct values found at index 1 (the 'Category' column) of
# every Android row.
categories = set(row[1] for row in android)
print(categories)

{'SPORTS', 'DATING', 'BOOKS_AND_REFERENCE', 'SOCIAL', 'HOUSE_AND_HOME', 'HEALTH_AND_FITNESS', 'COMICS', 'PRODUCTIVITY', 'AUTO_AND_VEHICLES', 'NEWS_AND_MAGAZINES', 'TOOLS', 'LIBRARIES_AND_DEMO', 'BEAUTY', 'PERSONALIZATION', 'COMMUNICATION', 'EDUCATION', 'SHOPPING', 'TRAVEL_AND_LOCAL', 'ART_AND_DESIGN', 'GAME', 'PARENTING', 'FAMILY', 'LIFESTYLE', 'WEATHER', 'FINANCE', 'PHOTOGRAPHY', 'EVENTS', 'BUSINESS', 'ENTERTAINMENT', '1.9', 'MEDICAL', 'FOOD_AND_DRINK', 'MAPS_AND_NAVIGATION', 'VIDEO_PLAYERS'}


Based on the existing categories, it would fit the 'Photography' category.

In [26]:
# Row 10472 is missing its 'Category' value; fill it in with 'PHOTOGRAPHY'.
# The length guard makes this cell safe to re-run: without it, every
# re-execution would insert another value and shift the columns again.
if len(android[10472]) < len(android_header):
    android[10472].insert(1, 'PHOTOGRAPHY')
print(len(android[10472]))
print(android[10472])

13
['Life Made WI-Fi Touchscreen Photo Frame', 'PHOTOGRAPHY', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


Let us proceed to check if there are duplicate entries in our data.

In [27]:
# Compare the number of rows with the number of distinct app names; a gap
# between the two means that some apps appear more than once.
apps = set(row[0] for row in android)
total_rows = len(android)
unique_names = len(apps)
print(f'Total app data: {total_rows}')
print(f'Unique app names: {unique_names}')
print(f'App names: {apps}')

Total app data: 10841
Unique app names: 9660
App names: {'TAMAGO', 'Endomondo - Running & Walking', 'StrongLifts 5x5 Workout Gym Log & Personal Trainer', 'BD Field Force', 'Manga Net – Best Online Manga Reader', 'G-Switch', 'CS', 'Chest CT Sectional Walker', 'Woody Puzzle', 'LEGO® Juniors Create & Cruise', 'A-Z App Store', 'BH Online', 'AJ Rafael Music Lyrics', 'AV Buddy Free', 'Township', 'Top BR Chaya Songs', 'Train driving simulator', "Guns'n'Glory", 'No.Draw - Colors by Number 2018', 'NerdWallet: Personal Finance, Credit Score & Cash', 'Eg Call', 'EZ TV Player', 'Barcelona Live - Goal Score & News for Barca Fans', 'PayPal', 'Phone X Launcher, OS 11 iLauncher & Control Center', 'M-Sight Pro', 'Cookbook Recipes', 'My Emma :)', "CJ's Tire & Automotive", 'Make a burger king', 'dk', 'Treadmill Workouts Free (P)', 'Top Eleven 2018 - Be a Soccer Manager', 'Asiimov Skin - CS GO Icon Pack', 'A-Z Screen Recorder -', 'Familyfirst Messenger', 'NOOK Audiobooks', 'The Weather Network', 'Bg TV On

Based on the results of our code above, there are 1,181 more rows than unique app names (10,841 rows versus 9,660 names), which means some apps occur more than once.
Let's take a look at one of the apps with duplicate entries.

In [28]:
# Count how many rows each app name appears in.
app_histogram = {}
for entry in android:
    app_name = entry[0]
    app_histogram[app_name] = app_histogram.get(app_name, 0) + 1
print(app_histogram)

{'Photo Editor & Candy Camera & Grid & ScrapBook': 1, 'Coloring book moana': 2, 'U Launcher Lite – FREE Live Cool Themes, Hide Apps': 1, 'Sketch - Draw & Paint': 1, 'Pixel Draw - Number Art Coloring Book': 1, 'Paper flowers instructions': 1, 'Smoke Effect Photo Maker - Smoke Editor': 1, 'Infinite Painter': 1, 'Garden Coloring Book': 1, 'Kids Paint Free - Drawing Fun': 1, 'Text on Photo - Fonteee': 1, 'Name Art Photo Editor - Focus n Filters': 1, 'Tattoo Name On My Photo Editor': 1, 'Mandala Coloring Book': 1, '3D Color Pixel by Number - Sandbox Art Coloring': 1, 'Learn To Draw Kawaii Characters': 1, 'Photo Designer - Write your name with shapes': 1, '350 Diy Room Decor Ideas': 1, 'FlipaClip - Cartoon animation': 1, 'ibis Paint X': 1, 'Logo Maker - Small Business': 1, "Boys Photo Editor - Six Pack & Men's Suit": 1, 'Superheroes Wallpapers | 4K Backgrounds': 1, 'Mcqueen Coloring pages': 2, 'HD Mickey Minnie Wallpapers': 1, 'Harley Quinn wallpapers HD': 1, 'Colorfit - Drawing & Coloring':

In [33]:
# List every app that appears more than twice, with its occurrence count.
for app_name, occurrences in app_histogram.items():
    if occurrences <= 2:
        continue
    print(f'{app_name}, {occurrences}')

Google My Business, 3
Box, 3
Quick PDF Scanner + OCR FREE, 3
Google Ads, 3
Slack, 3
QuickBooks Accounting: Invoicing & Expenses, 3
join.me - Simple Meetings, 3
Messenger – Text and Video Chat for Free, 3
WhatsApp Messenger, 3
Google Chrome: Fast & Secure, 3
Gmail, 3
Hangouts, 4
Viber Messenger, 5
Firefox Browser fast & private, 3
Yahoo Mail – Stay Organized, 3
imo free video calls and chat, 4
Opera Mini - fast web browser, 3
Opera Browser: Fast and Secure, 3
Firefox Focus: The privacy browser, 3
Google Voice, 3
WeChat, 4
UC Browser Mini -Tiny Fast Private & Secure, 3
Telegram, 3
Puffin Web Browser, 3
UC Browser - Fast Download Private & Secure, 3
free video calls and chat, 3
Skype - free IM & video calls, 3
Google Allo, 3
LINE: Free Calls & Messages, 3
KakaoTalk: Free Calls & Text, 3
OkCupid Dating, 3
Hily: Dating, Chat, Match, Meet & Hook up, 3
BBW Dating & Plus Size Chat, 3
Moco - Chat, Meet People, 3
Hot or Not - Find someone right now, 3
Just She - Top Lesbian Dating, 3
muzmatch: M

In [35]:
# Print the header followed by every row whose app name is exactly 'WeChat',
# so the duplicate entries can be compared side by side.
print(android_header)
wechat_rows = [entry for entry in android if entry[0] == 'WeChat']
for entry in wechat_rows:
    print(entry)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['WeChat', 'COMMUNICATION', '4.2', '5387333', 'Varies with device', '100,000,000+', 'Free', '0', 'Everyone', 'Communication', 'July 31, 2018', 'Varies with device', 'Varies with device']
['WeChat', 'COMMUNICATION', '4.2', '5387446', 'Varies with device', '100,000,000+', 'Free', '0', 'Everyone', 'Communication', 'July 31, 2018', 'Varies with device', 'Varies with device']
['WeChat', 'COMMUNICATION', '4.2', '5387446', 'Varies with device', '100,000,000+', 'Free', '0', 'Everyone', 'Communication', 'July 31, 2018', 'Varies with device', 'Varies with device']
['WeChat', 'COMMUNICATION', '4.2', '5387631', 'Varies with device', '100,000,000+', 'Free', '0', 'Everyone', 'Communication', 'July 31, 2018', 'Varies with device', 'Varies with device']


Inspecting the data of the duplicate entries for WeChat, they differ on the 4th column which represents the number of reviews. This could indicate that data was collected at different times.
We can use this as the criterion for removing duplicates: for each app we keep only the entry with the highest review count, since that entry is most likely the most recent one.

In [39]:
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()

# Look the column positions up once instead of on every row.
name_col = an_hi['App']
reviews_col = an_hi['Reviews']

# First pass: record the highest review count seen for each app name.
reviews_max = {}
for row in android:
    name = row[name_col]
    review_count = int(row[reviews_col])
    if name not in reviews_max or review_count > reviews_max[name]:
        reviews_max[name] = review_count

print(len(reviews_max))

# Second pass: keep, for each app, the first row whose review count equals
# that app's maximum. `added_app` prevents adding an app twice when several
# of its rows share the same (maximum) review count.
android_clean = []
added_app = set()
for row in android:
    name = row[name_col]
    review_count = int(row[reviews_col])
    if review_count == reviews_max[name] and name not in added_app:
        android_clean.append(row)
        added_app.add(name)

t2 = time.perf_counter()
print("Time taken: %.6f" %(t2 - t1))
# The expected length is the number of unique app names, derived from the
# data rather than hard-coded so the check stays valid if the data changes.
print(f'Expected length: {len(reviews_max)}, Actual: {len(android_clean)}')

9660
Time taken: 0.014398
Expected length: 9660, Actual: 9660


In [16]:
def isEnglish(s):
    """Return True when ``s`` contains only ASCII characters.

    "ASCII only" is used as a rough stand-in for "English": any string with
    a character outside the ASCII range (accented letters, emoji, typographic
    dashes such as an en dash, ...) is reported as not English. The empty
    string counts as English.

    Args:
        s: The string to check.

    Returns:
        True if every character of ``s`` is ASCII, otherwise False.
    """
    # str.isascii() never raises, unlike an encode/decode round-trip, which
    # fails with UnicodeEncodeError on strings containing lone surrogates.
    return s.isascii()

In [18]:
# Print every Android row whose app name contains a non-ASCII character,
# followed by the row's position in the dataset.
non_ascii_rows = (
    (position, entry)
    for position, entry in enumerate(android)
    if not entry[0].isascii()
)
for position, entry in non_ascii_rows:
    print(entry)
    print(position)

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
2
['CarMax – Cars for Sale: Search Used Car Inventory', 'AUTO_AND_VEHICLES', '4.4', '21777', 'Varies with device', '1,000,000+', 'Free', '0', 'Everyone', 'Auto & Vehicles', 'August 4, 2018', 'Varies with device', 'Varies with device']
85
['AutoScout24 Switzerland – Find your new car', 'AUTO_AND_VEHICLES', '4.6', '13372', 'Varies with device', '1,000,000+', 'Free', '0', 'Everyone', 'Auto & Vehicles', 'August 3, 2018', 'Varies with device', 'Varies with device']
88
['Zona Azul Digital Fácil SP CET - OFFICIAL São Paulo', 'AUTO_AND_VEHICLES', '4.6', '7880', 'Varies with device', '100,000+', 'Free', '0', 'Everyone', 'Auto & Vehicles', 'May 10, 2018', '4.6.5', 'Varies with device']
89
['Wattpad 📖 Free Books', 'BOOKS_AND_REFERENCE', '4.6', '2914724', 'Varies with device', '100,000,000+', 'Free', '0', 