# App Profiles Analysis Project


## About

In this project, we'll pretend we're working as data analysts for a company that builds Android and iOS mobile apps. We make our apps available on Google Play and in the App Store.


## Goal

The goal of this project is to analyze app data to help our developers understand what types of apps are likely to attract more users.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]``, optionally followed by its size.

    Each row is printed on its own line and followed by ``print('\\n')``,
    which leaves two blank lines between rows.

    Args:
        dataset: List of rows (each row a list).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns, the latter taken from the length of the
            first row.
    """
    for record in dataset[start:end]:
        print(record)
        print('\n')

    if not rows_and_columns:
        return

    print('Number of rows:', len(dataset))
    print('Number of columns:', len(dataset[0]))

In [2]:
# Open both raw CSV exports. The encoding is pinned to UTF-8 because the app
# names contain non-ASCII characters (dashes, emoji, CJK text) and the platform
# default encoding is not guaranteed to decode them. newline='' is what the csv
# module asks for so that it can handle line breaks inside quoted fields.
# The handles stay open here because the rows are read from them afterwards.
apple_store_csv = open('AppleStore.csv', encoding='utf8', newline='')
google_play_csv = open('googleplaystore.csv', encoding='utf8', newline='')

In [3]:
from csv import reader

In [4]:
# Read every row of both CSV files into memory as lists of strings
# (row 0 of each dataset is the header). Once the rows are materialised the
# file handles are no longer needed, so close them even if parsing fails.
try:
    ap_dataset = list(reader(apple_store_csv))
    gp_dataset = list(reader(google_play_csv))
finally:
    apple_store_csv.close()
    google_play_csv.close()

In [5]:
# Preview the App Store header plus the first four apps, then the dataset size.
explore_data(ap_dataset, start=0, end=5, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7198
Number of columns: 16


In [6]:
# Preview the Google Play header plus the first four apps, then the dataset size.
explore_data(gp_dataset, start=0, end=5, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10842
Number of columns: 13


## Find the row with a missing column in the Google Play dataset

In [7]:
def incorrect_row(dataset):
    """Return the indexes of rows whose length differs from the header's.

    Args:
        dataset: List of rows (each a list). Row 0 is treated as the header
            and defines the expected column count.

    Returns:
        A list of indexes into ``dataset`` (so the first data row is index 1),
        in ascending order. The list is empty when every row matches the
        header or when ``dataset`` itself is empty.
    """
    # An empty dataset has no header to compare against and no rows to flag.
    if not dataset:
        return []

    column_count = len(dataset[0])
    # Start counting at 1 so the reported indexes can be used directly on
    # ``dataset`` (the header occupies index 0).
    return [
        index
        for index, row in enumerate(dataset[1:], start=1)
        if len(row) != column_count
    ]

In [8]:
# List the indexes of Google Play rows whose column count differs from the header.
bad_row_indexes = incorrect_row(gp_dataset)
print(bad_row_indexes)

[10473]


In [9]:
# Show the header next to the malformed row to see which field is missing.
for row_index in (0, 10473):
    print(gp_dataset[row_index])

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [10]:
# Remove the malformed row. The column-count guard makes this cell safe to
# re-run: after the first deletion a valid row shifts into index 10473, and an
# unconditional `del` would silently drop it.
if len(gp_dataset[10473]) != len(gp_dataset[0]):
    del gp_dataset[10473]

In [11]:
# Re-run the check; an empty list means no malformed rows remain.
remaining_bad_rows = incorrect_row(gp_dataset)
print(remaining_bad_rows)

[]


## Find duplicate app rows

1. Find the duplicate app names
2. For each duplicated app, keep the row with the highest number of reviews (assumed to be the most recent data)
3. Remove the other duplicate rows

In [12]:
def duplicate_app_names(dataset):
    """Print the repeated app names found in ``dataset`` and their count.

    The first row of ``dataset`` is skipped (treated as a header). For every
    following row, the value in column 0 is taken as the app name; each
    occurrence after the first one is recorded as a duplicate, so a name that
    appears three times is listed twice.

    Args:
        dataset: List of rows (each a list) whose column 0 holds a hashable
            app name.

    Returns:
        None. The list of duplicate names and its length are printed.
    """
    duplicate_apps = []
    # A set keeps the "seen before?" test O(1); a list here makes the whole
    # scan quadratic on a dataset of ~10k rows.
    seen_names = set()

    for row in dataset[1:]:
        name = row[0]
        if name in seen_names:
            duplicate_apps.append(name)
        else:
            seen_names.add(name)

    print(duplicate_apps)
    print('duplicate app count:', len(duplicate_apps))

In [13]:
# Report every repeated app name in the Google Play dataset.
duplicate_app_names(dataset=gp_dataset)

['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software', 'MailChimp - Email, Marketing Automation', 'Crew - Free Messaging and Scheduling', 'Asana: organize team projects', 'Google Analytics', 'AdWords Express', 'Accounting App - Zoho Books', 'Invoice & Time Tracking - Zoho', 'join.me - Simple Meetings', 'Invoice 2go — Professional Invoices and Estimates', 'SignEasy | Sign and Fill PDF and other Documents', 'Quick PDF Scanner + OCR FREE', 'Genius Scan - PDF Scanner', 'Tiny Scanner - PDF Scanner App', 'Fast Scanner : Free PDF Scan', 'Mobile Doc Scanner (MDScan) Lite', 'TurboScan: scan documents and receipts in PDF', 'Tiny Scanner Pro: PDF Doc Scan', 'Docs To Go™ Free Office Suite', 'OfficeSuite : Free Office + PDF Editor',

In [14]:
# Map each Google Play app name to the largest review count (column 3) seen
# across all of its rows; the header row is skipped.
reviews_max = {}

for row in gp_dataset[1:]:
    app_name, review_count = row[0], float(row[3])
    best_so_far = reviews_max.get(app_name)
    # Store the count for a new name, or when it beats the current maximum.
    if best_so_far is None or best_so_far < review_count:
        reviews_max[app_name] = review_count

# One entry per distinct app name.
print(len(reviews_max))

9659


In [15]:
# Build the de-duplicated Google Play dataset: for every app name keep the
# first row whose review count equals the maximum recorded in `reviews_max`.
android_clean = []
already_added = []   # names in the order they were kept
seen_names = set()   # same names, for O(1) membership tests inside the loop

for data in gp_dataset[1:]:
    name = data[0]
    n_reviews = float(data[3])
    # The name check matters when several rows tie on the maximum count.
    if n_reviews == reviews_max[name] and name not in seen_names:
        android_clean.append(data)
        already_added.append(name)
        seen_names.add(name)

# `duplicate_app_names` skips row 0 as a header, but `android_clean` has no
# header row; prepend the original header so the first app is checked too.
duplicate_app_names([gp_dataset[0]] + android_clean)

[]
duplicate app count: 0


## Remove non-English apps

In [16]:
def is_english(text):
    """Return True when ``text`` has at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. The
    allowance of three lets names that contain a few such characters through.
    """
    non_ascii_total = sum(1 for symbol in text if ord(symbol) > 127)
    return non_ascii_total <= 3

In [17]:
# Spot-check the filter: one emoji is tolerated, four CJK characters are not.
for sample_name in ('Instachat 😜', '你好世界'):
    print(is_english(sample_name))

True
False


In [18]:
# Keep only the apps whose name passes `is_english`. The Google Play name is
# in column 0; the App Store name is in column 1 and its header row is skipped.
android_english_apps = [row for row in android_clean if is_english(row[0])]
ios_english_apps = [row for row in ap_dataset[1:] if is_english(row[1])]

In [19]:
# Number of apps left on each platform after the English-name filter.
for english_apps in (android_english_apps, ios_english_apps):
    print(len(english_apps))

9614
6183


## Remove paid apps

In [20]:
# Keep only free apps. Prices are compared as the raw CSV strings: Google Play
# stores the price in column 7 as '0', the App Store in column 4 as '0.0'.
free_android_english_apps = [
    row for row in android_english_apps if row[7] == '0'
]
free_ios_english_apps = [
    row for row in ios_english_apps if row[4] == '0.0'
]

In [21]:
# Number of free English apps left on each platform.
for free_apps in (free_android_english_apps, free_ios_english_apps):
    print(len(free_apps))

8864
3222


In [22]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of ``dataset``.

    Args:
        dataset: List of rows (each a list); every row is counted, so it
            should not contain a header row.
        index: Column position to tabulate.

    Returns:
        A dict mapping each distinct value found in that column to the
        percentage of rows holding it, with keys in order of first
        appearance. An empty dataset yields an empty dict.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    n_rows = len(dataset)
    return {value: (count / n_rows) * 100 for value, count in counts.items()}

In [23]:
def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    The table comes from ``freq_table(dataset, index)``. Entries are sorted
    in descending order as ``(percentage, value)`` tuples, so equal
    percentages are ordered by value, also descending. Each line is printed
    as ``value : percentage``.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((share, label) for label, share in percentages.items()),
        reverse=True,
    )
    for share, label in ranked:
        print(label, ':', share)

In [24]:
# App Store: share of free English apps per `prime_genre` (fifth column from the end).
display_table(free_ios_english_apps, index=-5)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


In [25]:
# Google Play: share of free English apps per `Category` (column 1).
display_table(free_android_english_apps, index=1)

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

In [26]:
# Google Play: share of free English apps per `Genres` (fourth column from the end).
display_table(free_android_english_apps, index=-4)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

In [27]:
# Average number of user ratings (`rating_count_tot`, column 5) per App Store
# genre, used as a proxy for how many users an app in that genre attracts.
as_genre_table = freq_table(free_ios_english_apps, -5)

# Accumulate the totals in one pass over the dataset instead of rescanning
# every app once per genre. Rows are visited in the same order as before, so
# each genre's sum, and therefore the printed average, is unchanged.
rating_totals = dict.fromkeys(as_genre_table, 0)
app_counts = dict.fromkeys(as_genre_table, 0)
for data in free_ios_english_apps:
    gen = data[-5]
    rating_totals[gen] += float(data[5])
    app_counts[gen] += 1

# Report in the frequency table's key order (first appearance in the data).
for genre in as_genre_table:
    avg = rating_totals[genre] / app_counts[genre]
    print('Genre: ' , genre, 'average ratings: ', avg)

Genre:  Social Networking average ratings:  71548.34905660378
Genre:  Photo & Video average ratings:  28441.54375
Genre:  Games average ratings:  22788.6696905016
Genre:  Music average ratings:  57326.530303030304
Genre:  Reference average ratings:  74942.11111111111
Genre:  Health & Fitness average ratings:  23298.015384615384
Genre:  Weather average ratings:  52279.892857142855
Genre:  Utilities average ratings:  18684.456790123455
Genre:  Travel average ratings:  28243.8
Genre:  Shopping average ratings:  26919.690476190477
Genre:  News average ratings:  21248.023255813954
Genre:  Navigation average ratings:  86090.33333333333
Genre:  Lifestyle average ratings:  16485.764705882353
Genre:  Entertainment average ratings:  14029.830708661417
Genre:  Food & Drink average ratings:  33333.92307692308
Genre:  Sports average ratings:  23008.898550724636
Genre:  Book average ratings:  39758.5
Genre:  Finance average ratings:  31467.944444444445
Genre:  Education average ratings:  7003.983050

In [28]:
# Average install count (`Installs`, column 5) per Google Play category.
categories_android = freq_table(free_android_english_apps, 1)

# Accumulate the totals in one pass over the dataset instead of rescanning
# every app once per category. Rows are visited in the same order as before,
# so each category's sum, and therefore the printed average, is unchanged.
install_totals = dict.fromkeys(categories_android, 0)
app_counts = dict.fromkeys(categories_android, 0)
for app in free_android_english_apps:
    category_app = app[1]
    # Install counts look like '10,000+'; drop the separators and the plus
    # sign so the value can be parsed as a number.
    n_installs = app[5].replace(',', '').replace('+', '')
    install_totals[category_app] += float(n_installs)
    app_counts[category_app] += 1

# Report in the frequency table's key order (first appearance in the data).
for category in categories_android:
    avg_n_installs = install_totals[category] / app_counts[category]
    print(category, ':', avg_n_installs)

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
COMICS : 817657.2727272727
COMMUNICATION : 38456119.167247385
DATING : 854028.8303030303
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11640705.88235294
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4188821.9853479853
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 15588015.603248259
FAMILY : 3695641.8198090694
MEDICAL : 120550.61980830671
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
TRAVEL_AND_LOCAL : 13984077.710144928
TOOLS : 10801391.298666667
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16787331.344927534
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
VIDEO_PLAYERS : 24727872.452830188
NEWS_AND_