# Analysis of Apps for Android and iOS
This project analyzes data about apps this company built for the Google Play and Apple App stores. The goal is to know what type of apps are likely to attract more users to our apps in each app store.

In [1]:
opened_file = open('AppleStore.csv')
from csv import reader
read_file = reader(opened_file)
apple_apps_data = list(read_file)

opened_file_2 = open('googleplaystore.csv')
read_file_2 = reader(opened_file_2)
google_apps_data = list(read_file_2)

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [3]:
explore_data(dataset=apple_apps_data, start=0, end=5, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7198
Number of columns: 16


In [4]:
print(google_apps_data[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [5]:
del google_apps_data[10473]

This Google Play data set has some duplicate entries. Let's clean them up!

In [6]:
duplicate_apps = []
unique_apps = []

for app in google_apps_data:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)
    
print(duplicate_apps[0:4])

['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings']


In [7]:
print('Number of duplicate apps:', len(duplicate_apps))

Number of duplicate apps: 1181


I am not randomly removing the duplicates. I am removing all but the one with the most reviews since the most recent data would have the most reviews. 

In [8]:
reviews_max = {}

for app in google_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

In [9]:
print(len(reviews_max))

explore_data(android_clean, 0, 3, True)

9659


NameError: name 'android_clean' is not defined

In [12]:
android_clean = []
already_added = []

for app in google_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name)

In [13]:
print(len(android_clean))

9659


In the previous blocks I used a loop to rank duplicate apps by number of ratings and identify the duplicate with the highest number of ratings. All apps with the highest number of ratings go into the dictionary called reviews_max. 

I then append these values into the list android_clean and identify the names of these no longer duplicate apps in the list already_added. This also helps the loop to check if a duplicate is coming in. 

In [20]:
def is_english(string):
    non_ascii = 0
    
    for c in string:
        char = ord(c)
        if char > 127:
            non_ascii += 1
            
        if non_ascii > 3:
            return False
        else:
            return True
        
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english('Instachat 😜'))

True
True


In [15]:
android_english = []
ios_english = []

for app in android_clean:
    if is_english(app):
        android_english.append(app)
        
for app in ios_clean:
    if is_english(app):
        ios_english.append(app)

TypeError: ord() expected a character, but string of length 46 found

In [None]:
free_gp_apps = []
free_ap_apps = []

for app in google_apps_data[1:]:
    price = app[7]
    if price == '0':
        free_gp_apps.append(app)

for app in apple_apps_data[1:]:
    price = app[7]
    if price == '0.0':
        free_ap_apps.append(app)

print(len(free_gp_apps))
print(len(free_ap_apps))