# App Profile Recommendation
I will pretend to be part of a company that builds both Android and iOS mobile apps. My goal is to analyse data to help developers understand what types of apps are likely to attract more users.

In [18]:
from csv import reader
# Read each CSV fully into a list of rows (row 0 is the header).
# `with` closes the file as soon as it has been parsed, and `newline=''`
# lets the csv module handle line endings inside quoted fields itself.
with open('AppleStore.csv', encoding='utf8', newline='') as app_store_file:
    app_store = list(reader(app_store_file))  # 16 columns, 7198 rows

with open('googleplaystore.csv', encoding='utf8', newline='') as play_store_file:
    play_store = list(reader(play_store_file))  # 13 columns, 10842 rows

In [19]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a dataset and, optionally, its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a list.
        start: Index of the first row to print (inclusive).
        end: Index at which printing stops (exclusive).
        rows_and_columns: When true, also print the number of rows and
            columns of the whole dataset (not just the slice).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # The column count is taken from the first row; an empty dataset
        # has no first row, so report 0 instead of raising IndexError.
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)

In [20]:
# Show the header and the first two Google Play rows, plus the dataset size.
explore_data(play_store, start=0, end=3, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


In [21]:
# Show the header and the first two App Store rows, plus the dataset size.
explore_data(app_store, start=0, end=3, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7198
Number of columns: 16


## Displaying the columns

In [22]:
# The first row of the App Store dataset holds the column names.
app_store_header = app_store[0]
app_store_header  # bare expression at the end of the cell, so its value is displayed

['id',
 'track_name',
 'size_bytes',
 'currency',
 'price',
 'rating_count_tot',
 'rating_count_ver',
 'user_rating',
 'user_rating_ver',
 'ver',
 'cont_rating',
 'prime_genre',
 'sup_devices.num',
 'ipadSc_urls.num',
 'lang.num',
 'vpp_lic']

In [23]:
# The first row of the Google Play dataset holds the column names.
play_store_header = play_store[0]
play_store_header  # bare expression at the end of the cell, so its value is displayed

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

### Documentation for datasets

https://www.kaggle.com/lava18/google-play-store-apps

https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps

## Testing if dataset contains any empty values

In [61]:
# Report every empty cell in the Google Play data rows.
# The emptiness test has to run for each cell, i.e. inside the inner loop;
# iterating over the row itself also covers every column the row actually
# has, whatever its length.
for i, row in enumerate(play_store):
    if i == 0:
        continue  # index 0 is the header row, not app data

    for x, value in enumerate(row):
        if not str(value):
            print("Index", i, " is not correct")
            print(value)
            print(row)

print("finished")

finished


In [59]:
#del play_store[1553]
#Index 1553 was incorrect

## Duplicates
Looking through the data, we can see that many apps appear more than once. An example is given below.

In [66]:
# Print every Google Play row whose app name (first column) is "Instagram".
instagram_rows = (row for row in play_store if row[0] == "Instagram")
for row in instagram_rows:
    print(row)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


It can be seen that there are 4 entries for Instagram. They differ only in the Reviews column (the fourth value), so we can use it to decide which entry to keep: the highest number of reviews should belong to the most recently collected entry, so we keep that one.

### Getting all duplicate apps

In [100]:
# Split the app names into first occurrences and repeats.
# Both lists are (re)initialised here so the cell gives the same result
# every time it is run, instead of depending on leftover notebook state.
duplicate_apps = []
unique_apps = []
seen_names = set()  # same names as unique_apps, kept as a set for fast lookups

for app in play_store:
    name = app[0]

    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print("Number of duplicate apps: ", len(duplicate_apps))
print("\n")
print("Example of duplicate apps: ", duplicate_apps[:15])

Number of duplicate apps:  1181


Example of duplicate apps:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


### Selecting the best pick


In [124]:
# Map each app name to the largest review count found among its rows.
reviews = {}

for entry in play_store:
    app_name, review_count = entry[0], float(entry[3])

    # Store the count the first time a name is seen, and afterwards only
    # when a later row reports strictly more reviews.
    if app_name not in reviews or reviews[app_name] < review_count:
        reviews[app_name] = review_count

print(len(play_store))
# NOTE(review): presumably len(reviews) was meant here, to compare against
# the row count above - confirm.
print(len(duplicate_apps))

10839
0


In [125]:
# Keep exactly one row per app name: the first row whose review count equals
# the maximum recorded for that name in `reviews`.
android_clean = []
already_added = []
added_names = set()  # mirrors already_added; set lookups avoid a list scan per row

for app in play_store:
    name = app[0]
    n_reviews = float(app[3])

    # The "not yet added" check is needed because several rows of the same
    # app can share the maximum review count.
    if reviews[name] == n_reviews and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)
                

In [126]:
# Show the first three de-duplicated Google Play rows, plus the dataset size.
explore_data(android_clean, start=0, end=3, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9658
Number of columns: 13
