# Profitable app profiles for the App Store and Google Play Market:

## This project aims to analyze data for a company that builds Android and iOS mobile apps, with the goal to understand what kind of apps are more attractive for users on Google Play and App Store


> The code created at the guided project from dataquest will be adapted to use the main concepts of objected-oriented programming (OOP)



### Class Dataset 


> This class will be responsible for storing the dataset, whether referring to Apple or Google apps, in order to perform all the functions of cleaning, organizing and analyzing the data



In [105]:
from csv import reader

class Dataset():
  def __init__(self, file):
    '''This function will recieve the filename of the dataset and store its information in list of lists'''
    open_file = open(file)
    read_file = reader(open_file)
    complet_file = list(read_file)
    self.header = complet_file[0:1]
    self.data = complet_file[1:] 
  
  def explore_data(self, start, end, rows_and_columns=False):
    '''This functions explores the dataset's data, including or not the number of rows and columns'''
    dataset_slice = self.data[start:end]
    for row in dataset_slice:
        print(row)
        print('\n')
    if rows_and_columns:
        print('Number of rows:', len(self.data))
        print('Number of columns:', len(self.data[0]))

  def delete_wrong_row(self, index):
    '''This function deletes a wrong row present on the dataset'''
    del self.data[index]
  
  def remove_duplicate_rows(self):
    '''This function removes any duplicated rows in the dataset
       Important: the rows won't be deleted randomly, we'll keep the ones with the highest number of reviews
       Also important: there are duplicated rows only for Google Play Store data
    '''
    #Checking the number of duplicated rows
    duplicate = []
    unique = []

    for app in self.data:
      name = app[0]
      if name in unique:
          duplicate.append(name)
      else:
          unique.append(name)
    duplicated = len(duplicate)

    #Number of rows in the dataset without any duplicated rows
    reviews_max = {}
    for app in self.data:
      name = app[0]
      n_reviews = float(app[3])
      if name in reviews_max and reviews_max[name] < n_reviews:
          reviews_max[name] = n_reviews
      elif name not in reviews_max:
          reviews_max[name] = n_reviews
    
    correct_number_of_rows = len(reviews_max)

    #Cleaning the data
    cleaned_data = []
    already_added = []
    for app in self.data:
      name = app[0]
      n_reviews = float(app[3])
      if n_reviews == reviews_max[name] and name not in already_added:
          cleaned_data.append(app)
          already_added.append(name)
    self.data = cleaned_data
    actual_number_of_rows = len(self.data)

    print(f'Number of duplicated rows: {duplicated}')
    print(f'Correct number of rows without any duplicated: {correct_number_of_rows}')
    print(f'Actual number of rows after cleaning: {actual_number_of_rows}')
  
  def is_english(string):
    '''This function checks if the apps names are in english or not
       Important: it may have emojis at the app's name, so, we'll keep a maximum of 3 non-English characters   
    '''
    non_english = 0
    for l in string:
        num = ord(l)
        if num > 127:
            non_english += 1
    if non_english > 3:
        return False
    else:
        return True

  def remove_non_english_apps(self):
    '''This function removes apps with non-english characters'''
    english = []
    for app in self.data:
      name = app[0]
      if Dataset.is_english(name):
        english.append(app)
    self.data = english

    print(f'Actual size of dataset: {len(self.data)}')

  def include_only_free_apps(self, index):
    '''This function includes only free apps in the dataset
       Important: the column price index for the datasets is different, so it must be passed as a function parameter
    '''
    final = []
    for app in self.data:
      price = app[index]
      if price == '0' or price == '0.0':
        final.append(app)
    self.data = final
    print(f'Actual size of dataset: {len(self.data)}')

  def freq_table(self, index):
    '''This function calculares the frequency table for a column in the dataset'''
    dic = {}
    total = 0
    for row in self.data:
        feature = row[index]
        if feature in dic:
            dic[feature] += 1
        else:
            dic[feature] = 1
        total += 1
    dic_porc = {}
    for key in dic:
        porc = (dic[key]/total)*100
        dic_porc[key] = porc
    return dic_porc

  def display_table(self, index):
    '''This function displays the frequency table for a column in the dataset'''
    table = self.freq_table(index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
  
  def get_dataset(self):
    return self.data

### Instanciating classes

In [106]:
google = Dataset('googleplaystore.csv')
apple = Dataset('AppleStore.csv')

### Exploring data

In [107]:
google.explore_data(0, 5, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 10841
Number of columns: 13


In [108]:
apple.explore_data(0, 5, True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


Number of rows: 7197
Number of columns: 16


### Deleting wrong column from Google Play Store Data

In [109]:
google.delete_wrong_row(10472)

### Removing duplicate rows from Google Play Store Data

In [110]:
google.remove_duplicate_rows()

Number of duplicated rows: 1181
Correct number of rows without any duplicated: 9659
Actual number of rows after cleaning: 9659


### Removing non-English apps

In [111]:
google.remove_non_english_apps()

Actual size of dataset: 9614


In [112]:
apple.remove_non_english_apps()

Actual size of dataset: 7197


### Including only free apps

In [113]:
google.include_only_free_apps(7)

Actual size of dataset: 8864


In [114]:
apple.include_only_free_apps(4)

Actual size of dataset: 4056


### Inspecting data

#### prime_genre column at Apple Store Dataset

In [115]:
apple.display_table(-5)

Games : 55.64595660749507
Entertainment : 8.234714003944774
Photo & Video : 4.117357001972387
Social Networking : 3.5256410256410255
Education : 3.2544378698224854
Shopping : 2.983234714003945
Utilities : 2.687376725838264
Lifestyle : 2.3175542406311638
Finance : 2.0710059171597637
Sports : 1.947731755424063
Health & Fitness : 1.8737672583826428
Music : 1.6518737672583828
Book : 1.6272189349112427
Productivity : 1.5285996055226825
News : 1.4299802761341223
Travel : 1.3806706114398422
Food & Drink : 1.0601577909270217
Weather : 0.7642998027613412
Reference : 0.4930966469428008
Navigation : 0.4930966469428008
Business : 0.4930966469428008
Catalogs : 0.22189349112426035
Medical : 0.19723865877712032


#### Category column at Google Play Store Dataset

In [116]:
google.display_table(1)

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

#### Genres column at Google Play Store Dataset

In [117]:
google.display_table(-4)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

### Average number of installs for each app genre at Apple Store dataset

In [118]:
genres_apple = apple.freq_table(-5)

for genre in genres_apple:
    total = 0
    len_genre = 0
    for app in apple.get_dataset():
        genre_app = app[-5]
        if genre_app == genre:
            number_ratings = float(app[5])
            total += number_ratings
            len_genre += 1
    avg_ratings = total/len_genre
    print(genre, ':', avg_ratings)

Social Networking : 53078.195804195806
Photo & Video : 27249.892215568863
Games : 18924.68896765618
Music : 56482.02985074627
Reference : 67447.9
Health & Fitness : 19952.315789473683
Weather : 47220.93548387097
Utilities : 14010.100917431193
Travel : 20216.01785714286
Shopping : 18746.677685950413
News : 15892.724137931034
Navigation : 25972.05
Lifestyle : 8978.308510638299
Entertainment : 10822.961077844311
Food & Drink : 20179.093023255813
Sports : 20128.974683544304
Book : 8498.333333333334
Finance : 13522.261904761905
Education : 6266.333333333333
Productivity : 19053.887096774193
Business : 6367.8
Catalogs : 1779.5555555555557
Medical : 459.75


#### Average number of installs for each category at Google App Store Dataset

In [119]:
categories_android = google.freq_table(1)

for category in categories_android:
    total = 0
    len_category = 0
    for app in google.get_dataset():
        category_app = app[1]
        if category_app == category:            
            n_installs = app[5]
            n_installs = n_installs.replace(',', '')
            n_installs = n_installs.replace('+', '')
            total += float(n_installs)
            len_category += 1
    avg_n_installs = total / len_category
    print(category, ':', avg_n_installs)

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
COMICS : 817657.2727272727
COMMUNICATION : 38456119.167247385
DATING : 854028.8303030303
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11640705.88235294
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4188821.9853479853
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 15588015.603248259
FAMILY : 3695641.8198090694
MEDICAL : 120550.61980830671
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
TRAVEL_AND_LOCAL : 13984077.710144928
TOOLS : 10801391.298666667
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16787331.344927534
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
VIDEO_PLAYERS : 24727872.452830188
NEWS_AND_

### Conclusions

In this project, we analyzed data about the App Store and Google Play mobile apps with the goal of recommending an app profile that can be profitable for both markets.

We concluded that taking a popular book (perhaps a more recent book) and turning it into an app could be profitable for both the Google Play and the App Store markets. The markets are already full of libraries, so we need to add some special features besides the raw version of the book. This might include daily quotes from the book, an audio version of the book, quizzes on the book, a forum where people can discuss the book, etc.