## Data Analytics for Mobile App Stores

This project is based on the guided project from the course "Python for Data Science: Fundamentals" on dataquest.io.
This is my first Python project, so its main purpose is to practice and master the fundamentals of Python.

The dataset used here is Google Play Store data from a few years ago.

The aims of this project are:

1. To open, read and handle datasets.
2. To clean the dataset.
3. To understand the structure of datasets.
4. To become familiar with the basic tools of Python, such as the various data types, loops, methods and functions.
5. To create useful (but basic) statistics based on the data provided.

Reminders: 

1. In the 3rd cell, the `del` statement removes a malformed row and must take effect only once: deleting at that index a second time would remove a valid row.

In [1]:
# Importing and reading the dataset

from csv import reader

# Google Play Store Dataset

# The with-block closes the file as soon as every row has been read,
# even if reading fails part-way through.
with open('googleplaystore.csv', encoding="utf8") as opened_file:
    read_file = reader(opened_file)
    play_store = list(read_file)

# Here, we partition our dataset into the column headers and the data points

play_store_header = play_store[0]
play_store = play_store[1:]

In [2]:
# To have a look at and understand the dataset
# To know the number of rows and columns in our dataset

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: List of rows, where each row is itself a list of values.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the number of rows and the
            number of columns (measured on the first row).

    Returns:
        None. Everything is written to stdout.
    """
    for row in dataset[start:end]:
        print(row)
        print("\n")  # print adds its own newline, so this leaves two blank lines

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of raising IndexError.
        n_columns = len(dataset[0]) if dataset else 0
        print('Number of columns:', n_columns)
        
# explore_data prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would add a stray "None" line.
explore_data(play_store, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13
None


In [3]:
#Dealing with Row Error

# The row at index 10472 (header excluded) is malformed: its number of fields
# does not match the header, so it is dropped.
# The length check makes this cell safe to re-run. Once the bad row is gone,
# the row that moved into this index matches the header and is left alone.
if len(play_store[10472]) != len(play_store_header):
    del play_store[10472]
print(play_store[10472])

['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


In [4]:
#Checking for Duplicate Apps

duplicate_apps = []
unique_apps = []
# Same names as unique_apps, held in a set so each membership test is O(1)
# rather than a scan of the whole list.
seen_names = set()

for app in play_store:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:''\n', duplicate_apps[:10])

#There are many duplicate entries of the same app.

print('\n')

# Print every entry recorded under one of the duplicated app names
for app in play_store:
    if app[0] != "Quick PDF Scanner + OCR FREE":
        continue
    print(app)
        
# Amongst duplicate entries, the number of reviews is different.
# This indicates that the entries were taken at different times.
# Hence, it would be useful to keep the most recent entry.
# The most recent entry has the most number of reviews.

Number of duplicate apps: 1181


Examples of duplicate apps:
 ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80804', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']


In [5]:
#Expected length of data after removing duplicates.

# The count comes from duplicate_apps rather than a hard-coded 1181, so the
# figure stays correct if the dataset or the earlier cleaning steps change.
print("Expected length:", len(play_store) - len(duplicate_apps))
print('\n')

#To detect the apps with most number of reviews to be kept as originals

reviews_max = {}

for app in play_store:
    name, n_reviews = app[0], float(app[3])
    best_so_far = reviews_max.get(name)
    # Store the count on a first sighting, or when it beats the stored one
    if best_so_far is None or best_so_far < n_reviews:
        reviews_max[name] = n_reviews

print("Actual length:", len(reviews_max))
print('\n')


Expected length: 9659


Actual length: 9659




In [6]:
# Removal of Duplicates 

play_store_clean = []
already_added = []
# Same names as already_added, held in a set so each membership test is O(1)
# rather than a scan of the whole list.
added_names = set()

for app in play_store:
    name = app[0]
    n_reviews = float(app[3])
    # Keep only the entry with the highest review count; the name check makes
    # sure entries tied on that count still contribute a single row.
    if n_reviews == reviews_max[name] and name not in added_names:
        play_store_clean.append(app)
        already_added.append(name)
        added_names.add(name)

#explore_data(play_store_clean, 0,5)

if len(play_store_clean) == len(reviews_max):
    print("Removal of duplicates is successful")

Removal of duplicates is successful


In [7]:
# Function to check if a string is in English

def check_eng(string):
    """Return True when ``string`` has at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. Up to
    three are tolerated, so a name with a few symbols or an emoji still
    passes.
    """
    beyond_ascii = sum(1 for symbol in string if ord(symbol) > 127)
    return beyond_ascii <= 3
        
# Checking if the function works

#print(check_eng('Instagram'))
#print(check_eng('爱奇艺PPS -《欢乐颂2》电视剧热播'))
#print(check_eng('Docs To Go™ Free Office Suite'))
#print(check_eng('Instachat 😜'))

# Removing Non-English apps from Play Store Data

# play store cleaned, english: rows whose app name passes check_eng
play_sce = [app for app in play_store_clean if check_eng(app[0])]

#print(play_sce[:5])
print("Number of English apps:", len(play_sce))

Number of English apps: 9614


In [8]:
# Isolating the free apps

# play store cleaned, english, free: rows whose price column is exactly '0'
play_scef = [app for app in play_sce if app[7] == '0']

print("Number of free English apps:", len(play_scef))

Number of free English apps: 8864


In [9]:
# To create a frequency table for genres

genre_freq = {}

for app in play_scef:
    # An unseen genre starts from zero before this app is counted
    genre_freq[app[9]] = genre_freq.get(app[9], 0) + 1

# To understand how the frequency table of genres looks

print("This is the dictionry of genres:""\n\n")

for genre, count in genre_freq.items():
    print(genre, ":", count)

This is the dictionry of genres:


Art & Design : 53
Art & Design;Creativity : 6
Auto & Vehicles : 82
Beauty : 53
Books & Reference : 190
Business : 407
Comics : 54
Comics;Creativity : 1
Communication : 287
Dating : 165
Education : 474
Education;Creativity : 4
Education;Education : 30
Education;Pretend Play : 5
Education;Brain Games : 3
Entertainment : 538
Entertainment;Brain Games : 7
Entertainment;Creativity : 3
Entertainment;Music & Video : 15
Events : 63
Finance : 328
Food & Drink : 110
Health & Fitness : 273
House & Home : 73
Libraries & Demo : 83
Lifestyle : 345
Lifestyle;Pretend Play : 1
Card : 40
Arcade : 164
Puzzle : 100
Racing : 88
Sports : 307
Casual : 156
Simulation : 181
Adventure : 60
Trivia : 37
Action : 275
Word : 23
Role Playing : 83
Strategy : 81
Board : 34
Music : 18
Action;Action & Adventure : 9
Casual;Brain Games : 12
Educational;Creativity : 3
Puzzle;Brain Games : 15
Educational;Education : 35
Casual;Pretend Play : 21
Educational;Brain Games : 6
Art & Design;Prete

In [10]:
# To output the percentage of apps in each header column of interest (for example, categories, genres, etc.,)

def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: Iterable of rows, each indexable by ``index``.
        index: Position of the column to summarise.

    Returns:
        A dict mapping each distinct value to its share of all rows as a
        percentage, with keys in order of first appearance. An empty
        dataset gives an empty dict.
    """
    counts = {}
    n_rows = 0
    for n_rows, row in enumerate(dataset, start=1):
        cell = row[index]
        counts[cell] = counts.get(cell, 0) + 1

    return {cell: (count / n_rows) * 100 for cell, count in counts.items()}
      
#print(freq_table(play_scef, 9))

# To print out the percentage of apps in each category of the Play Store in descending order

def display_table(dataset, index):
    """Print each distinct value of a column with its percentage, largest first.

    Args:
        dataset: Rows passed straight through to ``freq_table``.
        index: Position of the column to summarise.

    Returns:
        None. One "value : percentage" line is printed per distinct value.
    """
    shares = freq_table(dataset, index)
    # Reverse-sorting (percentage, value) pairs orders by percentage and
    # breaks ties by value, both descending.
    ranked = sorted(((share, value) for value, share in shares.items()), reverse=True)
    for share, value in ranked:
        print(value, ':', share)
        
# display_table prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would add a stray "None" line.
display_table(play_scef, 1)

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

In [11]:
# To print out the percentage of apps in each download category of the Play Store in descending order

# display_table prints its own output and returns nothing, so it is called
# directly; wrapping it in print() would add a stray "None" line.
display_table(play_scef, 5)

1,000,000+ : 15.726534296028879
100,000+ : 11.552346570397113
10,000,000+ : 10.548285198555957
10,000+ : 10.198555956678701
1,000+ : 8.393501805054152
100+ : 6.915613718411552
5,000,000+ : 6.825361010830325
500,000+ : 5.561823104693141
50,000+ : 4.7721119133574
5,000+ : 4.512635379061372
10+ : 3.5424187725631766
500+ : 3.2490974729241873
50,000,000+ : 2.3014440433213
100,000,000+ : 2.1322202166064983
50+ : 1.917870036101083
5+ : 0.78971119133574
1+ : 0.5076714801444043
500,000,000+ : 0.2707581227436823
1,000,000,000+ : 0.22563176895306858
0+ : 0.04512635379061372
0 : 0.01128158844765343
None


In [12]:
# To find the average number of app installs in each category of the Play Store

# First, to get a frequency table of the categories

def freq_table_norm(dataset, index):
    """Count how many rows carry each distinct value in one column.

    Args:
        dataset: Iterable of rows, each indexable by ``index``.
        index: Position of the column to count.

    Returns:
        A dict mapping each distinct value to its number of occurrences,
        with keys in order of first appearance.
    """
    counts = {}
    for row in dataset:
        cell = row[index]
        counts[cell] = counts.get(cell, 0) + 1
    return counts

c_play = freq_table_norm(play_scef, 1)
print("Number of categories :", len(c_play))
print("\n")

# Second, to create a dictionary with average app installs of each category

# A single pass over the apps accumulates the install total of every category,
# instead of rescanning the whole dataset once per category.
install_totals = {}

for app in play_scef:
    # Install counts look like '1,000,000+'; strip the commas and the plus sign
    n_installs = float(app[5].replace(",", "").replace("+", ""))
    install_totals[app[1]] = install_totals.get(app[1], 0) + n_installs

# c_play already holds the number of apps per category, so it is the divisor
avg_install_dict = {
    category: install_totals[category] / c_play[category]
    for category in c_play
}
      
    
#print(avg_install_dict)
#print("\n")

# Third, to print the categories in descending order of average app installs 

def display_table_dict(dict):
    """Print the entries of a dictionary ordered by value, largest first.

    Args:
        dict: Mapping of label to a sortable value. The parameter name
            shadows the builtin inside this function.

    Returns:
        None. One "label : value" line is printed per entry.
    """
    # Reverse-sorting (value, label) pairs orders by value and breaks ties
    # by label, both descending.
    ranked = sorted(((value, label) for label, value in dict.items()), reverse=True)
    for value, label in ranked:
        print(label, ':', value)
        
# Called for its printed output only; display_table_dict has no return value.
display_table_dict(avg_install_dict)

Number of categories : 33


COMMUNICATION : 38456119.167247385
VIDEO_PLAYERS : 24727872.452830188
SOCIAL : 23253652.127118643
PHOTOGRAPHY : 17840110.40229885
PRODUCTIVITY : 16787331.344927534
GAME : 15588015.603248259
TRAVEL_AND_LOCAL : 13984077.710144928
ENTERTAINMENT : 11640705.88235294
TOOLS : 10801391.298666667
NEWS_AND_MAGAZINES : 9549178.467741935
BOOKS_AND_REFERENCE : 8767811.894736841
SHOPPING : 7036877.311557789
PERSONALIZATION : 5201482.6122448975
WEATHER : 5074486.197183099
HEALTH_AND_FITNESS : 4188821.9853479853
MAPS_AND_NAVIGATION : 4056941.7741935486
FAMILY : 3695641.8198090694
SPORTS : 3638640.1428571427
ART_AND_DESIGN : 1986335.0877192982
FOOD_AND_DRINK : 1924897.7363636363
EDUCATION : 1833495.145631068
BUSINESS : 1712290.1474201474
LIFESTYLE : 1437816.2687861272
FINANCE : 1387692.475609756
HOUSE_AND_HOME : 1331540.5616438356
DATING : 854028.8303030303
COMICS : 817657.2727272727
AUTO_AND_VEHICLES : 647317.8170731707
LIBRARIES_AND_DEMO : 638503.734939759
PARENTING : 5426