**Foundations of Computer Science: Project**

Notebook by Louis Fabrice Tshimanga

**1. Convert the app sizes to a number**

In [1]:
# General import
import pandas as pd
import numpy as np
import re

In [2]:
# Exploration/inspection of the data set

googleplay = pd.read_csv('googleplaystore.csv')
googleplay.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [3]:
googleplay.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [4]:
googleplay.shape

(10841, 13)

In [5]:
print(googleplay['Size'])

0                       19M
1                       14M
2                      8.7M
3                       25M
4                      2.8M
5                      5.6M
6                       19M
7                       29M
8                       33M
9                      3.1M
10                      28M
11                      12M
12                      20M
13                      21M
14                      37M
15                     2.7M
16                     5.5M
17                      17M
18                      39M
19                      31M
20                      14M
21                      12M
22                     4.2M
23                     7.0M
24                      23M
25                     6.0M
26                      25M
27                     6.1M
28                     4.6M
29                     4.2M
                ...        
10811                  3.9M
10812                   13M
10813                  2.7M
10814                   31M
10815               

In [6]:
#Convert Sizes to number values

#sizes are expressed in millions ['M'] or thousands ['k'] of bytes, or reported as 'Varies with device'
#Let's account for that by expressing every size in millions of bytes (as a fraction when needed)

#function to convert single string
def sizeconverter(size_as_string):
    """Convert a size label such as '19M' or '201k' to a number of millions.

    Parameters
    ----------
    size_as_string : object
        Raw entry of the 'Size' column; normally a string made of a number
        followed by a unit letter ('M' or 'k').

    Returns
    -------
    float or object
        The size as a float in units of 'M' ('k' values are divided by 1000).
        Any entry that cannot be interpreted (for example 'Varies with device',
        NaN, or an unknown unit letter) is returned unchanged, which keeps the
        anomalies visible in the data set instead of silently mis-converting
        them.
    """
    try:
        # Raw string so that '\d' and '\.' reach the regex engine untouched.
        number, unit_measure = re.findall(
            r'^(\d+\.*\d*)\D*?([A-Za-z])$', size_as_string
        )[0]
        size_as_number = float(number)
    except (TypeError, IndexError, ValueError):
        # TypeError: not a string (e.g. NaN); IndexError: pattern not found;
        # ValueError: digits that do not form a valid float.
        return size_as_string

    if unit_measure == 'M':
        return size_as_number
    if unit_measure == 'k':
        return size_as_number / 1000
    # Unknown unit: do not guess a scale, hand the raw value back.
    return size_as_string


In [7]:
# Convert every entry of the 'Size' column; the function is passed directly,
# no wrapping lambda is needed.
googleplay['Size'] = googleplay['Size'].apply(sizeconverter)
googleplay.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [8]:
# Rename the converted column in place so that its unit is visible in the header.
# NOTE(review): 'Mb' conventionally denotes megabits while the values are presumably
# megabytes; later cells select the column by this exact name, so it is left as is.
googleplay.rename(columns={'Size':'Size [Mb]'}, inplace=True)
googleplay.head()

Unnamed: 0,App,Category,Rating,Reviews,Size [Mb],Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


**2. Convert the number of installs to a number**

In [9]:
#function to convert installs in numbers
def installconverter(install_as_string):
    """Convert an install count such as '10,000+' to a float.

    The thousands separators are removed and the leading number is converted
    as a whole, so counts written without separators (e.g. '1000+') are parsed
    correctly as well.

    Parameters
    ----------
    install_as_string : object
        Raw entry of the 'Installs' column: a string of digits (optionally
        grouped by commas) followed by non-digit characters such as '+', or
        an already numeric value.

    Returns
    -------
    float or object
        The number of installs as a float. Floats are returned as they are and
        ints are cast to float. An entry that cannot be parsed is printed
        together with its type and returned unchanged, so that it can be
        located in the data set afterwards.
    """
    if isinstance(install_as_string, float):
        return install_as_string
    if isinstance(install_as_string, int) and not isinstance(install_as_string, bool):
        return float(install_as_string)

    try:
        # Digits, optionally followed by comma-separated digit groups, then
        # any run of non-digits (typically the trailing '+').
        match = re.match(r'^(\d+(?:,\d+)*)\D*$', install_as_string)
        if match is None:
            raise ValueError('no install count found')
        return float(match.group(1).replace(',', ''))
    except (TypeError, ValueError):
        # TypeError: the entry is not a string; ValueError: no number in it.
        print(install_as_string)
        print(type(install_as_string))
        return install_as_string

In [10]:
# Convert every entry of the 'Installs' column; entries that cannot be parsed
# are printed by installconverter and left unchanged.
googleplay['Installs'] = googleplay['Installs'].apply(installconverter)
googleplay.head()

Free
<class 'str'>


Unnamed: 0,App,Category,Rating,Reviews,Size [Mb],Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000.0,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000.0,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000.0,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000.0,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000.0,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [11]:
# 'Free' is the only value installconverter reported as unparsable (see the output of
# the previous cell). The boolean mask selects the rows holding it and the assignment
# overwrites every column of those rows with NaN.
# NOTE(review): this blanks the whole record, app name included; presumably the row
# has its fields shifted in the CSV. Confirm that discarding it is intended.
googleplay[googleplay['Installs']=='Free'] = np.nan

**3. Transform “Varies with device” into a missing value**

In [12]:
def varies_transf(string):
    """Return NaN for the placeholder 'Varies with device', any other value as is."""
    return np.nan if string == 'Varies with device' else string

In [13]:
googleplay[googleplay['Size [Mb]'] == 'Varies with device'].head()

Unnamed: 0,App,Category,Rating,Reviews,Size [Mb],Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
37,Floor Plan Creator,ART_AND_DESIGN,4.1,36639,Varies with device,5000000.0,Free,0,Everyone,Art & Design,"July 14, 2018",Varies with device,2.3.3 and up
42,Textgram - write on photos,ART_AND_DESIGN,4.4,295221,Varies with device,10000000.0,Free,0,Everyone,Art & Design,"July 30, 2018",Varies with device,Varies with device
52,Used Cars and Trucks for Sale,AUTO_AND_VEHICLES,4.6,17057,Varies with device,1000000.0,Free,0,Everyone,Auto & Vehicles,"July 30, 2018",Varies with device,Varies with device
67,Ulysse Speedometer,AUTO_AND_VEHICLES,4.3,40211,Varies with device,5000000.0,Free,0,Everyone,Auto & Vehicles,"July 30, 2018",Varies with device,Varies with device
68,REPUVE,AUTO_AND_VEHICLES,3.9,356,Varies with device,100000.0,Free,0,Everyone,Auto & Vehicles,"May 25, 2018",Varies with device,Varies with device


In [14]:
# 'Varies with device' appears in at least three columns, so every column is
# scanned and the placeholder is replaced by NaN wherever it occurs.

for col in googleplay.columns:
    googleplay[col] = googleplay[col].apply(varies_transf)

googleplay.head()

Unnamed: 0,App,Category,Rating,Reviews,Size [Mb],Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000.0,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000.0,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000.0,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000.0,Free,0,Teen,Art & Design,"June 8, 2018",,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000.0,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [15]:
googleplay.loc[37, :]

App               Floor Plan Creator
Category              ART_AND_DESIGN
Rating                           4.1
Reviews                        36639
Size [Mb]                        NaN
Installs                       5e+06
Type                            Free
Price                              0
Content Rating              Everyone
Genres                  Art & Design
Last Updated           July 14, 2018
Current Ver                      NaN
Android Ver             2.3.3 and up
Name: 37, dtype: object

**4. Convert Current Ver and Android Ver into a dotted number (e.g. 4.0.3 or 4.2)**

In [16]:
print(googleplay.loc[:, 'Current Ver':'Android Ver'])

      Current Ver   Android Ver
0           1.0.0  4.0.3 and up
1           2.0.0  4.0.3 and up
2           1.2.4  4.0.3 and up
3             NaN    4.2 and up
4             1.1    4.4 and up
5             1.0    2.3 and up
6             1.1  4.0.3 and up
7        6.1.61.1    4.2 and up
8           2.9.2    3.0 and up
9             2.8  4.0.3 and up
10          1.0.4    4.1 and up
11         1.0.15    4.0 and up
12            3.8    4.1 and up
13          1.0.4    4.4 and up
14          1.2.3    2.3 and up
15            NaN    4.2 and up
16            3.1    4.1 and up
17            1.0    2.3 and up
18          2.2.5  4.0.3 and up
19          5.5.4    4.1 and up
20            4.0    4.1 and up
21            1.1  4.0.3 and up
22        2.2.6.2  4.0.3 and up
23          1.0.0    4.1 and up
24          1.1.3    4.1 and up
25            1.5    3.0 and up
26          1.0.8  4.0.3 and up
27           1.03  4.0.3 and up
28            6.0    2.3 and up
29            1.0    2.3 and up
...     

In [17]:
# keep only the dotted number (the first column has dotted numbers of unusual length, but it is not specified whether that is a problem)

def ver_converter(version):
    """Extract the dotted version number (e.g. '4.0.3' or '4.2') from an entry.

    Parameters
    ----------
    version : object
        Raw entry of 'Current Ver' / 'Android Ver', e.g. '4.0.3 and up'.

    Returns
    -------
    str or float
        * for a string: the first run of digits separated by single dots;
          when the string contains no digit it is printed together with its
          type and NaN is returned;
        * the literal string 'nan' is returned unchanged;
        * a missing value (float NaN) stays NaN, so that it is still
          recognised as missing by pandas;
        * any other int or float is returned as its string representation;
        * every other object is returned unchanged.
    """
    if isinstance(version, str):
        if version == 'nan':
            return version
        # Digits with single dots in between: no '+', '?' or trailing dot can
        # end up in the result.
        match = re.search(r'\d+(?:\.\d+)*', version)
        if match is None:
            print(version)
            print(type(version))
            return np.nan
        return match.group(0)

    if isinstance(version, float) and version != version:
        # NaN is the only float different from itself: keep it as missing.
        return version

    if type(version) in (int, float):
        return str(version)

    return version

In [18]:
# Reduce both version columns to their dotted number; entries without any
# number are printed by ver_converter.
for col in ('Current Ver', 'Android Ver'):
    googleplay[col] = googleplay[col].apply(ver_converter)

googleplay.loc[:, 'Current Ver':'Android Ver']

Initial
<class 'str'>
Natalia Studio Development
<class 'str'>
closed
<class 'str'>
newversion
<class 'str'>
Final
<class 'str'>
opciÃ³n de cerrar
<class 'str'>
App copyright
<class 'str'>
Copyright
<class 'str'>
Gratis
<class 'str'>
KM
<class 'str'>
DH-Security Camera
<class 'str'>
HTTPs
<class 'str'>
Human Dx
<class 'str'>
Final
<class 'str'>
BlueOrange
<class 'str'>
MONEY
<class 'str'>


Unnamed: 0,Current Ver,Android Ver
0,1.0.0,4.0.3
1,2.0.0,4.0.3
2,1.2.4,4.0.3
3,,4.2
4,1.1,4.4
5,1.0,2.3
6,1.1,4.0.3
7,6.1.61.1,4.2
8,2.9.2,3.0
9,2.8,4.0.3


**5. Remove the duplicates**

In [19]:
googleplay.head()

Unnamed: 0,App,Category,Rating,Reviews,Size [Mb],Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000.0,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000.0,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000.0,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000.0,Free,0,Teen,Art & Design,"June 8, 2018",,4.2
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000.0,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4


In [20]:
#count = 0
#for index, row in googleplay.iterrows():
#    for index2, row2 in googleplay.iterrows():
#        if row['App'] == row2['App'] and index != index2:
#            print((index, index2))
#            count += 1
#            googleplay_dedup = googleplay.drop(index2)
#print(count)

#the approach above is a nested loop over 10k+ rows (quadratic cost); fortunately pandas provides a dedicated method...

In [21]:
# Keep only the first row found for each app name (in place).
# This operation does not check whether rows sharing an app name carry different
# information or NaN values; they are most probably [a subset of] actual duplicates,
# but there is no guarantee that the most informative row is the one kept.
googleplay.drop_duplicates(subset='App', keep='first', inplace=True)
googleplay.shape

# To check for duplicates carrying different information: first drop only the complete
# duplicates and then print & inspect the remaining ones with a for loop, or drop
# duplicates on a specific "safe" subset of columns, restricting it progressively and
# checking the leftovers with for loops.

(9660, 13)

**6. For each category, compute the number of apps**

In [22]:
# Number of rows (apps) in each category.
# NOTE(review): despite the variable name, the count does not involve the size column.
num_apps_withsize = googleplay.groupby('Category').size()

print(num_apps_withsize)

Category
ART_AND_DESIGN           64
AUTO_AND_VEHICLES        85
BEAUTY                   53
BOOKS_AND_REFERENCE     222
BUSINESS                420
COMICS                   56
COMMUNICATION           315
DATING                  171
EDUCATION               119
ENTERTAINMENT           102
EVENTS                   64
FAMILY                 1832
FINANCE                 345
FOOD_AND_DRINK          112
GAME                    959
HEALTH_AND_FITNESS      288
HOUSE_AND_HOME           74
LIBRARIES_AND_DEMO       84
LIFESTYLE               369
MAPS_AND_NAVIGATION     131
MEDICAL                 395
NEWS_AND_MAGAZINES      254
PARENTING                60
PERSONALIZATION         376
PHOTOGRAPHY             281
PRODUCTIVITY            374
SHOPPING                202
SOCIAL                  239
SPORTS                  325
TOOLS                   827
TRAVEL_AND_LOCAL        219
VIDEO_PLAYERS           163
WEATHER                  79
dtype: int64


**7. For each category, compute the average rating**

In [23]:
# Mean of the 'Rating' column within each category (one value per category).
avg_rating = googleplay.groupby('Category')['Rating'].mean()
print(avg_rating)

Category
ART_AND_DESIGN         4.357377
AUTO_AND_VEHICLES      4.190411
BEAUTY                 4.278571
BOOKS_AND_REFERENCE    4.344970
BUSINESS               4.098479
COMICS                 4.181481
COMMUNICATION          4.121484
DATING                 3.970149
EDUCATION              4.364407
ENTERTAINMENT          4.135294
EVENTS                 4.435556
FAMILY                 4.179664
FINANCE                4.115563
FOOD_AND_DRINK         4.172340
GAME                   4.247368
HEALTH_AND_FITNESS     4.243033
HOUSE_AND_HOME         4.150000
LIBRARIES_AND_DEMO     4.178125
LIFESTYLE              4.093355
MAPS_AND_NAVIGATION    4.036441
MEDICAL                4.166552
NEWS_AND_MAGAZINES     4.121569
PARENTING              4.300000
PERSONALIZATION        4.332215
PHOTOGRAPHY            4.157414
PRODUCTIVITY           4.183389
SHOPPING               4.230000
SOCIAL                 4.247291
SPORTS                 4.216154
TOOLS                  4.039554
TRAVEL_AND_LOCAL       4.069519

**8.     Create two dataframes: one for the genres and one bridging apps and genres. So that, for instance, the app Pixel Draw - Number Art Coloring Book appears twice in the bridging table, once for Art & Design, once for Creativity**

In [24]:
# Distinct values of the 'Genres' column, each kept at the row index of its first
# occurrence. A value may still combine several genres separated by ';'.
genres_df = googleplay[['Genres']].drop_duplicates(keep='first')
genres_df

Unnamed: 0,Genres
0,Art & Design
1,Art & Design;Pretend Play
4,Art & Design;Creativity
23,Art & Design;Action & Adventure
49,Auto & Vehicles
98,Beauty
139,Books & Reference
187,Business
297,Comics
301,Comics;Creativity


In [25]:
# Scan the distinct genre combinations and split them on the semicolon ';'.
# The combinations are read straight from googleplay rather than from the
# genres_df built in the previous cell: this cell overwrites genres_df with a
# 'Genre' column, so reading genres_df['Genres'] would fail on a second run.
genres_dict = {}
for attr in googleplay['Genres'].dropna().drop_duplicates():
    for piece in attr.split(';'):
        # The dictionary decouples the genres as keys (first-seen order); the
        # stored value counts the distinct combinations containing the genre
        # and is not used afterwards.
        genres_dict[piece] = genres_dict.get(piece, 0) + 1

# list of the single genres, in order of first appearance
genres = list(genres_dict.keys())

genres_df = pd.DataFrame(genres, columns=['Genre'])

genres_df


Unnamed: 0,Genre
0,Art & Design
1,Pretend Play
2,Creativity
3,Action & Adventure
4,Auto & Vehicles
5,Beauty
6,Books & Reference
7,Business
8,Comics
9,Communication


In [26]:
# Bridge table between apps and genres: one row per (app, genre) pair, so that,
# for instance, the app "Pixel Draw - Number Art Coloring Book" appears twice,
# once for Art & Design and once for Creativity.

# Walk through the apps together with their 'Genres' entry and collect one
# tuple (app, genre) for every genre obtained by splitting the entry on ';'.
app_genre_pairs = []

for app, app_genres in zip(googleplay['App'], googleplay['Genres']):
    if isinstance(app_genres, str):
        app_genre_pairs.extend((app, genre) for genre in app_genres.split(';'))
    else:
        # Missing genre (NaN): keep the app in the table with the value as is.
        app_genre_pairs.append((app, app_genres))

# turn the list of tuples into a dataframe
app_genres_join = pd.DataFrame(app_genre_pairs, columns=['App', 'Genres'])
app_genres_join.head()

Unnamed: 0,App,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art & Design
1,Coloring book moana,Art & Design
2,Coloring book moana,Pretend Play
3,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art & Design
4,Sketch - Draw & Paint,Art & Design


**9.     For each genre, create a new column of the original dataframe. The new columns must have boolean values (True if the app has a given genre)**

In [27]:
# One boolean column per genre: True when the genre is listed in the app's
# 'Genres' entry, False otherwise (also when the entry is missing).
#
# The flags are computed per column and assigned to the dataframe itself.
# Writing into the rows yielded by DataFrame.iterrows() has no effect on the
# dataframe, because each yielded row is a copy.
# Plain column assignment appends the column on the first run and overwrites
# it afterwards, so the cell can be executed more than once.
genre_lists = googleplay['Genres'].str.split(';')

for genre in genres_dict.keys():
    googleplay[genre] = [
        isinstance(app_genres, list) and genre in app_genres
        for app_genres in genre_lists
    ]

In [28]:
googleplay.head()

Unnamed: 0,App,Category,Rating,Reviews,Size [Mb],Installs,Type,Price,Content Rating,Genres,...,Photography,Travel & Local,Tools,Personalization,Productivity,Parenting,Weather,News & Magazines,Maps & Navigation,Casino
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000.0,Free,0,Everyone,Art & Design,...,False,False,False,False,False,False,False,False,False,False
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000.0,Free,0,Everyone,Art & Design;Pretend Play,...,False,False,False,False,False,False,False,False,False,False
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000.0,Free,0,Everyone,Art & Design,...,False,False,False,False,False,False,False,False,False,False
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000.0,Free,0,Teen,Art & Design,...,False,False,False,False,False,False,False,False,False,False
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000.0,Free,0,Everyone,Art & Design;Creativity,...,False,False,False,False,False,False,False,False,False,False


**10.     For each genre, compute the average rating. What is the genre with highest average?**

In [29]:
# Left-join the (app, genre) pairs with the app data on the app name, so that
# every pair carries the full record of its app; the 'Genres' column of the
# pairs becomes 'Genres_x' and the one of googleplay becomes 'Genres_y'.
merger_df = app_genres_join.merge(googleplay, how='left', on='App', suffixes=('_x', '_y'))

# Group by the single genre of the pair and average the rating of each group.
avg_rating_bygenre = merger_df.groupby('Genres_x')['Rating'].mean().to_frame()
avg_rating_bygenre


Unnamed: 0_level_0,Rating
Genres_x,Unnamed: 1_level_1
Action,4.247697
Action & Adventure,4.288542
Adventure,4.191026
Arcade,4.277838
Art & Design,4.35
Auto & Vehicles,4.190411
Beauty,4.278571
Board,4.287719
Books & Reference,4.343275
Brain Games,4.357143


In [30]:
# idxmax(axis=0) gives, for each column, the index label of its largest value;
# here the label is the genre with the highest average rating.
idx = avg_rating_bygenre.idxmax(axis=0)
# Use the label to retrieve the complete row (genre together with its average).
top_rated_genre = avg_rating_bygenre.loc[idx]
top_rated_genre 

Unnamed: 0_level_0,Rating
Genres_x,Unnamed: 1_level_1
Events,4.435556


**11.     For each app, compute the approximate income, obtained as the product of the number of installs and the price.**

In [31]:
def floatizer(stringa):
    """Convert a price such as '$4.99' or '0' to a float.

    Parameters
    ----------
    stringa : object
        Raw entry of the 'Price' column.

    Returns
    -------
    float or object
        * for a string: the leading number (after optional '$' signs) as a
          float; when no valid number is found, 'uh-oh', the entry and its
          type are printed and NaN is returned, so that the column stays
          numeric;
        * an int is cast to float;
        * any other object (floats and NaN included) is returned unchanged.
    """
    if isinstance(stringa, str):
        # re.match anchors at the start of the string, like the '^' it replaces.
        match = re.match(r'\$*(\d+\.*\d*)', stringa)
        try:
            if match is None:
                raise ValueError('no price found')
            return float(match.group(1))
        except ValueError:
            print('uh-oh')
            print(stringa)
            print(type(stringa))
            return np.nan

    if type(stringa) == int:
        return float(stringa)

    return stringa

In [32]:
# Convert every entry of the 'Price' column to a float.
googleplay['Price'] = googleplay['Price'].apply(floatizer)

In [33]:
googleplay['Price'].isnull().any().any()

True

In [34]:
# Approximate income of each app: price times number of installs.
ApproxIncome = googleplay['Price'] * googleplay['Installs']
# Plain column assignment (instead of DataFrame.insert) appends the column at
# the end on the first run and overwrites it on later runs, so the cell can be
# executed again without raising "already exists".
googleplay['Approximate Income'] = ApproxIncome

In [35]:
googleplay[googleplay['Approximate Income'] > 0].head()

Unnamed: 0,App,Category,Rating,Reviews,Size [Mb],Installs,Type,Price,Content Rating,Genres,...,Travel & Local,Tools,Personalization,Productivity,Parenting,Weather,News & Magazines,Maps & Navigation,Casino,Approximate Income
234,TurboScan: scan documents and receipts in PDF,BUSINESS,4.7,11442,6.8,100000.0,Paid,4.99,Everyone,Business,...,False,False,False,False,False,False,False,False,False,499000.0
235,Tiny Scanner Pro: PDF Doc Scan,BUSINESS,4.8,10295,39.0,100000.0,Paid,4.99,Everyone,Business,...,False,False,False,False,False,False,False,False,False,499000.0
427,Puffin Browser Pro,COMMUNICATION,4.0,18247,,100000.0,Paid,3.99,Everyone,Communication,...,False,False,False,False,False,False,False,False,False,399000.0
476,"Moco+ - Chat, Meet People",DATING,4.2,1545,,10000.0,Paid,3.99,Mature 17+,Dating,...,False,False,False,False,False,False,False,False,False,39900.0
477,Calculator,DATING,2.6,57,6.2,1000.0,Paid,6.99,Everyone,Dating,...,False,False,False,False,False,False,False,False,False,6990.0


**12.  For each app, compute its minimum and maximum Sentiment_polarity**

In [36]:
googlereviews = pd.read_csv('googleplaystore_user_reviews.csv')
googlereviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [37]:
# Right join on the app name: every review row is kept and the data of the
# corresponding app from googleplay is attached to it.
review_merge = pd.merge(googleplay, googlereviews, on='App', how='right')

In [38]:
# Group the polarity of the reviews by app once, then take both extremes.
sentpol_by_app = review_merge.groupby('App')['Sentiment_Polarity']
app_max_sentpol = sentpol_by_app.max()
app_min_sentpol = sentpol_by_app.min()

print('Max Sentiment Polarity for each app:\n', app_max_sentpol)
print('Min Sentiment Polarity for each app:\n', app_min_sentpol)

Max Sentiment Polarity for each app:
 App
10 Best Foods for You                                 1.000000
104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室                      0.910000
11st                                                  1.000000
1800 Contacts - Lens Store                            0.838542
1LINE – One Line with One Touch                       1.000000
2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif      1.000000
21-Day Meditation Experience                          0.587500
2Date Dating App, Love and matching                   1.000000
2GIS: directory & navigator                           1.000000
2RedBeans                                             1.000000
2ndLine - Second Phone Number                         1.000000
30 Day Fitness Challenge - Workout at Home            0.900000
365Scores - Live Scores                               1.000000
3D Blue Glass Water Keyboard Theme                         NaN
3D Color Pixel by Number - Sandbox Art Coloring            NaN
3D Live Neon 