In [1]:
# import pandas library
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('googleplaystore.csv')

In [3]:
# Note that some column titles consist of two seperate words. For further analysis it's more convenient
# to have column names which are single words.
df.columns = [s.strip().replace(' ', '_') for s in df.columns]
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres', 'Last_Updated', 'Current_Ver',
       'Android_Ver'],
      dtype='object')

In [4]:
# Define newdf and assign records without null values.
newdf = df.dropna()

In [5]:
# Convert to string dtype from object.
newdf['Size'] = newdf.Size.astype('string')
newdf.Size

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf['Size'] = newdf.Size.astype('string')


0                       19M
1                       14M
2                      8.7M
3                       25M
4                      2.8M
                ...        
10834                  2.6M
10836                   53M
10837                  3.6M
10839    Varies with device
10840                   19M
Name: Size, Length: 9360, dtype: string

In [6]:
# Capture indices with value 'Varies with device' (Vwd)
index_with_Vwd = newdf[newdf['Size'].str.contains('Varies')].index # This works
index_with_Vwd

Int64Index([   37,    42,    52,    67,    68,    73,    85,    88,    89,
               92,
            ...
            10647, 10679, 10681, 10707, 10712, 10713, 10725, 10765, 10826,
            10839],
           dtype='int64', length=1637)

In [7]:
# Drop records with indices in index_with_Vwd
newdf.drop(index=index_with_Vwd, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [8]:
# Reset Index: Indices no longer sequential
newdf = newdf.reset_index(drop=True)

In [9]:
# To verify indices being sequential, view index of dropped key, reset index and view results of previously dropped key. Results should be different.
newdf.loc[37]

App               Drawing Clothes Fashion Ideas
Category                         ART_AND_DESIGN
Rating                                      4.2
Reviews                                     117
Size                                        15M
Installs                                10,000+
Type                                       Free
Price                                         0
Content_Rating                         Everyone
Genres                             Art & Design
Last_Updated                      July 20, 2018
Current_Ver                               2.0.1
Android_Ver                        4.0.3 and up
Name: 37, dtype: object

In [10]:
# Find index with of values containing trailing M
index_with_M = newdf[newdf['Size'].str.contains('M')].index # This works


In [11]:
index_with_M

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            7711, 7712, 7713, 7714, 7715, 7716, 7719, 7720, 7721, 7722],
           dtype='int64', length=7466)

In [12]:
# Strip trailing M using index_with_M values
def drop_M(indices):
        newdf['Size'] = [indices.strip().replace('M', '') for indices in newdf['Size']]
        return newdf.Size

In [13]:
#call method drop_M passing variable index_with_M
drop_M(index_with_M)

0         19
1         14
2        8.7
3         25
4        2.8
        ... 
7718    619k
7719     2.6
7720      53
7721     3.6
7722      19
Name: Size, Length: 7723, dtype: object

In [14]:
# Find index with of values containing trailing K
index_with_k = newdf[newdf['Size'].str.contains('k')].index # This works
index_with_k

Int64Index([  53,  158,  273,  308,  314,  538,  543,  698, 1030, 1032,
            ...
            7616, 7617, 7618, 7626, 7628, 7652, 7667, 7672, 7717, 7718],
           dtype='int64', length=257)

In [15]:
# Strip trailing k from Size using index_with_k
def drop_k(indices):
        newdf['Size'] = [indices.strip().replace('k', '') for indices in newdf['Size']]
        return newdf.Size

In [16]:
#call method drop_k passing variable index_with_k
drop_k(index_with_k)

0        19
1        14
2       8.7
3        25
4       2.8
       ... 
7718    619
7719    2.6
7720     53
7721    3.6
7722     19
Name: Size, Length: 7723, dtype: object

In [17]:
# verify dropped indices; previous length was 9360
newdf

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Current_Ver,Android_Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7718,Chemin (fr),BOOKS_AND_REFERENCE,4.8,44,619,"1,000+",Free,0,Everyone,Books & Reference,"March 23, 2014",0.8,2.2 and up
7719,FR Calculator,FAMILY,4.0,7,2.6,500+,Free,0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
7720,Sya9a Maroc - FR,FAMILY,4.5,38,53,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
7721,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up


In [18]:
# Convert column 'Size' to float
newdf['Size'] = newdf['Size'].astype('float')
newdf.Size

0        19.0
1        14.0
2         8.7
3        25.0
4         2.8
        ...  
7718    619.0
7719      2.6
7720     53.0
7721      3.6
7722     19.0
Name: Size, Length: 7723, dtype: float64

In [19]:
# multiply values in index_with_M by 2
def multiply_by_1000(indices):
    newdf['Size'] = [(indices * 1000) for indices in newdf.Size]
    return newdf.Size

In [20]:
multiply_by_1000(index_with_M)

0        19000.0
1        14000.0
2         8700.0
3        25000.0
4         2800.0
          ...   
7718    619000.0
7719      2600.0
7720     53000.0
7721      3600.0
7722     19000.0
Name: Size, Length: 7723, dtype: float64

In [21]:
# view data in excel
newdf.to_excel('check-values.xlsx')

In [22]:
newdf['Installs_percentiles'] = newdf.Installs.rank(pct=True)
newdf['Installs_percentiles']

0       0.330765
1       0.964457
2       0.789849
3       0.897708
4       0.606630
          ...   
7718    0.045060
7719    0.919979
7720    0.728085
7721    0.519876
7722    0.446847
Name: Installs_percentiles, Length: 7723, dtype: float64

In [23]:
# Indice that fall within 10th percentile
tenth_percentile = newdf[newdf.Installs_percentiles <= 0.10].index
tenth_percentile

Int64Index([ 246,  320,  427,  429,  436,  437,  449,  453,  466,  472,
            ...
            7654, 7659, 7662, 7671, 7677, 7682, 7700, 7707, 7712, 7718],
           dtype='int64', length=692)

In [24]:
# Indice that fall within 25th percentile
twentyfitfth_percentile = newdf[newdf.Installs_percentiles <= 0.25].index
twentyfitfth_percentile

Int64Index([   7,    8,   10,   11,   46,   57,   69,   71,   73,   76,
            ...
            7682, 7693, 7696, 7697, 7700, 7702, 7705, 7707, 7712, 7718],
           dtype='int64', length=1993)

In [25]:
# Indice that fall within 50th percentile
fiftieth_percentile = newdf[newdf.Installs_percentiles <= 0.50].index
fiftieth_percentile

Int64Index([   0,    7,    8,    9,   10,   11,   12,   16,   18,   23,
            ...
            7700, 7701, 7702, 7705, 7707, 7712, 7714, 7715, 7718, 7722],
           dtype='int64', length=3863)

In [26]:
# Indice that fall within 70th percentile
seventieth_percentile = newdf[newdf.Installs_percentiles <= 0.70].index
seventieth_percentile

Int64Index([   0,    4,    7,    8,    9,   10,   11,   12,   13,   14,
            ...
            7707, 7710, 7712, 7714, 7715, 7716, 7717, 7718, 7721, 7722],
           dtype='int64', length=5404)

In [27]:
# Indice that fall within 90th percentile
ninetieth_percentile = newdf[newdf.Installs_percentiles <= 0.90].index
ninetieth_percentile

Int64Index([   0,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            7712, 7713, 7714, 7715, 7716, 7717, 7718, 7720, 7721, 7722],
           dtype='int64', length=7006)

In [28]:
# Indice that fall within 95th percentile
ninetyfifth_percentile = newdf[newdf.Installs_percentiles <= 0.95].index
ninetyfifth_percentile

Int64Index([   0,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            7713, 7714, 7715, 7716, 7717, 7718, 7719, 7720, 7721, 7722],
           dtype='int64', length=7203)

In [29]:
# Indice that fall within 99th percentile
ninetyninth_percentile = newdf[newdf.Installs_percentiles <= 0.99].index
ninetyninth_percentile

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            7713, 7714, 7715, 7716, 7717, 7718, 7719, 7720, 7721, 7722],
           dtype='int64', length=7693)

In [None]:
# Find average installs of each percentile using the indices found in steps above
# use newdf with mean on Installs colum using specific indices.
# newdf.Installs.index.mean
tenth_percentile_average

In [34]:
newdf.Installs[[ninetyninth_percentile[7692]]]

7722    10,000,000+
Name: Installs, dtype: object

In [None]:
ninetyninth_percentile

In [None]:
# Function to get the average of column by passing in specific indices.
# Use For-loop
def average_of_installs(indices):
    temp_var = newdf.Installs[indices[0]]
    for i in indices:
        temp_var += newdf.Installs[[indices[i]]]
    tenth_percentile_average = temp_var / indices.size
    return tenth_percentile_average