In [1]:
import numpy as np
import pandas as pd

# Loading Dataset

In [2]:
def load_dataset(path='googleplaystore.csv'):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'googleplaystore.csv'
        Location of the CSV file, handed straight to ``pd.read_csv``.
        The default is the file name the notebook originally hard-coded,
        so calling ``load_dataset()`` with no argument behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with whatever dtypes ``pd.read_csv`` infers.
    """
    return pd.read_csv(path)

# Load the raw table once; later cells read from `apps_data`.
apps_data = load_dataset()
# Fixed random_state so the preview shows the same rows on every
# Restart & Run All instead of a different random draw each time.
apps_data.sample(10, random_state=42)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
3720,"Free TV Shows App:News, TV Series, Episode, Mo...",NEWS_AND_MAGAZINES,4.6,29706,11M,"1,000,000+",Free,0,Teen,News & Magazines,"August 2, 2018",2.71,4.1 and up
8928,Proverbes du monde,FAMILY,,38,2.8M,"10,000+",Free,0,Everyone,Entertainment,"March 12, 2018",1.1.2.0,1.6 and up
6059,Millionaire Quiz Free: Be Rich,FAMILY,3.8,66033,6.8M,"10,000,000+",Free,0,Everyone,Casual,"January 1, 2018",2.5.1,4.0 and up
3575,Baby Names,PARENTING,4.5,86,2.8M,"10,000+",Free,0,Everyone,Parenting,"August 3, 2018",1.7,4.0.3 and up
3017,CBS Sports Fantasy,SPORTS,4.0,43611,27M,"1,000,000+",Free,0,Everyone,Sports,"June 27, 2018",3.17.0+3,5.0 and up
1706,Plants vs. Zombies FREE,GAME,4.4,4066980,69M,"100,000,000+",Free,0,Everyone 10+,Strategy,"July 6, 2018",2.2.00,4.1 and up
7910,CU SoCal Mobile Banking,FINANCE,4.0,253,25M,"10,000+",Free,0,Everyone,Finance,"May 2, 2018",4.6.68,4.0.3 and up
2048,ABC Kids - Tracing & Phonics,FAMILY,4.5,36606,Varies with device,"10,000,000+",Free,0,Everyone,Educational;Education,"June 22, 2018",Varies with device,4.0 and up
4980,RAM Cleanup Ad-Free Option,TOOLS,4.4,457,1.1M,"10,000+",Paid,$2.99,Everyone,Tools,"July 26, 2016",3.0,4.1 and up
8095,Alpha Bank CY,FINANCE,4.7,39,20M,"1,000+",Free,0,Everyone,Finance,"March 20, 2018",1.11,4.3 and up


# Printing a summary of the dataset

In [3]:
def print_summarize_dataset(dataset):
    """Print a plain-text overview of a DataFrame to stdout.

    The report contains, in order: the shape, the first 10 rows, up to 10
    randomly sampled rows, the ``describe()`` statistics, and the
    ``info()`` report.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The frame to summarize. It is only read, never modified.

    Returns
    -------
    None
    """
    print("Dataset dimension:")
    print(dataset.shape)
    print("First 10 rows of dataset:")
    print(dataset.head(10))
    print("10 random rows from the data set:")
    # sample() raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the frame length.
    print(dataset.sample(min(10, len(dataset))))
    print("Statistical summary:")
    print(dataset.describe())
    print("Additional informations:")
    # info() writes its report to stdout itself and returns None; wrapping it
    # in print() used to emit a stray "None" line after the report.
    dataset.info()

# Print the overview (shape, head, random sample, describe, info) for the loaded table.
print_summarize_dataset(apps_data)

Dataset dimension:
(10841, 13)
First 10 rows of dataset:
                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   
5                         Paper flowers instructions  ART_AND_DESIGN     4.4   
6            Smoke Effect Photo Maker - Smoke Editor  ART_AND_DESIGN     3.8   
7                                   Infinite Painter  ART_AND_DESIGN     4.1   
8                               Garden Coloring Book  ART_AND_DESIGN     4.4   
9                      Kids Paint Free - Drawing Fun  ART_AND_DESIGN     4.7   

  Reviews  Size     Installs  Type Price Content Rating  \
0  

# Cleaning dataset

In [4]:
# Count the missing (null/NaN) values in each column to decide how to clean them.
apps_data.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [5]:
def clean_dataset(dataset):
  # We fill the NaN values in the ranking column with the previous values.
  dataset['Rating'].fillna(method='ffill', inplace=True)
  # We discard NaN values in the Content Rating column.
  dataset.dropna(subset=['Content Rating'], inplace=True)
  # The type column only has 1 NaN value. We fill it with the most popular "Free" values.
  dataset['Type'].fillna(value = 'Free', inplace = True)
  # We discard the unnecessary columns
  dataset.drop(columns=['Last Updated', 'Current Ver', 'Android Ver'], inplace=True)
  # Correct the data type of numeric columns
  dataset['Reviews'] = dataset['Reviews'].astype(int)
    
  dataset['Size'] = dataset['Size'].map(lambda x: x.strip('+').replace(',', ''))
  dataset['Size'] = dataset['Size'].str.replace('M', 'e+6')
  dataset['Size'] = dataset['Size'].str.replace('k', 'e+3')
  dataset['Size'] = dataset['Size'].replace('Varies with device', np.NaN)
  dataset.dropna(subset = ['Size'], inplace=True)