## Exploring Google Play Store Dataset

Cleaning the Dataset

In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('https://raw.githubusercontent.com/krishnaik06/playstore-Dataset/main/googleplaystore.csv')

print("Columns:", df.columns)

Columns: Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')


In [17]:
df.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [22]:
print("Shape:", df.shape)

df.info()
# Here we see only rating is float64 type and the rest are object dtypes

Shape: (10841, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [23]:
# Rating statistics
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [24]:
# We see missing values mostly in rating column but not only
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [4]:
# Check all values are numeric - count the difference between the dataset length and the sum of numeric values in the column

# Starting with Reviews Column which has a value 3.0M in one of the rows 
column_name = 'Reviews'
if df[column_name].dtype == 'object':
    non_numeric_values_count = df.shape[0] - df[column_name].str.isnumeric().sum()
    print("Number of non numeric values in 'Reviews' column before:", len(df[~df[column_name].str.isnumeric()]))
    if non_numeric_values_count > 0:
        # 3.0M -> 3000000
        df[column_name] = df[column_name].str.replace('.0M', '000')
        print("Number of non numeric values in 'Reviews' column after:", len(df[~df[column_name].str.isnumeric()]))
    else:
        print("all values are numeric")
    # Convert to dtype int64
    df[column_name]=df[column_name].astype(int)


# df['Reviews']

Number of non numeric values in 'Reviews' column before: 1
Number of non numeric values in 'Reviews' column after: 0


In [5]:
# Handle size

column_name = 'Size'
df[column_name].value_counts()
# Has different values of numbers with M (megabyte), k (kilobyte) or Varies with device, 1,000+
if df[column_name].dtype == 'object':
    df[column_name] = df[column_name].str.replace('k', '')
    df[column_name] = df[column_name].str.replace('M', '000')
    df[column_name] = df[column_name].str.replace(',', '')
    df[column_name] = df[column_name].str.replace('+', '')
    df[column_name] = df[column_name].replace('Varies with device', np.nan)
    df[column_name]=df[column_name].astype(float)


# 1000 could be more than 1000 (1,000+), we need to decide whether to leave it as is or to drop it (just one row)

In [6]:
# Clean Price and Installs columns
characters_to_remove = [',', '+', '$']
columns_to_clean = ['Price', 'Installs']


if all(df[col].dtype == 'object' for col in columns_to_clean):
    for character_to_remove in characters_to_remove:
        for column_to_clean in columns_to_clean:
            df[column_to_clean] = df[column_to_clean].str.replace(character_to_remove, '')
            

    df['Installs'] = df['Installs'].str.replace('Free', '0')
    df['Price'] = df['Price'].str.replace('Everyone', '0')

    df['Installs']=df['Installs'].astype(int)
    df['Price']=df['Price'].astype(float)

In [308]:
df['Installs']

0           10000
1          500000
2         5000000
3        50000000
4          100000
           ...   
10836        5000
10837         100
10838        1000
10839        1000
10840    10000000
Name: Installs, Length: 10841, dtype: int64

In [7]:
# Handle Last Updated Feature

# Found an invalid date = 1.0.2019
# invalid_dates = df[~df['Last Updated'].str.match(r'^\w+ \d{1,2}, \d{4}$', na=False)]
# print("invalid_dates:", invalid_dates)

# Replace invalid dates with NaT
df['Last Updated'] = pd.to_datetime(df['Last Updated'], format='%B %d, %Y', errors='coerce')

# Drop rows with NaT
df = df.dropna(subset=['Last Updated'])

# Reset index
df.reset_index(drop=True, inplace=True)

# Create 3 new integer features from Last Updated - day, month, year
df['last_updated_day'] = df['Last Updated'].dt.day
df['last_updated_month'] = df['Last Updated'].dt.month
df['last_updated_year'] = df['Last Updated'].dt.year

# Removes the column "Last Updated"
df = df.drop(columns=['Last Updated'])

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Current Ver,Android Ver,last_updated_day,last_updated_month,last_updated_year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,10000,Free,0.0,Everyone,Art & Design,1.0.0,4.0.3 and up,7,1,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2.0.0,4.0.3 and up,15,1,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,1.2.4,4.0.3 and up,1,8,2018


In [8]:
# Save the cleaned dataset to csv file
df.to_csv('google_cleaned.csv', index=False)