In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [21]:
df = pd.read_csv('googleplaystore.csv')
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [23]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

## Insights and Observations
#The dataset have missing values in several columns like Rating, Type, Content Rating, Genres, Last Updated, Current Ver and Android Ver.
#The dataset contains a total of 10841 entries and 13 columns.

In [24]:
#To begin with we can conver the 'Reviews' column to numeric datatype

#df['Reviews'].astype('int32') 

#we got an error because there is a value '3.0M' in the column which cannot be converted to integer. So we will have to take care of it first.

df[~df['Reviews'].str.isnumeric()]

#We need to clean this record.

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


## Data Cleaning

In [25]:
#To begin with will create a copy of Dataframe
df1 = df.copy()
#df1.loc[10472,'Reviews'] = '3000000' #Replacing '3.0M' with '3000000'
#df1['Reviews'] = df1['Reviews'].astype('int32') #Now converting to int32
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [26]:
#Its better to drop the row/record as it has multiple troublesome values -- kind of an outlier
df1.drop(df1.index[10472], inplace=True)
#df1.isnull().sum()
df1[~df1['Reviews'].str.isnumeric()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver


In [28]:
df1['Reviews'] = df1['Reviews'].astype('int')

In [29]:
df1['Size'].unique()

array(['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '29M', '33M', '3.1M',
       '28M', '12M', '20M', '21M', '37M', '2.7M', '5.5M', '17M', '39M',
       '31M', '4.2M', '7.0M', '23M', '6.0M', '6.1M', '4.6M', '9.2M',
       '5.2M', '11M', '24M', 'Varies with device', '9.4M', '15M', '10M',
       '1.2M', '26M', '8.0M', '7.9M', '56M', '57M', '35M', '54M', '201k',
       '3.6M', '5.7M', '8.6M', '2.4M', '27M', '2.5M', '16M', '3.4M',
       '8.9M', '3.9M', '2.9M', '38M', '32M', '5.4M', '18M', '1.1M',
       '2.2M', '4.5M', '9.8M', '52M', '9.0M', '6.7M', '30M', '2.6M',
       '7.1M', '3.7M', '22M', '7.4M', '6.4M', '3.2M', '8.2M', '9.9M',
       '4.9M', '9.5M', '5.0M', '5.9M', '13M', '73M', '6.8M', '3.5M',
       '4.0M', '2.3M', '7.2M', '2.1M', '42M', '7.3M', '9.1M', '55M',
       '23k', '6.5M', '1.5M', '7.5M', '51M', '41M', '48M', '8.5M', '46M',
       '8.3M', '4.3M', '4.7M', '3.3M', '40M', '7.8M', '8.8M', '6.6M',
       '5.1M', '61M', '66M', '79k', '8.4M', '118k', '44M', '695k', '1.6M',
     

In [34]:
#Now we take care of the 'Size' column which has values like '19M', '14k' etc. We will convert them to numeric values in bytes.
#Basically we will just convert Million and keep all the values to k
df1['Size'] = df1['Size'].str.replace('M','000')
df1['Size'] = df1['Size'].str.replace('.','')
df1['Size'] = df1['Size'].str.replace('k','')

#To check if there are any other values in Size

print(df1['Size'].unique())
#There is a value called 'Varies with device'

df1.isnull().sum()

['19000' '14000' '87000' '25000' '28000' '56000' '29000' '33000' '31000'
 '12000' '20000' '21000' '37000' '27000' '55000' '17000' '39000' '42000'
 '70000' '23000' '60000' '61000' '46000' '92000' '52000' '11000' '24000'
 'Varies with device' '94000' '15000' '10000' '26000' '80000' '79000'
 '57000' '35000' '54000' '201' '36000' '86000' '16000' '34000' '89000'
 '38000' '32000' '18000' '22000' '45000' '98000' '90000' '67000' '30000'
 '71000' '74000' '64000' '82000' '99000' '49000' '95000' '50000' '59000'
 '13000' '73000' '68000' '40000' '72000' '91000' '23' '65000' '75000'
 '51000' '41000' '48000' '85000' '83000' '43000' '47000' '78000' '88000'
 '66000' '79' '84000' '118' '44000' '695' '62000' '18' '53000' '58000'
 '96000' '63000' '77000' '69000' '93000' '100000' '81000' '97000' '556'
 '526' '76000' '334' '232' '624' '85' '41' '292' '11' '704' '862' '899'
 '378' '266' '375' '975' '980' '696' '544' '525' '920' '779' '853' '720'
 '713' '772' '318' '58' '241' '196' '857' '51' '953' '865' '251

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          8
Android Ver          2
dtype: int64

In [35]:
df1['Size'] = df1['Size'].replace('Varies with device',np.nan)

#Here we use this because I was getting a Valueerror when converting the 'size' column from String to Float

df1['Size'] = pd.to_numeric(df1['Size'], errors='coerce')

In [36]:
df1.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [37]:
#Now we take care of 'Installs' and 'Price' column

print(df1['Installs'].unique())
print(df1['Price'].unique())

['10,000+' '500,000+' '5,000,000+' '50,000,000+' '100,000+' '50,000+'
 '1,000,000+' '10,000,000+' '5,000+' '100,000,000+' '1,000,000,000+'
 '1,000+' '500,000,000+' '50+' '100+' '500+' '10+' '1+' '5+' '0+' '0']
['0' '$4.99' '$3.99' '$6.99' '$1.49' '$2.99' '$7.99' '$5.99' '$3.49'
 '$1.99' '$9.99' '$7.49' '$0.99' '$9.00' '$5.49' '$10.00' '$24.99'
 '$11.99' '$79.99' '$16.99' '$14.99' '$1.00' '$29.99' '$12.99' '$2.49'
 '$10.99' '$1.50' '$19.99' '$15.99' '$33.99' '$74.99' '$39.99' '$3.95'
 '$4.49' '$1.70' '$8.99' '$2.00' '$3.88' '$25.99' '$399.99' '$17.99'
 '$400.00' '$3.02' '$1.76' '$4.84' '$4.77' '$1.61' '$2.50' '$1.59' '$6.49'
 '$1.29' '$5.00' '$13.99' '$299.99' '$379.99' '$37.99' '$18.99' '$389.99'
 '$19.90' '$8.49' '$1.75' '$14.00' '$4.85' '$46.99' '$109.99' '$154.99'
 '$3.08' '$2.59' '$4.80' '$1.96' '$19.40' '$3.90' '$4.59' '$15.46' '$3.04'
 '$4.29' '$2.60' '$3.28' '$4.60' '$28.99' '$2.95' '$2.90' '$1.97'
 '$200.00' '$89.99' '$2.56' '$30.99' '$3.61' '$394.99' '$1.26' '$1.20'
 '$1.04']


In [38]:
#We have to remove '+', '$', ',', '.' from Datapoints

char_to_remove = ['+','$',',','.']
columns_to_remove_from = ['Installs','Price']

for item in char_to_remove:
    for cols in columns_to_remove_from:
        df1[cols] = df1[cols].str.replace(item,'')

In [39]:
df1['Installs'] = df1['Installs'].astype('int')
df1['Price'] = df1['Price'].astype('float')
df1.info()


<class 'pandas.core.frame.DataFrame'>
Index: 10840 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10840 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9366 non-null   float64
 3   Reviews         10840 non-null  int64  
 4   Size            9145 non-null   float64
 5   Installs        10840 non-null  int64  
 6   Type            10839 non-null  object 
 7   Price           10840 non-null  float64
 8   Content Rating  10840 non-null  object 
 9   Genres          10840 non-null  object 
 10  Last Updated    10840 non-null  object 
 11  Current Ver     10832 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(3), int64(2), object(8)
memory usage: 1.2+ MB


In [40]:
#Now we will clean 'Last Updated'

df1['Last Updated'].unique()

#We will convert it to last updated

df1['Last Updated'] = pd.to_datetime(df1['Last Updated'])
df1['day']=df1['Last Updated'].dt.day
df1['month']=df1['Last Updated'].dt.month
df1['year']=df1['Last Updated'].dt.year


In [41]:
df1.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,day,month,year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,7,1,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,15,1,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,87000.0,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,1,8,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,8,6,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,28000.0,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,20,6,2018


In [45]:
##Now the data is almost clean, lets save the cleaned data to csv
df1.info()
df1.to_csv('cleaned_data/google_playstore_cleaned1.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 10840 entries, 0 to 10840
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             10840 non-null  object        
 1   Category        10840 non-null  object        
 2   Rating          9366 non-null   float64       
 3   Reviews         10840 non-null  int64         
 4   Size            9145 non-null   float64       
 5   Installs        10840 non-null  int64         
 6   Type            10839 non-null  object        
 7   Price           10840 non-null  float64       
 8   Content Rating  10840 non-null  object        
 9   Genres          10840 non-null  object        
 10  Last Updated    10840 non-null  datetime64[ns]
 11  Current Ver     10832 non-null  object        
 12  Android Ver     10838 non-null  object        
 13  day             10840 non-null  int32         
 14  month           10840 non-null  int32         
 15  year   

In [46]:
df2 = df1.copy()
df2.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,day,month,year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,7,1,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,15,1,2018


In [47]:
##We have a cleaned copy of csv file but we lets use the same df1
df1.columns

#Lets start by 'APP' columns and check if the column has duplicated values

df2[df2.duplicated('App')].shape

(1181, 16)

In [48]:
#This indicates that the dataset has many duplicate records

##So Lets drop all the duplicated and keep one record

df2 = df2.drop_duplicates(subset=['App'],keep='first')
df2[df2.duplicated('App')].shape

(0, 16)

In [49]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9659 entries, 0 to 10840
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             9659 non-null   object        
 1   Category        9659 non-null   object        
 2   Rating          8196 non-null   float64       
 3   Reviews         9659 non-null   int64         
 4   Size            8432 non-null   float64       
 5   Installs        9659 non-null   int64         
 6   Type            9658 non-null   object        
 7   Price           9659 non-null   float64       
 8   Content Rating  9659 non-null   object        
 9   Genres          9659 non-null   object        
 10  Last Updated    9659 non-null   datetime64[ns]
 11  Current Ver     9651 non-null   object        
 12  Android Ver     9657 non-null   object        
 13  day             9659 non-null   int32         
 14  month           9659 non-null   int32         
 15  year    

## EXPLORE DATA

In [50]:
numeric_values = [feature for feature in df2.columns if df2[feature].dtype != 'O']
categorical_values = [feature for feature in df2.columns if df2[feature].dtype == 'O']

print('We have {} numerical values : {}'.format(len(numeric_values),numeric_values))
print('We have {} categorical values : {}'.format(len(categorical_values),categorical_values))

We have 9 numerical values : ['Rating', 'Reviews', 'Size', 'Installs', 'Price', 'Last Updated', 'day', 'month', 'year']
We have 7 categorical values : ['App', 'Category', 'Type', 'Content Rating', 'Genres', 'Current Ver', 'Android Ver']


In [None]:
## Now we will be focusing on Categorical columns

Feature Information    

1.'App' - Name of the app
2.'Category' - Category under which the app is
3.'Rating' - Apps rating
4.'Reviews' - number of reviews
5.'Size' - size of the all
6.'Installs' - number of intalls
7.'Type' - if the app is free paid
8.'Price' - app price(0 if free)
9.'Content Rating' - appropriate target audience
10.'Genres' - app genre
11.'Last Updated' - app last updated date
12.'Current Ver' - current version of app
13.'Android Ver' - min adndriod version to run app

In [65]:
#Proportion of count data on Categorical columns

for col in categorical_values:
    print(df2[col].value_counts(normalize=True)*100) #to see the each count in percentage
    print('------------------------------------------')

App
iHoroscope - 2018 Daily Horoscope & Astrology         0.010353
Photo Editor & Candy Camera & Grid & ScrapBook        0.010353
Coloring book moana                                   0.010353
U Launcher Lite – FREE Live Cool Themes, Hide Apps    0.010353
Sketch - Draw & Paint                                 0.010353
                                                        ...   
Learn To Draw Kawaii Characters                       0.010353
3D Color Pixel by Number - Sandbox Art Coloring       0.010353
Mandala Coloring Book                                 0.010353
Tattoo Name On My Photo Editor                        0.010353
Name Art Photo Editor - Focus n Filters               0.010353
Name: proportion, Length: 9659, dtype: float64
------------------------------------------
Category
FAMILY                 18.966767
GAME                    9.928564
TOOLS                   8.561963
BUSINESS                4.348276
MEDICAL                 4.089450
PERSONALIZATION         3.892743
PRODUC

In [96]:
#Proportion of count data in Numeric columns
plt.figure(figsize=(20,20))
plt.suptitle('Univariate analysis of Numerical feature', fontsize=20, fontweight='bold',alpha=0.9, y=1.0)

for i in range(0,len(numeric_values)):
    plt.subplot(5,3,i+1) 
    sns.kdeplot(x=df2[numeric_values[i]], shade=True, color=)
    plt.xlabel(numeric_values[i])
    plt.tight_layout()

SyntaxError: invalid syntax (4153886543.py, line 7)