In [None]:
#Install Pandas (For CI Success) - Have to install Locally
%pip install pandas

In [None]:
#Install Matplotlib (For CI Success) - Have to install Locally
%pip install matplotlib

In [1]:
#importing libraries and creating a data frame of the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Load the Google Play Store dataset from the CSV file into the working data frame
df = pd.read_csv('googleplaystore.csv')

In [None]:
#Preview the first rows of the freshly loaded data frame
df.head()

In [None]:
#Summarise the data frame: column names, non-null counts and dtypes
df.info()

In [None]:
#Report the dimensions of the data set as (rows, columns)
shape_info = df.shape
print("The Shape of data in (sample, features): ", shape_info)
#Report how many columns there are of each data type
dtype_counts = df.dtypes.value_counts()
print("Data Types: \n", dtype_counts)

In [None]:
#Cleaning starts with the Size column:
#tally how often each distinct raw value occurs
df['Size'].value_counts()

In [None]:
#Rewrite the unit suffixes as exponents so the strings can later be parsed as numbers:
#every 'M' becomes 'e+6' and every 'k' becomes 'e+3'
df['Size'] = df['Size'].str.replace('M','e+6').str.replace('k','e+3')
df['Size'].head()

In [None]:
#Helper used to check whether an item from Size is convertible to numeric data or not
def is_item_convertible_to_numeric(item):
    """Return True when ``float(item)`` succeeds and False otherwise.

    Parameters
    ----------
    item : object
        The value to probe, e.g. a string such as ``'19e+6'``.

    Returns
    -------
    bool
        True if ``float(item)`` does not raise; False if it raises
        ``ValueError`` (unparseable string) or ``TypeError`` (a type that
        ``float`` does not accept at all, such as ``None``).
    """
    try:
        float(item)
    except (ValueError, TypeError):
        #TypeError covers non-string, non-numeric items such as None,
        #which would otherwise abort the whole .apply() call
        return False
    return True

In [None]:
#Build a boolean Series that is True wherever the Size entry can be parsed as a float
df_temp = df['Size'].apply(is_item_convertible_to_numeric)
df_temp.head()

In [None]:
#List the Size values that could NOT be parsed as a float, with their frequencies
df.loc[~df_temp, 'Size'].value_counts()

In [None]:
#Replacing "Varies with device" by NaN so that these entries count as missing values
df['Size'] = df['Size'].replace(to_replace='Varies with device', value=np.nan)

In [None]:
#Swap the remaining non-numeric entry '1,000+' for the number 1000 so the column can be converted
df['Size'] = df['Size'].replace(to_replace='1,000+', value=1000)

In [None]:
#The Size column is clean now: parse it into a numeric dtype
df['Size'] = pd.to_numeric(df['Size'])

In [None]:
#Plotting histogram for Column Size (Frequency)
df.hist(column='Size')
#Label the axes through the pyplot interface, which acts on the current axes
plt.xlabel('Size')
plt.ylabel('Frequency')

In [None]:
#Replace the spaces in four column names with underscores
renamed_columns = {
    'Content Rating': 'Content_Rating',
    'Last Updated': 'Last_Updated',
    'Current Ver': 'Current_Ver',
    'Android Ver': 'Android_Ver',
}
df.rename(columns=renamed_columns, inplace=True)
df.head()

In [None]:
#List the distinct values of the Category column
df['Category'].unique()

In [None]:
#Inspect the rows whose Category is '1.9' (presumably a malformed record - verify in the output)
df.loc[df['Category'] == '1.9']

In [None]:
#Finding the index of category 1.9 and dropping that row.
#The row is located by value instead of by a hard-coded index label, so the cell
#does not raise KeyError when it is run again after the row has been removed.
bad_category_index = df.index[df['Category'] == '1.9']
df.drop(bad_category_index, axis=0, inplace=True)
#Should now display an empty data frame
df[df.Category=='1.9']

In [None]:
#Remove, in place, every row whose Rating is missing
df.dropna(subset=['Rating'], inplace=True)
#Show the rows that still have a null Rating
df.loc[df['Rating'].isnull()]

In [None]:
#List the distinct raw values of the Installs column
df['Installs'].unique()

In [None]:
#Removing the '+' and ',' characters from the values and converting the column to int.
#regex=False makes both replacements literal: '+' is a regex metacharacter and the
#default of the regex argument differs between pandas versions.
installs_text = df['Installs'].str.replace('+', '', regex=False)
installs_text = installs_text.str.replace(',', '', regex=False)
df['Installs'] = installs_text.astype('int')
df['Installs'].unique()

In [None]:
#List the distinct raw values of the Price column
df['Price'].unique()

In [None]:
#Cleaning Price
#Removing dollar sign. regex=False makes the replacement literal: '$' is a regex
#metacharacter and the default of the regex argument differs between pandas versions.
df['Price'] = df['Price'].str.replace('$', '', regex=False)

In [None]:
#Show the distinct price values again
df['Price'].unique()

In [None]:
#Cast the Price column from object to float
df['Price'] = df['Price'].astype('float')
#Price cleaned..

In [None]:
#Cleaning Genres
#Position of the first ';' in every Genres string (-1 when there is none)
temp = np.array(df['Genres'].str.find(';'))
#Positional row numbers of the entries that contain a ';' after the first character,
#returned as a one-element tuple of arrays
ls = (temp > 0).nonzero()

In [None]:
#Creating a temp dataframe which includes the rows that have multiple genres.
#ls is the one-element tuple returned by np.where, so ls[0] holds the positional row numbers.
#The explicit .copy() makes df2 an independent frame, so assigning to df2.Genres later
#does not trigger SettingWithCopyWarning.
df2 = df.iloc[ls[0], :].copy()

In [None]:
#Getting shape (rows, columns) of the original dataframe, before the multi-genre rows are appended
df.shape

In [None]:
#Getting shape (rows, columns) of the temp data frame that holds the multi-genre rows
df2.shape

In [None]:
#Keep only the text before the first ';' in the original dataframe, i.e. the first genre
df['Genres'] = df['Genres'].apply(lambda genre: genre.partition(';')[0])

In [None]:
#Displaying the head of the temp dataframe (Genres still holds the combined 'first;second' strings here)
df2.head()

In [None]:
#Split each Genres string of the temp dataframe on ';' and keep the second genre only
df2['Genres'] = df2['Genres'].apply(lambda genre: genre.split(';')[1])

In [None]:
#Displaying the head of the temp dataframe again, now that Genres holds only the second genre
df2.head()

In [None]:
#Appending the rows of the second dataframe to the first one.
#pd.concat is used because DataFrame.append was removed in pandas 2.0;
#ignore_index=True renumbers the combined rows from 0.
df = pd.concat([df, df2], ignore_index=True)
#It clears out the problem of multiple genres: such an app now has 2 entries, one per genre

In [None]:
#Displaying shape (rows, columns) of the original dataframe after the second-genre rows were added
df.shape

In [None]:
#Deleting the temp dataframe; its rows are already part of df
del df2
#Genre Column Cleaned.

In [None]:
#Displaying data info (column dtypes and non-null counts) after the cleaning steps so far
df.info()

In [None]:
#Parse the Last_Updated strings so the column becomes datetime instead of object
df['Last_Updated'] = pd.to_datetime(df['Last_Updated'])

In [None]:
#Displaying current data info to confirm the new dtype of Last_Updated
df.info()
#Last_Updated Clean

In [None]:
#Cleaning Current_Ver
#Show the rows whose version is given as "Varies with device"
df.loc[df['Current_Ver'] == "Varies with device"]

In [None]:
#Collect the index labels of the rows whose version is 'Varies with device'
vwd_indices = df.index[df['Current_Ver'] == 'Varies with device']

In [None]:
#Most frequent value of Current_Ver, with the 'Varies with device' rows still included
df['Current_Ver'].mode()

In [None]:
#Work on an independent (deep) copy so the original dataframe stays untouched
df_temp = df.copy(deep=True)

In [None]:
#Remove from the temp dataframe the rows whose current version is 'Varies with device'
df_temp.drop(index=vwd_indices, inplace=True)

In [None]:
#Most frequent Current_Ver once the 'Varies with device' rows are excluded
df_temp['Current_Ver'].mode()

In [None]:
#Replace 'Varies with device' in the original dataframe with the next most frequent Current_Ver value.
#NOTE(review): '1.0' is hard-coded - presumably the mode reported for the temp dataframe; verify against that output
df['Current_Ver'] = df['Current_Ver'].replace(to_replace='Varies with device', value='1.0')
#Current_Ver column cleaned

In [None]:
#Deleting the temp dataframe; it was only needed to compute the mode without 'Varies with device'
del df_temp

In [None]:
#Show the rows that are exact repeats of an earlier row
df.loc[df.duplicated()]

In [None]:
#Remove repeated rows in place, keeping the first occurrence of each (the default)
df.drop_duplicates(inplace=True)

In [None]:
#Reset the index values to 0..n-1.
#drop=True discards the old index directly instead of first inserting it as an
#'index' column and then dropping that column again.
df.reset_index(drop=True, inplace=True)
df.head()