In [None]:
!pip install pyforest

In [None]:
import os
import pyforest

# Walk the Kaggle input directory and print the full path of every file found
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Read the Play Store CSV into a DataFrame
DATA_PATH = "/kaggle/input/playstore-dataset/playstore-analysis.csv"
data = pd.read_csv(DATA_PATH)

# Preview the first three rows
data.head(3)

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
data.shape


In [None]:
# Summary statistics (count, mean, std, quartiles, ...) produced by pandas' describe()
data.describe()

In [None]:
# Column labels available in the DataFrame
data.columns

## DATA CLEANING

In [None]:
# Drop the listed columns from the DataFrame (each label is named exactly once).
# The result is assigned back instead of mutating in place.
COLUMNS_TO_DROP = ["App", "Current Ver", "Last Updated", "Genres"]
data = data.drop(columns=COLUMNS_TO_DROP)

# Display column names, dtypes and non-null counts of the remaining columns
data.info()

In [None]:
# Drop duplicate rows, keeping the first occurrence of each.
# keep=False would delete *every* copy of a repeated row, so records that appear
# more than once would vanish from the analysis instead of being de-duplicated.
data = data.drop_duplicates(keep="first")

# Row count after de-duplication
d_l = len(data)
d_l

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Category column cleaning steps
# Only the last expression of a cell is rendered automatically, so the
# intermediate results are shown explicitly with display().

# 1. Rows where Category is missing
display(data[data["Category"].isna()])

# 2. Distinct values present in the column
display(data["Category"].unique())

# 3. Row(s) whose Category is the stray value "1.9"
data.loc[data["Category"] == "1.9"]

In [None]:
# Drop the row(s) whose Category is the invalid value "1.9".
# The rows are selected by value rather than by a hard-coded index label, so the
# cell does not raise KeyError when it is re-run after the row is already gone.
bad_category_rows = data.index[data["Category"] == "1.9"]
data = data.drop(index=bad_category_rows)


In [None]:
# Ratings cleaning: report the largest and smallest rating present
ratings = data["Rating"]
highest, lowest = ratings.max(), ratings.min()
print("Max value is: ", highest)
print("Min value is: ", lowest)

The values fall within the acceptable range for ratings, which is between 1 and 5.

In [None]:
# Impute missing ratings (counted in the isna() summary earlier) with the most
# frequent rating, i.e. the mode.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on data["Rating"] is chained assignment, which pandas
# warns about and which leaves the DataFrame unchanged under Copy-on-Write.
filler = data["Rating"].mode()
data["Rating"] = data["Rating"].fillna(filler.iloc[0])
data["Rating"].unique()

In [None]:
# Reviews cleaning: cast the Reviews column to an integer dtype
data = data.astype({"Reviews": int})

# Confirm the new dtype in the column summary
data.info()

In [None]:
# Count the missing values in the Reviews column
data["Reviews"].isna().sum()

There are no null values in the column Reviews

In [None]:
# Count the missing values in the Size column
data["Size"].isna().sum()

There are no null values in the column Size

In [None]:
# Cleaning the Installs column: start by counting its missing values
data["Installs"].isna().sum()

There are no null values in the column Installs

In [None]:
# Strip every non-digit character (e.g. the trailing '+') from Installs,
# then convert the remaining digit strings to numbers
installs_digits = data["Installs"].str.replace(r'\D', '', regex=True)
data["Installs"] = pd.to_numeric(installs_digits)

# Inspect the distinct install counts after conversion
data["Installs"].unique()

In [None]:
# Cleaning the Type column
# The two statements below (locate the row with a missing Type, then drop
# index 9148) are currently commented out and do not run:
# data[data["Type"].isna()]   # find the index of the row with the missing value
# data.drop(index = 9148, inplace = True) # drop the row
# List the distinct values present in Type
data["Type"].unique()

In [None]:
# Cleaning the Price column
# Remove the literal "$" and convert to a number. regex=False is passed
# explicitly so "$" is always treated as a plain character; as a regular
# expression it is the end-of-string anchor, and the default for `regex`
# differs between pandas versions.
price_text = data["Price"].str.replace("$", "", regex=False)
data["Price"] = pd.to_numeric(price_text)
data["Price"].unique()

In [None]:
# Cleaning the Android Ver column
# Locate the rows with a missing Android version and show them
# (display() is needed because this is not the last expression in the cell).
missing_android_ver = data[data["Android Ver"].isna()]
display(missing_android_ver)

# Drop those rows using their actual index labels instead of hard-coded ones,
# so the cell keeps working if the data changes or the cell is re-run.
indices = missing_android_ver.index.to_list()
data = data.drop(index=indices)
data["Android Ver"].unique()

In [None]:
# Remove the "and up" suffix and the "W" marker as literal text (regex=False),
# then trim the whitespace left behind, so a value like "4.1 and up" becomes
# "4.1" rather than "4.1 ".
data["Android Ver"] = (
    data["Android Ver"]
    .str.replace("and up", "", regex=False)
    .str.replace("W", "", regex=False)
    .str.strip()
)
data["Android Ver"].unique()

In [None]:
# Most frequent value(s) in the Android Ver column
data["Android Ver"].mode()

## DATA ANALYSIS AND VISUALIZATION

In [None]:
# Preview the first five rows of the DataFrame before plotting
data.head()

In [None]:
# Select matplotlib's 'ggplot' style sheet for the figures drawn afterwards
plt.style.use('ggplot')

In [None]:
# Two bar charts side by side:
# left = number of apps per category, right = total installs per category.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(18, 8))

# Left plot: the ten categories with the most apps
top_10 = data["Category"].value_counts().iloc[:10]
top_10.plot(ax=ax0, kind="bar",
            title="Top 10 Categories on Playstore by Number of Apps")
# Label through the Axes object: plt.xlabel() acts on the *current* axes, which
# is the most recently created one (the right-hand plot), not this one.
ax0.set_xlabel("Category")
ax0.set_ylabel("No of Apps")

# Right plot: total installs per category, expressed in billions,
# sorted in descending order and limited to the top ten
totals = data.groupby("Category")["Installs"].sum() / 1_000_000_000
totals = totals.sort_values(ascending=False).head(10)
totals.plot(ax=ax1, kind="bar", title="Top 10 Categories by Installs")
ax1.set_xlabel("Category")
ax1.set_ylabel("No of Installs in Billions")

# plt.show() renders the figure; fig.show() emits a warning on the non-GUI
# inline backend used by notebooks.
plt.show()

The Games category has the most installs, while the Family category has the largest number of apps on the Play Store.

In [None]:
# Collect the names of the top categories by installs into a plain Python list
categories = totals.index.tolist()
categories

In [None]:
# Two histograms side by side: app size (left) and app rating (right)
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(16, 8))

# Left histogram: size distribution.
# The scaled sizes live in their own Series. Writing them back through an alias
# of `data` would overwrite data["Size"] and divide it by 1024 again on every
# re-run of this cell.
# NOTE(review): the /1024 conversion assumes Size is stored in KB - confirm.
size_mb = data["Size"] / 1024

# Use the histogram's own bin edges as x ticks so the ticks line up with the bars
counts, bin_edges = np.histogram(size_mb)
size_mb.plot(ax=ax0, kind="hist",
             title="Distribution of apps based on their size",
             xticks=bin_edges,
             xlim=(bin_edges[0], bin_edges[-1]))
ax0.set_xlabel("Size in Mbs")
ax0.set_ylabel("No of Apps")

# Right histogram: rating distribution
data["Rating"].plot(ax=ax1, kind="hist",
                    title="Distribution of apps based on their ratings")
ax1.margins(x=0.01)  # leave only 1% horizontal padding around the data
ax1.set_xlabel("Rating")
ax1.set_ylabel("No of Apps")

plt.show()

In [None]:
# Two pie charts side by side: share of installs by Type (left) and by
# Content Rating (right). Both panels are configured through their own Axes
# object so each call clearly targets the intended plot.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 6))

# Left plot: total installs per Type
df_in = data.groupby("Type")["Installs"].sum()
df_in.plot(ax=ax0, kind="pie",
           title="Installs per type",
           labeldistance=None,   # no wedge labels; the legend identifies the slices
           startangle=90,
           autopct='%1.1f%%')
# Clear the y-axis label explicitly: ylabel=None is the plot() default, so
# passing it does not remove a label pandas may add from the Series name.
ax0.set_ylabel("")
ax0.legend(bbox_to_anchor=(0.5, 0, 0.5, 1))

# Right plot: total installs per Content Rating
cr_in = data.groupby("Content Rating")["Installs"].sum()
cr_in.plot(ax=ax1, kind="pie",
           title="Installs per Content Rating",
           labeldistance=None,
           startangle=90,
           autopct='%1.2f%%')
ax1.set_ylabel("")
ax1.legend(bbox_to_anchor=(1, 0, 0.5, 1))

plt.show()

In [None]:
# Scatter plot of review count (in millions) against rating.
# The scaled values are placed on a derived frame via assign(), so
# data["Reviews"] keeps its raw counts and re-running the cell does not divide
# the column again.
reviews_in_millions = data.assign(Reviews=data["Reviews"] / 1_000_000)
ax = reviews_in_millions.plot(kind="scatter", x="Rating", y="Reviews",
                              alpha=0.5, figsize=[8, 6])
ax.set_ylabel("Reviews (millions)")
# The y-axis shows Reviews, so the title names Reviews (not Price)
ax.set_title("Ratings vs Reviews")
plt.show()

In [None]:
# Scatter plot of Price against Rating
df = data
price_ax = df.plot.scatter(x="Rating", y="Price", figsize=[8, 6])
price_ax.set_title("Ratings vs Price")

In [None]:
# Scatter plot of Size against Rating, drawn semi-transparent so overlapping points stay visible
size_ax = data.plot.scatter(x="Rating", y="Size", alpha=0.5, figsize=[8, 6])
size_ax.set_title("Ratings vs Size")