In [None]:
!pip install pyforest

In [None]:
import os
import pyforest

# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Read the Play Store CSV from the Kaggle input directory into `data`.
# NOTE(review): `pd` is never imported explicitly in the visible cells;
# presumably pyforest's lazy imports provide it. Confirm this resolves on a
# fresh kernel (Restart & Run All).
csv_path = "/kaggle/input/playstore-dataset/playstore-analysis.csv"
data = pd.read_csv(csv_path)

# Preview the first three rows.
data.head(3)

The cell above displays the first 3 rows of the `data` DataFrame, which contains the following information about each app:

- **App:** The name of the app
- **Category:** The category of the app
- **Rating:** The average rating of the app
- **Reviews:** The number of reviews for the app
- **Size:** The size of the app in megabytes
- **Installs:** The number of times the app has been installed
- **Type:** Whether the app is free or paid
- **Price:** The price of the app, if it is paid
- **Content Rating:** The content rating of the app
- **Genres:** The genres of the app
- **Last Updated:** The date the app was last updated 
- **Current Ver:** The current version of the app
- **Android Ver:** The Android version listed for the app

In [None]:
# Keep only digits, periods and spaces in the 'Android Ver' column so the
# version strings are more consistent and easier to work with.
android_ver = data['Android Ver'].str.replace('[^0-9. ]', '', regex=True)

# Removing words leaves stray spaces behind (e.g. "4.0 and up" -> "4.0  "),
# so trim them; otherwise identical versions can end up as distinct labels.
android_ver = android_ver.str.strip()

# An entry that contained no digits at all is now an empty string. Turn it into
# a missing value so that isna() and the later imputation step can see it
# (values that were already NaN compare unequal to '' and stay NaN).
data['Android Ver'] = android_ver.where(android_ver != '')

In [None]:
# Preview the first three rows to check the cleaned 'Android Ver' values.
data.head(3)

In [None]:
# Dimensions of the DataFrame as (number of rows, number of columns).
data.shape

In [None]:
# Descriptive statistics for the DataFrame's columns.
data.describe()

In [None]:
# Column labels currently present in the DataFrame.
data.columns

## **DATA CLEANING**

In [None]:
# Drop columns that none of the later visible cells use.
# - Each column is listed once ("Current Ver" appeared twice before).
# - errors="ignore" keeps this cell re-runnable: on a second run the columns
#   are already gone and a strict drop would raise KeyError.
# - The result is assigned back instead of using inplace=True, so the step
#   reads as a plain data -> data transformation.
columns_to_drop = ["App", "Current Ver", "Last Updated", "Genres"]
data = data.drop(columns=columns_to_drop, errors="ignore")

# Preview the remaining columns.
data.head(3)

In [None]:
# View the first two rows of the DataFrame after the column drop.
data.head(2)

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Impute missing values in the 'Rating' column with the column mean.
mean_rating = data['Rating'].mean()

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on data['Rating']: the in-place form acts on an intermediate object, emits a
# FutureWarning in pandas 2.x, and leaves `data` unchanged under Copy-on-Write.
data['Rating'] = data['Rating'].fillna(mean_rating)

In [None]:
# Impute missing values in the 'Type' column with its most frequent value.
mode_type = data['Type'].mode()[0]

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on data['Type']: the in-place form acts on an intermediate object, emits a
# FutureWarning in pandas 2.x, and leaves `data` unchanged under Copy-on-Write.
data['Type'] = data['Type'].fillna(mode_type)

In [None]:
# Impute missing values in the 'Android Ver' column with its most frequent value.
mode_android_ver = data['Android Ver'].mode()[0]

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on data['Android Ver']: the in-place form acts on an intermediate object,
# emits a FutureWarning in pandas 2.x, and leaves `data` unchanged under
# Copy-on-Write.
data['Android Ver'] = data['Android Ver'].fillna(mode_android_ver)

In [None]:
# Impute missing values in the 'Content Rating' column with its most common value.
most_common_content_rating = data['Content Rating'].mode()[0]

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on data['Content Rating']: the in-place form acts on an intermediate object,
# emits a FutureWarning in pandas 2.x, and leaves `data` unchanged under
# Copy-on-Write.
data['Content Rating'] = data['Content Rating'].fillna(most_common_content_rating)

In [None]:
# Re-count missing values per column after the fill steps above.
data.isna().sum()

In [None]:
# Print a heading, then display the descriptive statistics of `data` as the
# cell's output.
print("Summary Statistics:")
summary_stats = data.describe()
summary_stats

In [None]:
# Histogram of the 'Rating' column (20 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Rating'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter plot of 'Reviews' (x-axis) against 'Installs' (y-axis).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='Reviews', y='Installs', ax=ax)
ax.set(
    title='Relationship between Reviews and Installs',
    xlabel='Number of Reviews',
    ylabel='Number of Installs',
)
plt.show()

In [None]:
# Bar chart counting the rows for each value of the 'Type' column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Type', ax=ax)
ax.set_title('App Type Distribution')
ax.set_xlabel('Type (Free or Paid)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart counting the rows for each value of the 'Content Rating' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Content Rating', ax=ax)
ax.set(
    title='Content Rating Distribution',
    xlabel='Content Rating',
    ylabel='Count',
)
# Tilt the x tick labels by 45 degrees.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Bar chart counting the rows for each value of the 'Android Ver' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x='Android Ver', ax=ax)
ax.set_title('Android Version Distribution')
ax.set_xlabel('Android Version')
ax.set_ylabel('Count')
# Draw the x tick labels vertically.
ax.tick_params(axis='x', labelrotation=90)
plt.show()