`Clone repository to extract the data`

In [4]:
# Download the repository that ships the dataset as compressed archives.
# NOTE(review): this cell is not re-runnable as-is - a second run would clone
# into an already existing directory and the relative %cd below would start
# from the new working directory.
!git clone https://github.com/gauthamp10/Google-Playstore-Dataset.git
# Switch the notebook's working directory to the folder holding the archives;
# the relative paths used afterwards resolve from here.
%cd Google-Playstore-Dataset/dataset/
# Unpack every .tar.gz archive found in that folder (-v lists each extracted file).
!for f in *.tar.gz; do tar -xvf "$f"; done
# Concatenate all Part*.csv files, in shell glob order, into one combined CSV.
# NOTE(review): plain `cat` assumes only the first part carries a header row
# and that every part ends with a newline - confirm against the archives.
!cat Part*.csv > Google-Playstore-Dataset.csv

Cloning into 'Google-Playstore-Dataset'...
remote: Enumerating objects: 95, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 95 (delta 0), reused 3 (delta 0), pack-reused 92[K
Receiving objects: 100% (95/95), 268.08 MiB | 25.23 MiB/s, done.
Resolving deltas: 100% (45/45), done.
/content/Google-Playstore-Dataset/dataset
Part1.csv
Part2.csv
Part3.csv


In [5]:
import pandas as pd

# Load the combined CSV (path is relative to the current working directory)
# into a DataFrame, then print pandas' text preview of it.
data = pd.read_csv(filepath_or_buffer="Google-Playstore-Dataset.csv")
print(data)

                                                  App Name  \
0                                                  Gakondo   
1                                      Ampere Battery Info   
2                                                   Vibook   
3        Smart City Trichy Public Service Vehicles 17UC...   
4                                                  GROW.me   
...                                                    ...   
2312939                                           大俠客—熱血歸來   
2312940                                         ORU Online   
2312941                                     Data Structure   
2312942                                        Devi Suktam   
2312943                       Biliyor Musun - Sonsuz Yarış   

                                       App Id       Category  Rating  \
0                         com.ishakwe.gakondo      Adventure     0.0   
1                  com.webserveis.batteryinfo          Tools     4.4   
2                        com.doantiepvi

In [6]:
# Drop the identifier, pricing, developer-contact and scrape-time columns.
data1 = data.drop(
    columns=[
        'App Id', 'Rating Count', 'Currency', 'Price',
        'Developer Website', 'Privacy Policy', 'Developer Id',
        'Developer Email', 'Scraped Time',
    ]
)

# Report the number of missing values in each remaining column.
print(data1.isnull().sum())

# Discard every row that has at least one missing value.
df2 = data1.dropna()

# No NaN can survive dropna(), so filling missing values afterwards would be
# a no-op; only the dtype of 'Minimum Installs' still has to be fixed.
# astype() returns a new frame, so df2 itself is left untouched.
df3 = df2.astype({'Minimum Installs': int})

# Remove exact duplicate rows, keeping the first occurrence of each.
df4 = df3.drop_duplicates()

# Keep only apps whose name has at most three whitespace-separated words ...
df_filtered = df4[df4['App Name'].str.split().str.len() <= 3]

# ... and consists solely of ASCII letters, digits and spaces.
# fullmatch() is used instead of match() with '^...$' because '$' also
# matches just before a trailing newline, which would let a name ending in
# '\n' slip through the "no special characters" filter.
df_1 = df_filtered[df_filtered['App Name'].str.fullmatch(r'[a-zA-Z0-9 ]+')]

App Name                2
Category                0
Rating              22883
Installs              107
Minimum Installs      107
Maximum Installs        0
Free                    0
Size                  196
Minimum Android      6530
Released            71053
Last Updated            0
Content Rating          0
Ad Supported            0
In App Purchases        0
Editors Choice          0
dtype: int64


In [7]:
# Show each distinct 'Minimum Android' value together with its row count.
print(df_1['Minimum Android'].value_counts())

# Continue on an independent copy so the filtered frame df_1 stays untouched.
df_data = df_1.copy()

# Replace 0 values in the 'Minimum Android' column with 'unknown'.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on the df_data['Minimum Android'] selection is a
# chained in-place operation on an intermediate Series, which pandas warns
# about and which does not update df_data when Copy-on-Write is enabled.
df_data['Minimum Android'] = df_data['Minimum Android'].replace(0, 'unknown')

4.1 and up      255393
5.0 and up      194660
4.4 and up      182126
4.0.3 and up     78646
4.0 and up       69045
                 ...  
4.0 - 4.4W           1
2.2 - 4.1.1          1
1.6 - 4.4            1
4.2 - 4.3            1
2.3.3 - 6.0          1
Name: Minimum Android, Length: 121, dtype: int64


In [8]:
# Print how often each distinct value occurs in the 'Content Rating' column.
print(df_data['Content Rating'].value_counts())

# Coarser labels for some ratings; a rating that is not a key of this
# dictionary keeps its current label.
mapping = dict([
    ('Everyone 10+', 'Everyone'),
    ('Mature 17+', 'Adults'),
    ('Adults only 18+', 'Adults'),
])

# Substitute the mapped labels and leave every other value as it is.
df_data['Content Rating'] = df_data['Content Rating'].replace(mapping)

# Print the labels that remain after standardisation.
print(df_data['Content Rating'].unique())

Everyone           953317
Teen                70508
Mature 17+          20500
Everyone 10+        13064
Unrated                74
Adults only 18+        58
Name: Content Rating, dtype: int64
['Everyone' 'Teen' 'Adults' 'Unrated']


In [9]:
# Split the cleaned frame by dtype: int/float columns on one side,
# object (text) columns on the other. A column of any other dtype ends up in
# neither frame, so it is absent from anything built from these two alone.
numeric_columns = df_data.select_dtypes(include=['int', 'float'])
catego_columns = df_data.select_dtypes(include='object')

In [10]:
# For every text column, report whether any of its string values contains a
# non-ASCII character; values that are not strings are ignored.
for column in catego_columns:
    found_non_ascii = any(
        isinstance(value, str) and not value.isascii()
        for value in df_data[column]
    )
    if not found_non_ascii:
        print(f'The column "{column}" has no strange characters.')
        continue
    print(f'The column "{column}" has strange characters.')

The column "App Name" has no strange characters.
The column "Category" has no strange characters.
The column "Installs" has no strange characters.
The column "Size" has no strange characters.
The column "Minimum Android" has no strange characters.
The column "Released" has no strange characters.
The column "Last Updated" has no strange characters.
The column "Content Rating" has no strange characters.


In [11]:
# check if there are empty spaces

def check_empty_spaces(catego_columns, frame=None):
    """Count, per column, the string values that are empty or whitespace-only.

    Parameters
    ----------
    catego_columns : iterable of column labels
        Columns to inspect. Passing a DataFrame works as well, because
        iterating over a DataFrame yields its column labels.
    frame : pandas.DataFrame, optional
        Frame whose values are inspected. When omitted, the notebook-level
        ``df_data`` is used, which is what the function always did before
        this parameter existed.

    Returns
    -------
    dict
        Maps each column label to the number of values in that column that
        are strings and become ``''`` after ``str.strip()``. Values that are
        not strings are never counted.
    """
    if frame is None:
        # Backward-compatible default: fall back to the global frame.
        frame = df_data

    def _is_blank(value):
        # Only real strings can be "blank"; NaN / numbers are skipped.
        return isinstance(value, str) and value.strip() == ''

    return {
        column: frame[column].apply(_is_blank).sum()
        for column in catego_columns
    }

# Count blank string values for every text column, then report one line per column.
empty_spaces = check_empty_spaces(catego_columns)

for column in empty_spaces:
    recount = empty_spaces[column]
    print(f'Column "{column}" has {recount} values that are empty spaces.')


Column "App Name" has 0 values that are empty spaces.
Column "Category" has 0 values that are empty spaces.
Column "Installs" has 0 values that are empty spaces.
Column "Size" has 0 values that are empty spaces.
Column "Minimum Android" has 0 values that are empty spaces.
Column "Released" has 0 values that are empty spaces.
Column "Last Updated" has 0 values that are empty spaces.
Column "Content Rating" has 0 values that are empty spaces.


In [12]:
# Per-column quartiles and interquartile range (IQR) of the numeric frame.
Q1 = numeric_columns.quantile(0.25)
Q3 = numeric_columns.quantile(0.75)
IQR = Q3 - Q1

# Fences: a value more than 1.5 * IQR outside the quartiles is an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Number of outliers in each column (values below the lower or above the upper fence).
below = numeric_columns.lt(lower_fence)
above = numeric_columns.gt(upper_fence)
outliers = (below | above).sum()

# Grand total across all numeric columns.
total_outliers = outliers.sum()

print("Total outliers in numeric columns:", total_outliers)

Total outliers in numeric columns: 417066


In [13]:
# Start from a fresh, independent copy of the numeric columns of df_data.
# The loop below modifies numeric_columns in place; without this reset a
# second run of the cell would compute quartiles on already-altered data and
# replace additional values, so the result depended on how often it was run.
numeric_columns = df_data[numeric_columns.columns].copy()

# Median of every numeric column, taken before any value is replaced.
median_values = numeric_columns.median()

# Replace each IQR outlier with the median of its column.
# NOTE(review): only the numeric columns are changed here; the text column
# 'Installs' keeps its original value, so the two can disagree afterwards
# (e.g. Installs '5,000+' next to Minimum Installs 100 in the final table) -
# confirm that this is intended.
for column in numeric_columns.columns:
    median = median_values[column]
    Q1 = numeric_columns[column].quantile(0.25)
    Q3 = numeric_columns[column].quantile(0.75)
    IQR = Q3 - Q1
    # Both masks are built from the not-yet-modified column, so they can be
    # combined and applied in a single assignment.
    outliers_low = numeric_columns[column] < (Q1 - 1.5 * IQR)
    outliers_high = numeric_columns[column] > (Q3 + 1.5 * IQR)
    numeric_columns.loc[outliers_low | outliers_high, column] = median

print(numeric_columns)

         Rating  Minimum Installs  Maximum Installs
0           0.0                10                15
1           4.4               100              7662
2           0.0                50                58
5           0.0                50                89
7           2.0               500               702
...         ...               ...               ...
2312935     0.0              1000              1302
2312936     0.0               100               353
2312940     0.0               100               430
2312941     0.0               100               202
2312942     3.5              1000              2635

[1057521 rows x 3 columns]


In [14]:
# Put the text columns and the outlier-treated numeric columns side by side
# (aligned on the row index), then drop any row left with a missing value.
df_1end = pd.concat(objs=(catego_columns, numeric_columns), axis='columns')
data_apps = df_1end.dropna(axis='index', how='any')
print(data_apps)

                      App Name         Category Installs  Size  \
0                      Gakondo        Adventure      10+   10M   
1          Ampere Battery Info            Tools   5,000+  2.9M   
2                       Vibook     Productivity      50+  3.7M   
5                       IMOCCI           Social      50+   46M   
7        The Everyday Calendar        Lifestyle     500+   16M   
...                        ...              ...      ...   ...   
2312935       Floral Wallpaper  Personalization   1,000+   29M   
2312936      Engineers Careers         Business     100+   21M   
2312940             ORU Online        Education     100+   44M   
2312941         Data Structure        Education     100+   29M   
2312942            Devi Suktam    Music & Audio   1,000+   10M   

        Minimum Android      Released  Last Updated Content Rating  Rating  \
0            7.1 and up  Feb 26, 2020  Feb 26, 2020       Everyone     0.0   
1            5.0 and up  May 21, 2020  May 06, 2021

In [15]:
# Final frame: an independent copy of the cleaned data with a 1-based
# sequential 'ID' as its first column.
df_final = data_apps.copy()

row_total = df_final.shape[0]
df_final.insert(loc=0, column='ID', value=range(1, row_total + 1))

# Print the object's type, then display the first rows as the cell output.
print(type(df_final))
df_final.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,ID,App Name,Category,Installs,Size,Minimum Android,Released,Last Updated,Content Rating,Rating,Minimum Installs,Maximum Installs
0,1,Gakondo,Adventure,10+,10M,7.1 and up,"Feb 26, 2020","Feb 26, 2020",Everyone,0.0,10,15
1,2,Ampere Battery Info,Tools,"5,000+",2.9M,5.0 and up,"May 21, 2020","May 06, 2021",Everyone,4.4,100,7662
2,3,Vibook,Productivity,50+,3.7M,4.0.3 and up,"Aug 9, 2019","Aug 19, 2019",Everyone,0.0,50,58
5,4,IMOCCI,Social,50+,46M,6.0 and up,"Dec 24, 2018","Dec 20, 2019",Teen,0.0,50,89
7,5,The Everyday Calendar,Lifestyle,500+,16M,5.0 and up,"Jun 21, 2019","Jun 21, 2019",Everyone,2.0,500,702


`Save clean dataset`

In [20]:
# Destination path of the cleaned dataset ("ruta_archivo" = file path).
ruta_archivo = "/content/Google-Playstore-Dataset-Clean.csv"
# Write the final frame to that path as UTF-8 CSV, without the index column.
df_final.to_csv(ruta_archivo, index=False, encoding='utf-8')

# Google Colab helper module; this import only succeeds inside Colab.
from google.colab import files
# Called without a path, to_csv returns the CSV content as a string.
# NOTE(review): neither `files` nor `buffer` is used in the visible part of
# this cell - presumably a download step follows; confirm.
buffer = df_final.to_csv(index=False)