In [2]:
import pandas as pd
import numpy as np
import re
import subprocess
from pathlib import Path
import io
import dvc.api, dvc.repo

# Google Play Store

In [3]:
# Git repository passed to DVC as the place to resolve the tracked dataset from.
url = "https://github.com/mostafa-fallaha/hackathon-apps-dvc"
# Fetch the tracked CSV through the DVC API; the result is handled as text and wrapped
# in io.StringIO below so pandas can read it like a file.
# NOTE(review): the recorded output of this cell is a Google OAuth prompt, so the DVC
# remote presumably requires interactive Google Drive authorisation - confirm before
# running this unattended.
data = dvc.api.read("data/googleplaystore.csv", encoding='utf-8', repo=url)
df = pd.read_csv(io.StringIO(data))
# Random preview of 10 rows; no random_state is passed, so the rows differ between runs.
df.sample(10)



Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force



In [3]:
df.shape

(10841, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


## Cleaning

### Rating - fixing the row whose values are shifted one column to the left

In [6]:
df.loc[df['Rating'] == df['Rating'].max()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,


In [7]:
# Every column except the first one ('App'); these are the columns whose values
# have to be moved for the misaligned row.
cols = df.columns.to_list()[1:]
cols

['Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver']

In [8]:
# Row 10472 has each value stored one column too far to the left (its 'Category' holds
# the rating). Walk the columns from right to left and copy every value from its left
# neighbour, so nothing is overwritten before it has been moved.
for idx in reversed(range(1, len(cols))):
    target, source = cols[idx], cols[idx - 1]
    print(target)
    moved = df.loc[10472, source]
    # 'Rating' is a float column, so the value arriving from 'Category' is cast to float.
    df.loc[10472, target] = float(moved) if target == 'Rating' else moved

Android Ver
Current Ver
Last Updated
Genres
Content Rating
Price
Type
Installs
Size
Reviews
Rating


In [9]:
df.loc[[10472]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up


In [10]:
# After the shift 'Category' still holds the stale rating value and 'Genres' is empty,
# so both are labelled by hand for this single row.
df.loc[10472, ['Category', 'Genres']] = 'LIFESTYLE'

In [11]:
df.loc[[10472]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,LIFESTYLE,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,LIFESTYLE,"February 11, 2018",1.0.19,4.0 and up


In [12]:
# Replace missing ratings with the mean of the ratings that are present.
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())

### Type

In [13]:
df.loc[df['Type'].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
9148,Command & Conquer: Rivals,FAMILY,4.191513,0,Varies with device,0,,0,Everyone 10+,Strategy,"June 28, 2018",Varies with device,Varies with device


In [14]:
df.loc[df['Type'] == 'Free'].shape

(10040, 13)

In [15]:
df.loc[df['Type'] == 'Paid'].shape

(800, 13)

In [16]:
# The only row without a Type has Price 0 and 'Free' is by far the most common value,
# so the gap is filled with 'Free'.
df['Type'] = df['Type'].fillna('Free')

In [17]:
df.loc[df['Type'].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver


In [18]:
df.loc[df['Type'] == 'Free'].shape

(10041, 13)

In [19]:
df[df['Size'].str.endswith('G')]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver


### Installs

In [20]:
# Step 1: Remove the thousands separators and the trailing plus sign ("1,000+" -> "1000").
# The pattern is a raw string: '\+' in a normal string literal is an invalid escape
# sequence, and inside a character class the plus sign is already literal.
df['Installs'] = df['Installs'].replace(r'[+,]', '', regex=True)
# Step 2: Convert the resulting string to an integer
df['Installs'] = df['Installs'].astype(int)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Last Updated

In [21]:
# Parse the date strings (shown as e.g. "January 7, 2018" in the preview above) into
# pandas datetimes. No explicit format is given, so parsing relies on pandas' inference.
df['Last Updated'] = pd.to_datetime(df['Last Updated'])

In [22]:
df.iloc[[5583]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
5583,The Aether: Life as a God,FAMILY,4.1,1407,3.4M,100000,Free,0,Everyone 10+,Role Playing,2018-03-12,1.0.5,4.0 and up


### Reviews

In [23]:
# 'Reviews' was loaded as object dtype; cast it to integers.
# This depends on the row 10472 repair above: before the shift that row's 'Reviews'
# value was '3.0M', which cannot be converted to int.
df['Reviews'] = df['Reviews'].astype(int)

### Price

In [24]:
def clean_price(price):
    """Turn a raw price string into a float.

    The '$' and '£' currency symbols and ',' separators are stripped before parsing.

    Returns:
        The parsed float, None when the remaining text is not a valid number,
        or the input itself when it is not a string.
    """
    # Non-string values are handed back untouched.
    if not isinstance(price, str):
        return price

    stripped = price
    for symbol in ('$', '£', ','):
        stripped = stripped.replace(symbol, '')

    try:
        return float(stripped)
    except ValueError:
        # Text that still is not numeric after stripping (e.g. a misplaced label).
        return None


In [25]:
# Convert every price entry to a float, element by element.
df['Price'] = df['Price'].map(clean_price)

### Fixing the size

In [26]:
df[df['Size'].str.endswith('k')].head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
58,Restart Navigator,AUTO_AND_VEHICLES,4.0,1403,201k,100000,Free,0.0,Everyone,Auto & Vehicles,2014-08-26,1.0.1,2.2 and up
209,Plugin:AOT v5.0,BUSINESS,3.1,4034,23k,100000,Free,0.0,Everyone,Business,2015-09-11,3.0.1.11 (Build 311),2.2 and up


In [27]:
def convert_to_megabytes(size):
    """Convert a kilobyte size string such as '201k' into a megabyte string.

    Args:
        size: A size label. Strings ending in a lowercase 'k' are treated as
            kilobytes; anything else is passed through.

    Returns:
        For 'k' strings, the value divided by 1024, formatted with two decimals
        and an 'M' suffix (e.g. '201k' -> '0.20M'). Every other string, and any
        non-string value such as NaN or None, is returned unchanged.

    Raises:
        ValueError: If the text before the trailing 'k' is not a number.
    """
    # Missing values are floats/None, not strings; pass them through instead of
    # failing on .endswith().
    if not isinstance(size, str):
        return size
    if size.endswith('k'):
        # Drop the trailing 'k' and parse the kilobyte count.
        size_in_kb = float(size[:-1])
        # Convert to MB by dividing by 1024
        size_in_mb = size_in_kb / 1024
        return f"{size_in_mb:.2f}M"  # Format to 2 decimal places and add 'M'
    return size

# Rewrite the kilobyte entries of 'Size' as megabyte strings; other entries stay as they are.
df['Size'] = df['Size'].map(convert_to_megabytes)

In [28]:
df.iloc[[58]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
58,Restart Navigator,AUTO_AND_VEHICLES,4.0,1403,0.20M,100000,Free,0.0,Everyone,Auto & Vehicles,2014-08-26,1.0.1,2.2 and up


In [29]:
df['Size'].value_counts()

Size
Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
                      ... 
0.44M                    1
0.10M                    1
0.75M                    1
0.48M                    1
1.00M                    1
Name: count, Length: 276, dtype: int64

In [30]:
df.sample(7)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4740,Dictionary - WordWeb,BOOKS_AND_REFERENCE,4.6,124970,Varies with device,5000000,Free,0.0,Teen,Books & Reference,2018-05-26,Varies with device,Varies with device
1929,Kick the Buddy,GAME,4.3,1003269,Varies with device,50000000,Free,0.0,Teen,Action,2018-07-05,Varies with device,4.4 and up
4943,Ad Detect Plugin - Handy Tool,PRODUCTIVITY,3.8,5107,1.4M,500000,Free,0.0,Everyone,Productivity,2015-12-28,1.6.3,2.3 and up
4568,"360 Security - Free Antivirus, Booster, Cleaner",TOOLS,4.6,16771865,Varies with device,100000000,Free,0.0,Everyone,Tools,2018-08-04,Varies with device,Varies with device
2821,HD Camera,PHOTOGRAPHY,4.3,49680,8.7M,5000000,Free,0.0,Everyone,Photography,2018-07-11,1.8.5,4.2 and up
4330,EXO-L,FAMILY,4.6,67410,0.84M,1000000,Free,0.0,Everyone,Entertainment,2017-11-02,1.0.9,2.2 and up
4322,"letgo: Buy & Sell Used Stuff, Cars & Real Estate",SHOPPING,4.5,972256,20M,50000000,Free,0.0,Teen,Shopping,2018-08-02,2.4.8,4.1 and up


In [31]:
# def convert_size(size):
#     if 'M' in size:
#         return float(size.replace('M', ''))
#     elif 'k' in size:
#         return float(size.replace('k', '')) / 1024
#     return np.nan
# df['Size_num'] = df['Size'].apply(lambda x: convert_size(x) if x != 'Varies with device' else np.nan)
# # Calculate average size per category
# df_sorted = df.sort_values(by='Size_num', ascending=False)
# top_5_per_category = df_sorted.groupby('Category').head(3)
# category_avg_size = top_5_per_category.groupby('Category')['Size_num'].mean().round(1)
# print(category_avg_size)
# # Replace 'Varies with device' with the average size of the category
# df['Size_num'] = df.apply(
#     lambda row: category_avg_size[row['Category']] if pd.isna(row['Size_num']) else row['Size_num'],
#     axis=1
# )

In [32]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


In [33]:
# # Convert the numeric size back to string with 'M' for display purposes
# df['Size'] = df['Size_num'].apply(lambda x: f"{x:.1f}M")
# # Drop the helper column
# df.drop(columns=['Size_num'], inplace=True)

In [34]:
df.iloc[[139]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
139,Wattpad 📖 Free Books,BOOKS_AND_REFERENCE,4.6,2914724,Varies with device,100000000,Free,0.0,Teen,Books & Reference,2018-08-01,Varies with device,Varies with device


In [35]:
# df.loc[df['Size'] == 'Varies with device']

In [36]:
df.loc[df['Size'] == 'Varies with device'].shape[0]

1695

In [37]:
# def convert_size_to_float(size):
#     if 'M' in size:
#         return float(size.replace('M', ''))
# df['Size'] = df['Size'].apply(convert_size_to_float)

In [38]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


### Content Rating

In [39]:
df['Content Rating'].value_counts()

Content Rating
Everyone           8715
Teen               1208
Mature 17+          499
Everyone 10+        414
Adults only 18+       3
Unrated               2
Name: count, dtype: int64

In [40]:
df.loc[df['Content Rating'] == 'Unrated']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
7312,Best CG Photography,FAMILY,4.191513,1,2.5M,500,Free,0.0,Unrated,Entertainment,2015-06-24,5.2,3.0 and up
8266,DC Universe Online Map,TOOLS,4.1,1186,6.4M,50000,Free,0.0,Unrated,Tools,2012-02-27,1.3,2.3.3 and up


### Current Version

In [41]:
df.loc[df['Current Ver'].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
15,Learn To Draw Kawaii Characters,ART_AND_DESIGN,3.2,55,2.7M,5000,Free,0.0,Everyone,Art & Design,2018-06-06,,4.2 and up
1553,Market Update Helper,LIBRARIES_AND_DEMO,4.1,20145,0.01M,1000000,Free,0.0,Everyone,Libraries & Demo,2013-02-12,,1.5 and up
6322,Virtual DJ Sound Mixer,TOOLS,4.2,4010,8.7M,500000,Free,0.0,Everyone,Tools,2017-05-10,,4.0 and up
6803,BT Master,FAMILY,4.191513,0,0.22M,100,Free,0.0,Everyone,Education,2016-11-06,,1.6 and up
7333,Dots puzzle,FAMILY,4.0,179,14M,50000,Paid,0.99,Everyone,Puzzle,2018-04-18,,4.0 and up
7407,Calculate My IQ,FAMILY,4.191513,44,7.2M,10000,Free,0.0,Everyone,Entertainment,2017-04-03,,2.3 and up
7730,UFO-CQ,TOOLS,4.191513,1,0.23M,10,Paid,0.99,Everyone,Tools,2016-07-04,,2.0 and up
10342,La Fe de Jesus,BOOKS_AND_REFERENCE,4.191513,8,0.64M,1000,Free,0.0,Everyone,Books & Reference,2017-01-31,,3.0 and up


In [42]:
# Rows without a version string get the placeholder '1.0.0.0'.
df['Current Ver'] = df['Current Ver'].fillna('1.0.0.0')

In [43]:
df.iloc[[15, 1553]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
15,Learn To Draw Kawaii Characters,ART_AND_DESIGN,3.2,55,2.7M,5000,Free,0.0,Everyone,Art & Design,2018-06-06,1.0.0.0,4.2 and up
1553,Market Update Helper,LIBRARIES_AND_DEMO,4.1,20145,0.01M,1000000,Free,0.0,Everyone,Libraries & Demo,2013-02-12,1.0.0.0,1.5 and up


In [44]:
df['Current Ver'].isna().sum()

0

In [45]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


In [46]:
# Calculate the most common version per category
# most_common_version = df[df['Current Ver'] != 'Varies with device'].groupby('Category')['Current Ver'].agg(lambda x: x.mode()[0])
# # Replace 'Varies with device' with the most common version for the category
# df['Current Ver'] = df.apply(
#  lambda row: most_common_version[row['Category']] if row['Current Ver'] == 'Varies with device' else row['Current Ver'],
#  axis=1
# )

In [47]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


### Android Ver

In [48]:
df.loc[df['Android Ver'].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4453,[substratum] Vacuum: P,PERSONALIZATION,4.4,230,11M,1000,Paid,1.49,Everyone,Personalization,2018-07-20,4.4,
4490,Pi Dark [substratum],PERSONALIZATION,4.5,189,2.1M,10000,Free,0.0,Everyone,Personalization,2018-03-27,1.1,


In [49]:
# Rows without a minimum Android version get the placeholder '3.0 and up'.
df['Android Ver'] = df['Android Ver'].fillna('3.0 and up')

In [50]:
df['Android Ver'].isna().sum()

0

In [51]:
def extract_float(value):
    """Pull the first 'digits.digits' number out of a string.

    Returns:
        The first such number as a float (e.g. '4.0.3 and up' -> 4.0), or the
        original value when the string contains no number of that form.
    """
    found = re.search(r'\d+\.\d+', value)
    if found is None:
        # Nothing that looks like a decimal number: keep the text as it is.
        return value
    return float(found.group(0))

In [52]:
# Reduce each entry to its leading major.minor number; entries without one
# (such as 'Varies with device') keep their text.
df['Android Ver'] = df['Android Ver'].map(extract_float)

In [53]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4


In [54]:
df.loc[df['Android Ver'] == 'Varies with device']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
42,Textgram - write on photos,ART_AND_DESIGN,4.4,295221,Varies with device,10000000,Free,0.0,Everyone,Art & Design,2018-07-30,Varies with device,Varies with device
52,Used Cars and Trucks for Sale,AUTO_AND_VEHICLES,4.6,17057,Varies with device,1000000,Free,0.0,Everyone,Auto & Vehicles,2018-07-30,Varies with device,Varies with device
67,Ulysse Speedometer,AUTO_AND_VEHICLES,4.3,40211,Varies with device,5000000,Free,0.0,Everyone,Auto & Vehicles,2018-07-30,Varies with device,Varies with device
68,REPUVE,AUTO_AND_VEHICLES,3.9,356,Varies with device,100000,Free,0.0,Everyone,Auto & Vehicles,2018-05-25,Varies with device,Varies with device
85,CarMax – Cars for Sale: Search Used Car Inventory,AUTO_AND_VEHICLES,4.4,21777,Varies with device,1000000,Free,0.0,Everyone,Auto & Vehicles,2018-08-04,Varies with device,Varies with device
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10713,My Earthquake Alerts - US & Worldwide Earthquakes,WEATHER,4.4,3471,Varies with device,100000,Free,0.0,Everyone,Weather,2018-07-24,Varies with device,Varies with device
10765,Chat For Strangers - Video Chat,SOCIAL,3.4,622,Varies with device,100000,Free,0.0,Mature 17+,Social,2018-05-23,Varies with device,Varies with device
10826,Frim: get new friends on local chat rooms,SOCIAL,4.0,88486,Varies with device,5000000,Free,0.0,Mature 17+,Social,2018-03-23,Varies with device,Varies with device
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device


In [55]:
df.iloc[[139]]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
139,Wattpad 📖 Free Books,BOOKS_AND_REFERENCE,4.6,2914724,Varies with device,100000000,Free,0.0,Teen,Books & Reference,2018-08-01,Varies with device,Varies with device


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             10841 non-null  object        
 1   Category        10841 non-null  object        
 2   Rating          10841 non-null  float64       
 3   Reviews         10841 non-null  int32         
 4   Size            10841 non-null  object        
 5   Installs        10841 non-null  int32         
 6   Type            10841 non-null  object        
 7   Price           10841 non-null  float64       
 8   Content Rating  10841 non-null  object        
 9   Genres          10841 non-null  object        
 10  Last Updated    10841 non-null  datetime64[ns]
 11  Current Ver     10841 non-null  object        
 12  Android Ver     10841 non-null  object        
dtypes: datetime64[ns](1), float64(2), int32(2), object(8)
memory usage: 1016.5+ KB


In [59]:
# Write the cleaned frame to data/googleplaystore.csv, creating the folder if needed.
# NOTE(review): this is the same relative path the raw data was read from in the load
# cell, so the cleaned file takes the place of the raw one. If a later run loads the
# already-cleaned file, the hard-coded row 10472 shift would be applied a second
# time - confirm this is intended.
filepath = Path('data/googleplaystore.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the file.
df.to_csv(filepath, index=False)

# Commits

In [60]:
# Message handed to the versioning script as its only argument.
commit_message = "cleaned the data for backend"

# Run the PowerShell versioning script; check=True raises CalledProcessError if it
# exits with a non-zero status.
# NOTE(review): this needs a "powershell" executable on PATH (presumably Windows), and
# the script itself is not shown here - confirm what it commits and pushes.
subprocess.run(["powershell", "-File", "run_versioning_apps.ps1", commit_message], check=True)

CompletedProcess(args=['powershell', '-File', 'run_versioning_apps.ps1', 'cleaned the data for backend'], returncode=0)