In [4]:
import pandas as pd
from datetime import datetime


#prepare data before importing to database
chunksize = 100000
chunks = pd.read_csv("data/Google-Playstore.csv",
                     chunksize=chunksize
                     )
row_sizes = [10000, 100000, 500000, 1000000]
new_dfs = {}

def convert_size(value):
    if pd.isna(value):
        return None
    value = value.replace(",", "")

    if "G" in value:
        return float(value.replace("G", "")) * 1_000_000_000
    elif "M" in value:
        return float(value.replace("M", "")) * 1_000_000  # Convert M to numeric
    elif "k" in value:
        return float(value.replace("k", "")) * 1_000  # Convert K to numeric
    elif "Varies with device" in value:
        return None  # Handle special case
    else:
        return float(value)  # Already numeric

def convert_dates(date):
    if pd.isna(date):
        return None
    try:
        date_obj = datetime.strptime(date, "%b %d, %Y")
        return date_obj.strftime("%d-%m-%Y")
    except ValueError:
        return None

df_list = []
for chunk in chunks:
    chunk.dropna(how='all', inplace=True)
    bool_columns = ["Free", "Ad Supported", "In App Purchases"]
    chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
    chunk["Installs"] = pd.to_numeric(chunk["Installs"], errors='coerce')
    chunk["Installs"] = chunk["Installs"].fillna("").astype(str)
    chunk["Size"] = chunk["Size"].astype(str).apply(convert_size)
    chunk["Minimum Android"] = chunk["Minimum Android"].astype(str).replace(r"(W|and up)\b", "", regex=True).str.strip()
    chunk["Released"] = chunk["Released"].astype(str).apply(convert_dates)
    chunk["Last Updated"] = chunk["Last Updated"].astype(str).apply(convert_dates)
    df_list.append(chunk)

df = pd.concat(df_list, ignore_index=True)
df.drop("Scraped Time", axis=1, inplace=True)
df.drop("Editors Choice", axis=1, inplace=True)

start_index = 0
for size in row_sizes:
    end_index = start_index + size
    new_df = df.iloc[start_index:end_index]
    new_dfs[f"df_{size}_rows"] = new_df
    start_index = end_index

    output_filename = f"data/Google-Playstore_cleaned_{size}_rows.csv"
    new_df.to_csv(output_filename, index=False)
    print(f"Saved {output_filename} with {len(new_df)} rows")


  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns] = chunk[bool_columns].astype(bool).replace({True: 1, False: 0}).astype(str)
  chunk[bool_columns

Saved data/Google-Playstore_cleaned_10000_rows.csv with 10000 rows
Saved data/Google-Playstore_cleaned_100000_rows.csv with 100000 rows
Saved data/Google-Playstore_cleaned_500000_rows.csv with 500000 rows
Saved data/Google-Playstore_cleaned_1000000_rows.csv with 1000000 rows
