In [None]:
import pandas as pd
from datetime import datetime


# Prepare the data before importing it into the database.
# Passing `chunksize` makes read_csv return an iterator of DataFrames, each
# holding at most `chunksize` rows, instead of one DataFrame for the whole file.
chunksize = 100_000
chunks = pd.read_csv(
    filepath_or_buffer="data/Google-Playstore.csv",
    chunksize=chunksize,
)

def convert_size(value):
    """Convert a human-readable size string into a plain number.

    Thousands separators (",") and surrounding whitespace are ignored. A
    trailing unit letter scales the number: ``k`` -> x1,000, ``M`` ->
    x1,000,000, ``G`` -> x1,000,000,000. The unit is matched
    case-insensitively and only as the final character.

    Args:
        value: Size as text, e.g. ``"1,024k"``, ``"10M"``, ``"2048"``.

    Returns:
        The size as a float, or ``None`` when the text contains
        ``"Varies with device"``. The text ``"nan"`` yields a float NaN
        because it is handed to ``float()`` unchanged.

    Raises:
        ValueError: If the numeric part cannot be parsed by ``float()``.
    """
    text = value.replace(",", "").strip()

    # Test the sentinel first so that none of its letters can ever be
    # mistaken for a unit suffix.
    if "Varies with device" in text:
        return None

    multipliers = {"g": 1_000_000_000, "m": 1_000_000, "k": 1_000}
    # Slicing (rather than indexing) keeps an empty string from raising
    # IndexError; it then falls through to float("") -> ValueError.
    suffix = text[-1:].lower()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]

    # No unit suffix: the text is already a bare number.
    return float(text)

def convert_dates(date):
    """Reformat a date like ``"Feb 26, 2020"`` as ``"26-02-2020"``.

    Args:
        date: Date text in ``"%b %d, %Y"`` form. ``None``, a float NaN, an
            empty or whitespace-only string, and the texts ``"nan"`` /
            ``"NaT"`` (any letter case) are all treated as missing.

    Returns:
        The date as a ``"%d-%m-%Y"`` string, or ``None`` for a missing value.

    Raises:
        ValueError: If a non-missing value does not match ``"%b %d, %Y"``.

    Note:
        ``%b`` matches the abbreviated month name of the current locale, so
        parsing English month names requires an English or C locale.
    """
    if date is None:
        return None

    # str() also turns a float NaN into "nan", so it is caught by the
    # missing-value check below instead of crashing inside strptime.
    text = str(date).strip()
    if text.lower() in ("", "nan", "nat"):
        return None

    return datetime.strptime(text, "%b %d, %Y").strftime("%d-%m-%Y")


# Flag columns to encode as "1"/"0". Defined once, outside the loop, because
# the list is the same for every chunk.
bool_columns = ["Free", "Ad Supported", "In App Purchases", "Editors Choice"]

# Clean each chunk independently and collect the results for one final concat.
df_list = []
for chunk in chunks:
    # bool -> int -> str yields the strings "1" and "0" without relying on
    # DataFrame.replace to downcast a bool column.
    # NOTE(review): astype(bool) turns missing values into True -- confirm
    # these columns never contain NaN.
    chunk[bool_columns] = chunk[bool_columns].astype(bool).astype(int).astype(str)

    # Remove "+" and "," so that e.g. "1,000+" becomes "1000" (kept as text).
    chunk["Installs"] = chunk["Installs"].astype(str).replace(r"[+,]", "", regex=True)

    # Values are stringified first because convert_size expects text.
    chunk["Size"] = chunk["Size"].astype(str).apply(convert_size)

    # Remove a "W" token and the "and up" suffix, e.g. "5.0 and up" -> "5.0".
    chunk["Minimum Android"] = (
        chunk["Minimum Android"]
        .astype(str)
        .replace(r"(W|and up)\b", "", regex=True)
        .str.strip()
    )

    # Missing dates become the text "nan" after astype(str), which
    # convert_dates maps to None.
    chunk["Released"] = chunk["Released"].astype(str).apply(convert_dates)
    chunk["Last Updated"] = chunk["Last Updated"].astype(str).apply(convert_dates)

    # Drop the unwanted column now so it is not held in memory for every
    # buffered chunk until the concat.
    chunk = chunk.drop(columns="Scraped Time")
    df_list.append(chunk)

df = pd.concat(df_list, ignore_index=True)
df.to_csv("data/Google-Playstore_cleaned.csv", index=False)
# Preview the first 50 rows (displayed as the cell output in a notebook).
df.head(50)