Imports

In [5]:
import pandas as pd

Functions to get a pandas dataframe from the csv file and clean the data

In [6]:
def get_df(path="googleplaystore.csv"):
    """Load the raw Google Play Store csv and drop the known bad row.

    Parameters
    ----------
    path : str, path-like or file-like, optional
        Anything ``pd.read_csv`` accepts. Defaults to "googleplaystore.csv"
        in the current working directory.

    Returns
    -------
    pandas.DataFrame
        The csv contents without the row labelled 10472.

    Raises
    ------
    KeyError
        If the loaded data has no row labelled 10472.
    """
    raw = pd.read_csv(path)
    # Row 10472 was found by manual inspection to be a bad row.
    return raw.drop(index=10472)

def clean_df(df):
    """Return a cleaned copy of the raw dataframe, fixing columns in order.

    The input frame is not modified. App and Rating are kept as they are;
    every other column is re-typed or parsed as described inline.
    """
    cleaned = df.copy()

    def categorical(column):
        # Re-type one column of the working copy as a pandas category.
        return cleaned[column].astype("category")

    # App: nothing to do.

    # Category -> category dtype.
    cleaned["Category"] = categorical("Category")

    # Rating: nothing to do.

    # Reviews -> numeric.
    cleaned["Reviews"] = convert_reviews_to_int(cleaned["Reviews"])

    # Size -> numeric (values that cannot be parsed become NaN).
    cleaned["Size"] = size_convert_to_float(cleaned["Size"])

    # Installs -> numeric.
    cleaned["Installs"] = installs_convert_to_int(cleaned["Installs"])

    # Type -> category dtype.
    cleaned["Type"] = categorical("Type")

    # Price -> numeric.
    cleaned["Price"] = price_convert_to_float(cleaned["Price"])

    # Content Rating -> category dtype.
    cleaned["Content Rating"] = categorical("Content Rating")

    # Genres -> one list of genre names per row.
    cleaned["Genres"] = genres_convert_to_lists(cleaned["Genres"])

    # Last Updated -> datetime, parsed value by value.
    cleaned["Last Updated"] = cleaned["Last Updated"].apply(pd.to_datetime)

    # Current Ver -> category dtype.
    cleaned["Current Ver"] = categorical("Current Ver")

    # Android Ver -> category dtype.
    cleaned["Android Ver"] = categorical("Android Ver")

    return cleaned

def convert_reviews_to_int(reviews):
    """Convert the raw ``Reviews`` column to numbers.

    Values with a trailing "M" (for example "3.0M") are expanded to the
    integer they abbreviate (multiplied by 1,000,000). All other values are
    passed to ``pd.to_numeric`` unchanged.

    Parameters
    ----------
    reviews : pandas.Series
        Raw review counts, as strings and/or numbers.

    Returns
    -------
    pandas.Series
        A new numeric Series with the same index as ``reviews``; the input
        is not modified.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a number.
    """
    def expand_millions(value):
        # Only strings can carry the "M" suffix; endswith() is also safe on "".
        if isinstance(value, str) and value.endswith("M"):
            return int(float(value[:-1]) * 1000000)
        return value

    # map() converts value by value and keeps the index, so the result is
    # correct even when index labels differ from positions (for example
    # after a row has been dropped from the frame).
    return pd.to_numeric(reviews.map(expand_millions))

def size_turn_value_to_float(value):
    """Turn one raw ``Size`` cell into a float where its suffix is known.

    - "<number>M" -> number * 1,000,000
    - "<number>k" -> number * 1,000
    - "<number>+" -> number, with thousands separators (",") removed

    Anything else, including non-string values such as NaN and the empty
    string, is returned unchanged so the caller can decide how to treat it.

    Raises
    ------
    ValueError
        If the text before a recognised suffix is not a valid number.
    """
    # Non-strings (e.g. NaN for a missing cell) cannot be inspected for a suffix.
    if not isinstance(value, str):
        return value

    # endswith() is used instead of value[-1] so "" does not raise IndexError.
    if value.endswith("M"):
        return float(value[:-1]) * 1000000
    if value.endswith("k"):
        return float(value[:-1]) * 1000
    if value.endswith("+"):
        return float(value[:-1].replace(",", ""))
    return value

def size_convert_to_float(size):
    """Convert the ``Size`` column to numbers.

    Each value first goes through ``size_turn_value_to_float``; whatever is
    still not numeric afterwards is coerced to NaN. Returns a new Series.
    """
    parsed = size.copy().apply(size_turn_value_to_float)
    return parsed.apply(pd.to_numeric, errors="coerce")

def installs_convert_value_to_int(value):
    """Turn one raw ``Installs`` cell into an int where possible.

    - "<number>+" -> int, with thousands separators (",") removed
    - "0"         -> 0

    Anything else, including non-string values such as NaN and the empty
    string, is returned unchanged.

    Raises
    ------
    ValueError
        If the text before a trailing "+" is not a valid integer.
    """
    # Non-strings (e.g. NaN for a missing cell) have no suffix to inspect.
    if not isinstance(value, str):
        return value

    # endswith() is used instead of value[-1] so "" does not raise IndexError.
    if value.endswith("+"):
        return int(value[:-1].replace(",", ""))
    if value == "0":
        return int(value)
    return value

def installs_convert_to_int(installs):
    """Convert the ``Installs`` column to numbers.

    Each value first goes through ``installs_convert_value_to_int`` and is
    then passed to ``pd.to_numeric``. Returns a new Series.
    """
    parsed = installs.copy().apply(installs_convert_value_to_int)
    return parsed.apply(pd.to_numeric)

def price_convert_value_to_float(value):
    """Turn one raw ``Price`` cell into a float where possible.

    - "0"         -> 0.0
    - "$<number>" -> number

    Anything else, including non-string values such as NaN and the empty
    string, is returned unchanged.

    Raises
    ------
    ValueError
        If the text after a leading "$" is not a valid number.
    """
    # Non-strings (e.g. NaN for a missing cell) have no prefix to inspect.
    if not isinstance(value, str):
        return value

    if value == "0":
        return float(value)
    # startswith() is used instead of value[0] so "" does not raise IndexError.
    if value.startswith("$"):
        return float(value[1:])
    return value

def price_convert_to_float(price):
    """Convert the ``Price`` column to numbers.

    Each value first goes through ``price_convert_value_to_float`` and is
    then passed to ``pd.to_numeric``. Returns a new Series.
    """
    parsed = price.copy().apply(price_convert_value_to_float)
    return parsed.apply(pd.to_numeric)

def genres_seperate_doubles(value):
    """Split a genre string at its first ";" into a list of names.

    A string without ";" yields a one-element list; otherwise the result
    has exactly two elements: the text before and after the first ";".
    """
    # maxsplit=1 cuts only at the first separator.
    return value.split(";", 1)


def genres_convert_to_lists(genres):
    """Convert the ``Genres`` column so every value is a list of genre names.

    Each value is split by ``genres_seperate_doubles``. Returns a new Series.
    """
    return genres.copy().apply(genres_seperate_doubles)





Functions to create extra columns in the DataFrame for added functionality

In [19]:
def create_quality(df):
    """Create another column that represents the quality of an app.

    Quality is computed as ``Rating * Reviews * Installs`` for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric "Rating", "Reviews" and "Installs" columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added "Quality" column; ``df`` itself is
        not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    result = df.copy()
    # Column arithmetic replaces the row-by-row apply: same formula and
    # operand order, but it runs vectorised and also works on an empty frame.
    result["Quality"] = result["Rating"] * result["Reviews"] * result["Installs"]
    return result

def create_profit(df):
    """Create another column that represents the profit made by the app.

    Profit is computed as ``Installs * Price`` for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric "Installs" and "Price" columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added "Profit" column; ``df`` itself is
        not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    result = df.copy()
    # Column arithmetic replaces the row-by-row apply: same formula, but it
    # runs vectorised and also works on an empty frame.
    result["Profit"] = result["Installs"] * result["Price"]
    return result

def create_row_quality(row):
    """Return Rating * Reviews * Installs for a single row."""
    rating, reviews, installs = row["Rating"], row["Reviews"], row["Installs"]
    return rating * reviews * installs

def create_row_profit(row):
    """Return Installs * Price for a single row."""
    installs, price = row["Installs"], row["Price"]
    return installs * price


A function that runs the other functions in order: load the csv, clean it, and add the extra columns

In [None]:
def get_complete_df():
    """Run the full pipeline: load, clean, then add Profit and Quality."""
    return (
        get_df()
        .pipe(clean_df)
        .pipe(create_profit)
        .pipe(create_quality)
    )