In [1]:
import logging
import pandas as pd
import plotly.express as px


def setup_logging():
    """Configure the root logger to write INFO-and-above records to '76-ipynb.log'.

    The function is called at the top of every notebook cell, so it must be
    safe to call repeatedly. Only the first call (the one that finds the root
    logger without handlers) attaches the file handler and logs the
    "Logging setup complete." message; later calls return immediately.
    """
    root_logger = logging.getLogger()
    if root_logger.handlers:
        # basicConfig() does nothing once the root logger has handlers, but its
        # arguments are evaluated before the call. Building the FileHandler
        # anyway would open the log file again for a handler that is never
        # attached or closed, so bail out before constructing it.
        return

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s %(funcName)s | %(message)s',
        handlers=[
            logging.FileHandler('76-ipynb.log')
        ]
    )

    logging.info("Logging setup complete.")


def fetch_df(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame, logging the outcome.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError,
        or any other exception from ``pd.read_csv``. Each is logged at ERROR
        level with a category-specific message and then re-raised unchanged.
    """
    try:
        loaded = pd.read_csv(file_path)
    except Exception as exc:
        # Choose the message for the failure category; the checks run in the
        # same order the dedicated handlers used to be tried.
        if isinstance(exc, FileNotFoundError):
            failure_msg = f"File not found: {file_path}. Error: {exc}"
        elif isinstance(exc, pd.errors.EmptyDataError):
            failure_msg = f"File is empty: {file_path}. Error: {exc}"
        elif isinstance(exc, pd.errors.ParserError):
            failure_msg = f"Error parsing file: {file_path}. Error: {exc}"
        else:
            failure_msg = f"An unexpected error occurred while reading {file_path}. Error: {exc}"
        logging.error(failure_msg)
        raise

    logging.info(f"DataFrame loaded successfully from {file_path}.")
    return loaded
    

def log_df_info(df: pd.DataFrame) -> None:
    """Print a four-line summary of ``df`` to stdout.

    The lines report, in order: the frame's shape, its column names as a list,
    whether any cell is NaN, and whether any row is a full duplicate of an
    earlier row. Despite the name, output goes through ``print`` and not
    through the ``logging`` module.
    """
    # Each entry is evaluated only when its line is printed, so a failure in a
    # later check still leaves the earlier lines on screen.
    checks = (
        ("Shape", lambda: df.shape),
        ("Columns", lambda: df.columns.tolist()),
        ("NaN", lambda: df.isna().values.any()),
        ("Duplicate", lambda: df.duplicated().any()),
    )
    for label, compute in checks:
        print(f"{label}: {compute()}")
    

# Data Cleaning: Removing NaN Values and Duplicates

In [2]:
def return_cleaned_df(file_path: str = 'input-76.csv') -> pd.DataFrame:
    """Load the app dataset and return it with unused columns, NaNs and duplicates removed.

    Args:
        file_path: CSV file to load via ``fetch_df``. Defaults to
            'input-76.csv', the file this notebook analyses, so existing
            zero-argument calls keep working.

    Returns:
        A new DataFrame without the 'Last_Updated' and 'Android_Ver' columns,
        without rows containing NaN in any remaining column, and with only the
        first row kept for each ('App', 'Category', 'Type', 'Price')
        combination.

    Raises:
        Whatever ``fetch_df`` raises when the file cannot be read.
    """
    raw_df = fetch_df(file_path)
    return (
        raw_df
        # These two columns are not needed for the analysis; `columns=` alone
        # selects the axis, so no separate `axis` argument is passed.
        .drop(columns=['Last_Updated', 'Android_Ver'])
        .dropna()
        .drop_duplicates(subset=['App', 'Category', 'Type', 'Price'])
    )

# Preliminary Exploration: The Highest Ratings, Most Reviews, and Largest Size

In [3]:
if __name__ == "__main__":
    setup_logging()

    app_df = return_cleaned_df()

    # Each ranking sorts descending on a single column and shows the first five rows.
    print(f'Top 5 highest rated apps:\n{app_df.sort_values(by="Rating", ascending=False).head(5)}')
    print(f'Top 5 largest apps by size:\n{app_df.sort_values(by="Size_MBs", ascending=False).head(5)}')
    # head(5) rather than head(50): the heading promises a top-5 list, and
    # printing 50 wide rows floods the cell output.
    print(f'Top 5 highest reviews app:\n{app_df.sort_values(by="Reviews", ascending=False).head(5)}')

Top 5 highest rated apps:
                                                 App       Category  Rating  \
21                               KBA-EZ Health Guide        MEDICAL     5.0   
99   Anatomy & Physiology Vocabulary Exam Review App        MEDICAL     5.0   
126                                  Tablet Reminder        MEDICAL     5.0   
181                                     EJ messenger  COMMUNICATION     5.0   
179                                 Bh Public School         FAMILY     5.0   

     Reviews  Size_MBs Installs  Type Price Content_Rating         Genres  
21         4      25.0        1  Free     0       Everyone        Medical  
99         1       4.6        5  Free     0       Everyone        Medical  
126        4       2.5        5  Free     0       Everyone        Medical  
181        1      25.0       10  Free     0           Teen  Communication  
179        2       8.7       10  Free     0       Everyone      Education  
Top 5 largest apps by size:
               

# Data Visualisation with Plotly: Create Pie and Donut Charts

In [4]:
if __name__ == "__main__":
    setup_logging()

    app_df = return_cleaned_df()

    # Number of apps per content-rating value.
    ratings = app_df.Content_Rating.value_counts()

    # `names` supplies the slice labels and `values` the slice sizes.
    # Plotly Express' `labels` argument is a dict that renames columns for
    # display, not a list of slice labels, so it is not passed here.
    fig = px.pie(
        values=ratings.values,
        names=ratings.index,
        title='Content Rating',
        hole=0.5  # a non-zero hole turns the pie into a donut
    )
    fig.update_traces(
        textposition='outside',
        textinfo='percent+label'
    )

    fig.show()

# Numeric Type Conversions for the Installations & Price Data

In [5]:
if __name__ == "__main__":
    setup_logging()

    app_df = return_cleaned_df()
    # Inspect both columns before converting them to numbers.
    print(f'Column type:\n{app_df.Installs.describe()}')
    print(f'Column type:\n{app_df.Price.describe()}')

    # Strip '$' / '+' and thousands separators, then cast to numbers.
    # Raw strings keep the backslashes as regex escapes; in a normal string
    # literal '\$' and '\+' are invalid escape sequences and trigger a warning.
    app_df.Price = app_df.Price.replace(r'[\$,]', '', regex=True).astype(float)
    app_df.Installs = app_df.Installs.replace(r'[\+,]', '', regex=True).astype(int)

    # Keep only apps priced below 250. The explicit copy makes the filtered
    # frame independent, so adding a column below does not write into a slice
    # of the unfiltered frame.
    app_df = app_df[app_df.Price < 250].copy()
    print(app_df.shape)

    app_df['Revenue_Estimate'] = app_df.Installs * app_df.Price
    # The ranking gets its own name so `app_df` still refers to the full,
    # filtered dataset afterwards.
    top10_revenue = app_df.sort_values(by='Revenue_Estimate', ascending=False).head(10)
    print(f'Top 10 apps by revenue:\n{top10_revenue}')
    print(f'Games among top 10 highest revenue apps:\n{top10_revenue[top10_revenue.Category == "GAME"]}')


Column type:
count          8282
unique           19
top       1,000,000
freq           1428
Name: Installs, dtype: object
Column type:
count     8282
unique      73
top          0
freq      7676
Name: Price, dtype: object
(8267, 10)
Top 10 apps by revenue:
                                App     Category  Rating  Reviews   Size_MBs  \
9220                      Minecraft       FAMILY     4.5  2376564  19.000000   
8825                  Hitman Sniper         GAME     4.6   408292  29.000000   
7151  Grand Theft Auto: San Andreas         GAME     4.4   348962  26.000000   
7477            Facetune - For Free  PHOTOGRAPHY     4.4    49553  48.000000   
7977        Sleep as Android Unlock    LIFESTYLE     4.5    23966   0.851562   
6594            DraStic DS Emulator         GAME     4.6    87766  12.000000   
6082                   Weather Live      WEATHER     4.5    76593   4.750000   
6746     Card Wars - Adventure Time       FAMILY     4.3   129603  23.000000   
7954                  

# Plotly Bar Charts & Scatter Plots: The Most Competitive & Popular App Categories

In [9]:
if __name__ == "__main__":
    setup_logging()

    app_df = return_cleaned_df()
    # Raw string: '\+' in a normal string literal is an invalid escape sequence.
    app_df.Installs = app_df.Installs.replace(r'[\+,]', '', regex=True).astype(int)

    # --- Ten categories with the most apps -------------------------------
    # Name both columns explicitly so the chart does not depend on how pandas
    # names the value_counts() index and values.
    top10_categories = (
        app_df.Category.value_counts()
        .head(10)
        .rename_axis('Category')
        .reset_index(name='App_Count')
    )
    bar_fig = px.bar(
        data_frame=top10_categories,
        x='Category',
        y='App_Count',
        title='Top 10 Categories by App Count',
        labels={'App_Count': 'App Count'}
    )
    bar_fig.show()

    # --- Total installs per category --------------------------------------
    # Stays indexed by Category because the merge further down joins on it.
    category_installs = (
        app_df.groupby('Category')
        .agg({'Installs': 'sum'})
        .sort_values(by='Installs', ascending=True)
    )
    # Columns are referenced by name and `labels` is keyed by column name.
    # Keys 'x'/'y' only apply to unnamed array inputs, so the original
    # 'Total Installs' label was ignored for the named Series.
    h_bar_fig = px.bar(
        data_frame=category_installs.reset_index(),
        x='Installs',
        y='Category',
        orientation='h',
        title='Total Installs by Category',
        labels={'Installs': 'Total Installs'}
    )
    h_bar_fig.show()

    # --- App count vs. total installs per category -------------------------
    cat_count_install = pd.merge(
        left=app_df.groupby('Category').agg({'App': 'count'}),
        right=category_installs,
        on='Category',
        how='inner'
    ).sort_values(by='Installs', ascending=False).reset_index()

    scatter_fig = px.scatter(
        data_frame=cat_count_install,
        x='App',
        y='Installs',
        title='Apps vs Installs by Category',
        size='App',
        color='Installs',
        hover_data=['Category']
    )
    scatter_fig.update_layout(
        xaxis_title="Number of Apps (Lower=More Concentrated)",
        yaxis_title="Installs",
        yaxis=dict(type='log')  # log-scaled y axis for the install totals
    )

    scatter_fig.show()

# Extracting Nested Column Data using .stack()

In [36]:
if __name__ == "__main__":
    setup_logging()

    app_df = return_cleaned_df()

    # A Genres cell may hold several ';'-separated genres. Splitting with
    # expand=True spreads them over columns and .stack() folds them back into
    # one Series with one genre per entry.
    stack = app_df.Genres.str.split(';', expand=True).stack()

    # Name both columns explicitly. The auto-generated names from
    # value_counts().reset_index() ('index' / 'count') differ between pandas
    # versions, so relying on them makes the chart fragile.
    num_genres = (
        stack.value_counts()
        .head(15)
        .rename_axis('Genre')
        .reset_index(name='App_Count')
    )

    bar_fig = px.bar(
        data_frame=num_genres,
        x='Genre',
        y='App_Count',
        title='Top 15 Genres by App Count',
        hover_name='Genre',
        color='App_Count',
        color_continuous_scale='Agsunset',
        labels={'App_Count': 'App Count'}
    )
    # The bar height already encodes the count, so the colour scale legend is hidden.
    bar_fig.update_layout(coloraxis_showscale=False)
    bar_fig.show()

# Grouped Bar Charts and Box Plots with Plotly

In [57]:
if __name__ == "__main__":
    setup_logging()

    app_df = return_cleaned_df()
    # Raw strings: '\+' and '\$' in normal string literals are invalid escape
    # sequences; the regex patterns themselves are unchanged.
    app_df.Installs = app_df.Installs.replace(r'[\+,]', '', regex=True).astype(int)
    app_df.Price = app_df.Price.replace(r'[\$,]', '', regex=True).astype(float)
    print(f'Type count: {app_df.Type.value_counts()}')

    # --- Free vs paid app counts per category ------------------------------
    df_free_vs_paid = (
        app_df.groupby(['Category', 'Type'], as_index=False)
        .agg({'App': 'count'})
        .sort_values(by='App', ascending=False)
    )

    bar_fig = px.bar(
        data_frame=df_free_vs_paid,
        x='Category',
        y='App',
        color='Type',
        title='Free vs Paid Apps by Category',
        labels={'App': 'App Count'},
        barmode='group'
    )
    bar_fig.update_layout(
        xaxis={'categoryorder': 'total descending'},
        yaxis=dict(type='log'),
        xaxis_tickangle=45,
        legend_title_text='Type'
    )
    bar_fig.show()

    # --- Install distribution, free vs paid ---------------------------------
    box_fig = px.box(
        data_frame=app_df,
        x='Type',
        y='Installs',
        color='Type',
        notched=True,
        points='all',
        title='Installs Distribution by Type'
    )
    box_fig.update_layout(
        yaxis=dict(type='log'),
        xaxis_tickangle=45
    )
    box_fig.show()

    # --- Paid apps: estimated revenue and price per category ----------------
    # Paid apps priced below 250. The explicit copy makes the subset
    # independent, so adding a column does not write into a slice of app_df.
    paid_apps = app_df[(app_df.Type == 'Paid') & (app_df.Price < 250)].copy()
    paid_apps['Revenue_Estimate'] = paid_apps.Installs * paid_apps.Price

    box_fig_paid = px.box(
        data_frame=paid_apps,
        x='Category',
        y='Revenue_Estimate',
        title='How Much Can Paid Apps Earn?'
    )
    box_fig_paid.update_layout(
        yaxis=dict(type='log'),
        xaxis={'categoryorder': 'min ascending'},
        xaxis_tickangle=45
    )
    box_fig_paid.show()

    box_fig_paid2 = px.box(
        data_frame=paid_apps,
        x='Category',
        y='Price',
        title='Price per Category'
    )
    box_fig_paid2.update_layout(
        yaxis=dict(type='log'),
        xaxis={'categoryorder': 'max descending'},
        xaxis_tickangle=45
    )
    box_fig_paid2.show()

Type count: Type
Free    7676
Paid     606
Name: count, dtype: int64
