In [3]:
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [4]:
pd.options.display.float_format = "{:,.2f}".format

In [5]:
df = pd.read_csv("apps.csv")

In [6]:
df.drop(["Android_Ver","Last_Updated"],axis=1,inplace=True)

# Data Cleaning

In [7]:
df_clean = df.dropna()

In [8]:
df_clean.shape

(9367, 10)

In [9]:
df_clean = df_clean.drop_duplicates(subset=["App","Price","Type"])

In [10]:
df_clean.shape

(8199, 10)

# Find Highest Rated Apps


In [11]:
df_highest_rating = df_clean.sort_values("Rating",ascending=False,ignore_index=True,)

In [12]:
df_highest_rating


Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,KBA-EZ Health Guide,MEDICAL,5.00,4,25.00,1,Free,0,Everyone,Medical
1,Sway Medical,MEDICAL,5.00,3,22.00,100,Free,0,Everyone,Medical
2,AJ Men's Grooming,LIFESTYLE,5.00,2,22.00,100,Free,0,Everyone,Lifestyle
3,FK Dedinje BGD,SPORTS,5.00,36,2.60,100,Free,0,Everyone,Sports
4,CB VIDEO VISION,PHOTOGRAPHY,5.00,13,2.60,100,Free,0,Everyone,Photography
...,...,...,...,...,...,...,...,...,...,...
8194,CR Magazine,BUSINESS,1.00,1,7.80,100,Free,0,Everyone,Business
8195,FE Mechanical Engineering Prep,FAMILY,1.00,2,21.00,1000,Free,0,Everyone,Education
8196,Speech Therapy: F,FAMILY,1.00,1,16.00,10,Paid,$2.99,Everyone,Education
8197,Familial Hypercholesterolaemia Handbook,MEDICAL,1.00,2,33.00,100,Free,0,Everyone,Medical


In [13]:
df_highest_size =df_clean.sort_values("Size_MBs",ascending=False,ignore_index=True)

In [14]:
df_highest_size

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,Talking Babsy Baby: Baby Games,LIFESTYLE,4.00,140995,100.00,10000000,Free,0,Everyone,Lifestyle;Pretend Play
1,Hungry Shark Evolution,GAME,4.50,6074334,100.00,100000000,Free,0,Teen,Arcade
2,Miami crime simulator,GAME,4.00,254518,100.00,10000000,Free,0,Mature 17+,Action
3,Gangster Town: Vice District,FAMILY,4.30,65146,100.00,10000000,Free,0,Mature 17+,Simulation
4,Vi Trainer,HEALTH_AND_FITNESS,3.60,124,100.00,5000,Free,0,Everyone,Health & Fitness
...,...,...,...,...,...,...,...,...,...,...
8194,Ad Remove Plugin for App2SD,PRODUCTIVITY,4.10,66,0.02,1000,Paid,$1.29,Everyone,Productivity
8195,ExDialer PRO Key,COMMUNICATION,4.50,5474,0.02,100000,Paid,$3.99,Everyone,Communication
8196,My baby firework (Remove ad),FAMILY,4.10,30,0.01,1000,Paid,$0.99,Everyone,Entertainment
8197,Market Update Helper,LIBRARIES_AND_DEMO,4.10,20145,0.01,1000000,Free,0,Everyone,Libraries & Demo


In [15]:
df_highest_reviews = df_clean.sort_values("Reviews",ascending=False,ignore_index=True)

In [16]:
df_highest_reviews

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,Facebook,SOCIAL,4.10,78158306,5.30,1000000000,Free,0,Teen,Social
1,WhatsApp Messenger,COMMUNICATION,4.40,69119316,3.50,1000000000,Free,0,Everyone,Communication
2,Instagram,SOCIAL,4.50,66577313,5.30,1000000000,Free,0,Teen,Social
3,Messenger – Text and Video Chat for Free,COMMUNICATION,4.00,56642847,3.50,1000000000,Free,0,Everyone,Communication
4,Clash of Clans,GAME,4.60,44891723,98.00,100000000,Free,0,Everyone 10+,Strategy
...,...,...,...,...,...,...,...,...,...,...
8194,Wowkwis aq Ka'qaquj,FAMILY,5.00,1,49.00,10,Free,0,Everyone,Education;Education
8195,CB Fit,HEALTH_AND_FITNESS,5.00,1,7.80,10,Free,0,Everyone,Health & Fitness
8196,ES Billing System (Offline App),PRODUCTIVITY,5.00,1,4.20,100,Free,0,Everyone,Productivity
8197,Ek Kahani Aisi Bhi Season 3 - The Horror Story,FAMILY,3.00,1,5.80,100,Free,0,Teen,Entertainment


In [17]:
content_rating = df_clean.Content_Rating.value_counts()


In [18]:
content_rating_df = pd.DataFrame({'Content_Rating': content_rating.index, 'Count': content_rating.values})

fig = px.pie(
    content_rating_df,
    values='Count',
    names='Content_Rating',
    title='Content Rating Distribution',
    labels=content_rating_df.index,
    hole=0.6,
    color_discrete_sequence=px.colors.sequential.Plasma
    
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',  # Set plot background color to transparent
    paper_bgcolor='rgb(0, 0, 0)',     # Set paper (the outer background) color to dark
)


fig.show()


In [19]:
df_clean.Installs = pd.to_numeric(df_clean.Installs.astype(str).str.replace(",",""))
df_intalls =df_clean[["App","Installs"]].groupby("Installs").count()

In [20]:
df_intalls = df_intalls.sort_values(by="App",ascending=False)
df_intalls.index = pd.Series(df_intalls.index).apply(lambda x: "+" + str(x) if x > 10000 else str(x))


In [21]:
df_installs =df_clean[["App","Installs"]]
df_installs = df_installs.sort_values(by="Installs",ascending=False)
df_installs["Installs"] = df_installs["Installs"].apply(lambda x: "+" + str(x) if x >= 10000 else str(x))


In [22]:
df_intalls

Unnamed: 0_level_0,App
Installs,Unnamed: 1_level_1
1000000,1417
100000,1096
10000,988
10000000,933
1000,698
5000000,607
500000,504
50000,457
5000,425
100,303


In [23]:
fig =px.pie(
    df_intalls,
    values=df_intalls.App,
    names=df_intalls.index,
)
fig.update_traces(textposition='inside', textinfo='percent+label')


fig.show()

In [24]:
df_clean.Price = pd.to_numeric(df_clean.Price.astype(str).str.replace("$","",regex=False))

In [25]:
df_clean.sort_values(by="Price",ascending=False,ignore_index=True)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,I'm Rich - Trump Edition,LIFESTYLE,3.60,275,7.30,10000,Paid,400.00,Everyone,Lifestyle
1,I AM RICH PRO PLUS,FINANCE,4.00,36,41.00,1000,Paid,399.99,Everyone,Finance
2,I Am Rich Premium,FINANCE,4.10,1867,4.70,50000,Paid,399.99,Everyone,Finance
3,I am rich(premium),FINANCE,3.50,472,0.94,5000,Paid,399.99,Everyone,Finance
4,💎 I'm rich,LIFESTYLE,3.80,718,26.00,10000,Paid,399.99,Everyone,Lifestyle
...,...,...,...,...,...,...,...,...,...,...
8194,myAir™ for Air10™ by ResMed,MEDICAL,3.70,236,18.00,50000,Free,0.00,Everyone,Medical
8195,AK Math Coach,FAMILY,3.60,283,18.00,50000,Free,0.00,Everyone,Education
8196,Forgotten Hill: Fall,GAME,4.40,1063,18.00,50000,Free,0.00,Teen,Adventure
8197,AE Video Poker,GAME,4.00,721,18.00,50000,Free,0.00,Teen,Casino


In [26]:
df_highest_revenue = df_clean.sort_values(by="Revenue",ascending=False)

KeyError: 'Revenue'

In [None]:
df_highest_revenue

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Revenue
9220,Minecraft,FAMILY,4.50,2376564,19.00,10000000,Paid,6.99,Everyone 10+,Arcade;Action & Adventure,69900000.00
5765,I am rich,LIFESTYLE,3.80,3547,1.80,100000,Paid,399.99,Everyone,Lifestyle,39999000.00
4606,I Am Rich Premium,FINANCE,4.10,1867,4.70,50000,Paid,399.99,Everyone,Finance,19999500.00
8825,Hitman Sniper,GAME,4.60,408292,29.00,10000000,Paid,0.99,Mature 17+,Action,9900000.00
7151,Grand Theft Auto: San Andreas,GAME,4.40,348962,26.00,1000000,Paid,6.99,Mature 17+,Action,6990000.00
...,...,...,...,...,...,...,...,...,...,...,...
4508,myAir™ for Air10™ by ResMed,MEDICAL,3.70,236,18.00,50000,Free,0.00,Everyone,Medical,0.00
4507,AK Math Coach,FAMILY,3.60,283,18.00,50000,Free,0.00,Everyone,Education,0.00
4506,Forgotten Hill: Fall,GAME,4.40,1063,18.00,50000,Free,0.00,Teen,Adventure,0.00
4505,AE Video Poker,GAME,4.00,721,18.00,50000,Free,0.00,Teen,Casino,0.00


In [None]:
df_clean.Category.nunique()
df_clean.shape

(8199, 11)

In [None]:
top_categories= df_clean.Category.value_counts()

In [None]:
top_categories

FAMILY                 1610
GAME                    910
TOOLS                   719
FINANCE                 302
LIFESTYLE               302
PRODUCTIVITY            301
PERSONALIZATION         298
MEDICAL                 292
PHOTOGRAPHY             263
BUSINESS                262
SPORTS                  260
COMMUNICATION           257
HEALTH_AND_FITNESS      243
NEWS_AND_MAGAZINES      204
SOCIAL                  203
TRAVEL_AND_LOCAL        187
SHOPPING                180
BOOKS_AND_REFERENCE     169
VIDEO_PLAYERS           148
DATING                  134
EDUCATION               118
MAPS_AND_NAVIGATION     118
ENTERTAINMENT           102
FOOD_AND_DRINK           94
AUTO_AND_VEHICLES        73
WEATHER                  72
LIBRARIES_AND_DEMO       64
HOUSE_AND_HOME           62
ART_AND_DESIGN           61
COMICS                   54
PARENTING                50
EVENTS                   45
BEAUTY                   42
Name: Category, dtype: int64

In [None]:
bar = px.bar(x=top_categories.index,y=top_categories.values)

In [None]:
bar.show()

In [None]:
top_categories_with_highes_dowload = df_clean.groupby("Category").agg({"Installs":pd.Series.sum})
top_categories_with_highes_dowload.sort_values("Installs",ascending=False,inplace=True)
top_categories_with_highes_dowload

Unnamed: 0_level_0,Installs
Category,Unnamed: 1_level_1
GAME,13858762717
COMMUNICATION,11039241530
TOOLS,8099724500
PRODUCTIVITY,5788070180
SOCIAL,5487841475
PHOTOGRAPHY,4649143130
FAMILY,4437579590
VIDEO_PLAYERS,3916897200
TRAVEL_AND_LOCAL,2894859300
NEWS_AND_MAGAZINES,2369110650


In [None]:
horizontal_bar = px.bar(x=top_categories_with_highes_dowload.Installs,
                        y=top_categories_with_highes_dowload.index,
                        orientation="h")

horizontal_bar.update_layout(xaxis_title ="Number of Dowloads",
                             yaxis_title="Category"
)
horizontal_bar.show()

# Colour Scales in Plotly Charts and  Extracting Nested Data from a Column


In [None]:
genres = df_clean.Genres.str.split(";",expand=True).stack()
num_genres = genres.value_counts()
num_genres

Tools                      719
Education                  587
Entertainment              502
Action                     304
Lifestyle                  303
Finance                    302
Productivity               301
Personalization            298
Medical                    292
Sports                     270
Photography                263
Business                   262
Communication              258
Health & Fitness           245
Casual                     216
News & Magazines           204
Social                     203
Simulation                 200
Travel & Local             187
Arcade                     185
Shopping                   180
Books & Reference          171
Video Players & Editors    150
Dating                     134
Puzzle                     124
Maps & Navigation          118
Role Playing               111
Racing                     103
Action & Adventure          96
Strategy                    95
Food & Drink                94
Educational                 93
Adventur

In [None]:
color_bar= px.bar(x=num_genres.index,
                  y=num_genres.values,
                  color=num_genres.values,
                  hover_name=num_genres.index,
                  color_continuous_scale="Agsunset")

color_bar.update_layout(xaxis_title="Genre",
                  yaxis_title ="Number of Aoos",
                  coloraxis_showscale=True
                  )

color_bar.show()

# Grouped Bar Charts: Free vs. Paid Apps per Category

In [None]:
df_free_vs_paid = df_clean.groupby(["Category","Type"],as_index=False).agg({"App":pd.Series.count})

In [None]:

df_free_vs_paid.sort_values("App",ascending=False)

Unnamed: 0,Category,Type,App
19,FAMILY,Free,1456
25,GAME,Free,834
53,TOOLS,Free,656
21,FINANCE,Free,289
31,LIFESTYLE,Free,284
...,...,...,...
17,ENTERTAINMENT,Paid,2
24,FOOD_AND_DRINK,Paid,2
40,PARENTING,Paid,2
38,NEWS_AND_MAGAZINES,Paid,2


In [None]:
g_bar = px.bar(df_free_vs_paid,
               x="Category",
               y="App",
               color="Type",
               title="Free vs Paid Apps by Category",
               barmode="group")

g_bar.update_layout(xaxis_title='Category',
                    yaxis_title='Number of Apps',
                    xaxis={'categoryorder':'total descending'},
                    yaxis=dict(type='log'))

g_bar.show()


# Plotly Box Plots: Lost Downloads for Paid Apps


In [None]:
box_plot = px.box(df_clean,
                  y="Installs",
                  x="Type",
                  color="Type",
                  notched=True,
                  points="all",
                  title= "Me")

box_plot.update_layout(yaxis=dict(type='log'))
box_plot.show()

# Plotly Box Plots: Revenue by App Category


In [None]:
df_paid_apps = df_clean[df_clean['Type'] == 'Paid']
box = px.box(df_paid_apps,
             x='Category',
             y='Price',
             title='How Much Can Paid Apps Earn?')

box.update_layout(xaxis_title='Category',
                  yaxis_title='Paid App Price',
                  xaxis={'categoryorder':'max descending'},
                  yaxis=dict(type='log'))

box.show()

In [27]:
df_clean.sort_values("Size_MBs",ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
9942,Talking Babsy Baby: Baby Games,LIFESTYLE,4.00,140995,100.00,10000000,Free,0.00,Everyone,Lifestyle;Pretend Play
10687,Hungry Shark Evolution,GAME,4.50,6074334,100.00,100000000,Free,0.00,Teen,Arcade
9943,Miami crime simulator,GAME,4.00,254518,100.00,10000000,Free,0.00,Mature 17+,Action
9944,Gangster Town: Vice District,FAMILY,4.30,65146,100.00,10000000,Free,0.00,Mature 17+,Simulation
3144,Vi Trainer,HEALTH_AND_FITNESS,3.60,124,100.00,5000,Free,0.00,Everyone,Health & Fitness
...,...,...,...,...,...,...,...,...,...,...
2648,Ad Remove Plugin for App2SD,PRODUCTIVITY,4.10,66,0.02,1000,Paid,1.29,Everyone,Productivity
5798,ExDialer PRO Key,COMMUNICATION,4.50,5474,0.02,100000,Paid,3.99,Everyone,Communication
2684,My baby firework (Remove ad),FAMILY,4.10,30,0.01,1000,Paid,0.99,Everyone,Entertainment
7966,Market Update Helper,LIBRARIES_AND_DEMO,4.10,20145,0.01,1000000,Free,0.00,Everyone,Libraries & Demo
