In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import linregress
from pprint import pprint

In [2]:
# Load the raw Play Store export and keep only the columns used in this analysis.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
data_df = pd.read_csv("Google-Playstore-Full.csv", low_memory=False)
data_df = data_df[["App Name", "Category", "Rating", "Reviews", "Installs", "Size", "Price", "Content Rating"]]
# dropna() returns a new frame; without assigning the result the call has no effect.
data_df = data_df.dropna()
# "App_Name" (no space) allows attribute access (data_df.App_Name) in the next cell.
data_df = data_df.rename(columns={"App Name": "App_Name"})
data_df

FileNotFoundError: [Errno 2] File b'Google-Playstore-Full.csv' does not exist: b'Google-Playstore-Full.csv'

DATA CLEANING

In [None]:
#clean raw data
# Drop rows whose app name contains a literal "?" (regex is off, so "?" is a
# plain character); rows with a missing name are kept because na=False.
has_question_mark = data_df["App_Name"].str.contains("?", na=False, regex=False)
data_df = data_df.loc[~has_question_mark]

In [None]:
# clean price
# Rows are discarded when the Price text contains "M" or "+", or equals
# 'Varies with device'; rows with a missing price are kept (na=False).
price_text = data_df["Price"]
bad_price = (
    price_text.str.contains("M", na=False, regex=False)
    | price_text.str.contains("+", na=False, regex=False)
    | (price_text == 'Varies with device')
)
# .copy() gives an independent frame, so the column assignment below neither
# touches data_df nor raises SettingWithCopyWarning.
cleaned_df = data_df[~bad_price].copy()
# regex=False: "$" has to be removed as a literal character. As a regular
# expression it is the end-of-string anchor, and the default value of `regex`
# differs between pandas versions.
cleaned_df["Price"] = cleaned_df["Price"].str.replace("$", "", regex=False).astype(float)

In [None]:
#clean install number
# Drop the rows whose Installs field holds one of the two known junk values.
cleaned_df = cleaned_df[~cleaned_df["Installs"].isin(['EDUCATION', ' Xmax X'])]
# Build the numeric column in one chain and attach it with assign(), which
# returns a new frame; assigning into the filtered frame column by column
# raises SettingWithCopyWarning.
cleaned_df = cleaned_df.assign(
    Installs=(
        cleaned_df["Installs"]
        .str.split("+").str[0]          # keep the text before the first "+"
        .replace(',', "", regex=True)   # strip thousands separators
        .astype(float)
    )
)

In [None]:
# clean rating
# Drop the rows whose Rating field holds a category label instead of a number.
cleaned_df = cleaned_df[~cleaned_df["Rating"].isin(['NEWS_AND_MAGAZINES', 'GAME_STRATEGY'])]
# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# assigning a column into the filtered frame would raise.
cleaned_df = cleaned_df.assign(Rating=cleaned_df["Rating"].astype(float))
cleaned_df.head()

In [None]:
#Clean Category
# Replace every underscore in the category labels with a space.
cleaned_df["Category"] = cleaned_df["Category"].replace(to_replace='_', value=' ', regex=True)

In [None]:
#Clean Reviews
# Cast the review counts to 64-bit floats so they can be summed and plotted.
cleaned_df["Reviews"] = cleaned_df["Reviews"].astype("float64")

In [None]:
#finish data cleaning
# Restore the "App Name" column label that the rest of the notebook uses.
cleaned_df = cleaned_df.rename(columns={"App_Name": "App Name"})

# Categories that occur at most once are removed from the frame.
value_counts = cleaned_df["Category"].value_counts()
to_remove = value_counts[value_counts <= 1].index
in_rare_category = cleaned_df["Category"].isin(to_remove)
cleaned_df = cleaned_df.loc[~in_rare_category]

In [None]:
# cleaned_df1 is for general categories
# Every game sub-category is collapsed into the single category "GAME".
game_subcategories = [
    "GAME PUZZLE",
    "GAME CASUAL",
    "GAME ARCADE",
    "GAME ACTION",
    "GAME SIMULATION",
    "GAME CARD",
    "GAME EDUCATIONAL",
    "GAME ROLE PLAYING",
    "GAME STRATEGY",
    "GAME RACING",
    "GAME ADVENTURE",
    "GAME SPORTS",
    "GAME WORD",
    "GAME BOARD",
    "GAME CASINO",
    "GAME TRIVIA",
    "GAME MUSIC",
]
# The mapping is nested under "Category" so only that column is rewritten.
# A flat {old: new} mapping is applied to every column of the frame and would
# also rewrite, for example, an app whose name is exactly "GAME PUZZLE".
cleaned_df1 = cleaned_df.replace(
    {"Category": {sub_category: "GAME" for sub_category in game_subcategories}}
)
cleaned_df1["Category"].value_counts()

Most Installed Apps

In [None]:
# Rank all apps by install count (highest first), keep the summary columns and
# renumber the rows from 0; only the first five rows are displayed.
summary_columns = ['App Name', 'Category', 'Rating', 'Reviews', 'Installs']
Top_5_install_app = (
    cleaned_df1
    .sort_values(by='Installs', ascending=False)
    .loc[:, summary_columns]
    .reset_index(drop=True)
)
Top_5_install_app.head(5)

Top Rated Apps

In [None]:
# Rank all apps by rating (highest first), keep the summary columns and
# renumber the rows from 0; only the first five rows are displayed.
summary_columns = ['App Name', 'Category', 'Rating', 'Reviews', 'Installs']
Top_5_Rating_app = (
    cleaned_df1
    .sort_values(by='Rating', ascending=False)
    .loc[:, summary_columns]
    .reset_index(drop=True)
)
Top_5_Rating_app.head(5)

Most Reviewed Apps

In [None]:
# Rank all apps by review count (highest first), keep the summary columns and
# renumber the rows from 0; only the first five rows are displayed.
summary_columns = ['App Name', 'Category', 'Rating', 'Reviews', 'Installs']
Top_5_Reviews_app = (
    cleaned_df1
    .sort_values(by='Reviews', ascending=False)
    .loc[:, summary_columns]
    .reset_index(drop=True)
)
Top_5_Reviews_app.head(5)

Summary Tables & Scatter Plots

In [None]:
# Relabel "Content Rating" as "Number of Apps": a later cell counts this column
# per category to get the number of apps in each category.
cleaned_df2 = cleaned_df1.rename(columns={"Content Rating": "Number of Apps"})
# as_index=False keeps "Category" as a regular column in the aggregates, which
# is the key the summary merges join on.
groupby = cleaned_df2.groupby(by=["Category"], as_index=False)
# Total installs per category.
install_sum = groupby["Installs"].sum()

In [None]:
# Apps per category: count() tallies the non-null entries of the column that was
# renamed from "Content Rating" to "Number of Apps", within each category group.
category_count = groupby["Number of Apps"].count()

In [None]:
# Mean rating per category.
rating_mean = groupby["Rating"].mean()

In [None]:
# Total number of reviews per category.
review_sum = groupby["Reviews"].sum()

In [None]:
# Join the per-category aggregates one at a time on "Category", always keeping
# every row of the left-hand table.
result = pd.merge(left=category_count, right=install_sum, how='left', on='Category')
result2 = pd.merge(left=result, right=rating_mean, how='left', on='Category')
result3 = pd.merge(left=result2, right=review_sum, how='left', on='Category')

# Installs divided by the number of apps in the category.
result3['Average Installs Per App'] = result3['Installs'].div(result3['Number of Apps'])
result3

In [None]:
# Total reviews vs. total installs per category; marker colour encodes the mean rating.
plt.scatter(result3['Reviews'], result3['Installs'], c=result3['Rating'])
clb = plt.colorbar()
plt.xlabel("Reviews")
plt.ylabel("Installs")
clb.set_label("Rating")
plt.title('Reviews & Total Installs by Category', y=1.05)
# Save before plt.show(), in the same cell: once the figure has been shown
# pyplot no longer treats it as the current figure, so a plt.savefig() issued
# afterwards writes an empty canvas.
plt.savefig("Results/Reviews & Installs by Category.png")
plt.show()

In [None]:
# Mean rating vs. total installs per category; marker colour encodes total reviews.
plt.scatter(result3['Rating'], result3['Installs'], c=result3['Reviews'])
clb = plt.colorbar()
plt.xlabel("Rating")
plt.ylabel("Installs")
clb.set_label("Reviews")
plt.title('Rating & Total Installs by Category')
# Save before plt.show(), in the same cell: once the figure has been shown
# pyplot no longer treats it as the current figure, so a plt.savefig() issued
# afterwards writes an empty canvas.
plt.savefig("Results/Rating & Installs by Category.png")
plt.show()

In [None]:
# Number of apps vs. total installs per category; marker colour encodes the mean rating.
plt.scatter(result3['Number of Apps'], result3['Installs'], c=result3['Rating'])
clb = plt.colorbar()
plt.xlabel("Number of Apps")
plt.ylabel("Installs")
clb.set_label("Rating")
plt.title('Number of Apps & Total Installs by Category', y=1.05)
# Save before plt.show(), in the same cell: once the figure has been shown
# pyplot no longer treats it as the current figure, so a plt.savefig() issued
# afterwards writes an empty canvas.
plt.savefig("Results/Number of Apps & Installs by Category.png")
plt.show()

In [None]:
# Number of apps vs. average installs per app, per category; marker colour encodes the mean rating.
plt.scatter(result3['Number of Apps'], result3['Average Installs Per App'], c=result3['Rating'])
clb = plt.colorbar()
plt.xlabel("Number of Apps")
plt.ylabel("Average Installs")
clb.set_label("Rating")
plt.title('Number of Apps & Average Installs by Category')
# Save before plt.show(), in the same cell: once the figure has been shown
# pyplot no longer treats it as the current figure, so a plt.savefig() issued
# afterwards writes an empty canvas.
plt.savefig("Results/Number of Apps & Average Installs by Category.png")
plt.show()

In [None]:
# Mean rating vs. average installs per app, per category; marker colour encodes the number of apps.
plt.scatter(result3['Rating'], result3['Average Installs Per App'], c=result3['Number of Apps'])
clb = plt.colorbar()
plt.xlabel("Rating")
plt.ylabel("Average Installs")
clb.set_label("Number of Apps")
plt.title('Rating & Average Installs by Category')
# Save before plt.show(), in the same cell: once the figure has been shown
# pyplot no longer treats it as the current figure, so a plt.savefig() issued
# afterwards writes an empty canvas.
plt.savefig("Results/Rating & Average Installs by Category.png")
plt.show()

APP ANALYSIS BY CATEGORY

In [None]:
# Group the collapsed-category frame by category; later cells reuse `apps`.
apps = cleaned_df1.groupby(by=["Category"])
# Total reviews per category, divided by 1000 to match the "(in 1,000)" axis label.
review_category = apps["Reviews"].sum().div(1000)

In [None]:
# Group keys of `apps` (the category names); the bar charts below pass them to
# set_xticklabels as the x tick labels.
category_key=apps.groups.keys()

In [None]:
# Bar chart: total reviews (in thousands) per category, one bar per group in `apps`.
fig, ax = plt.subplots(figsize=(100, 50))
x_index = np.arange(len(apps["Reviews"]))
total_review = ax.bar(x_index, review_category, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Total Reviews For Each Category', fontsize=100)
ax.set_xlabel('App Category', fontsize=80)
ax.set_ylabel('Total Number of Reviews (in 1,000)', fontsize=80)

# Category names as x tick labels: ticks sit 0.3 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.3)
ax.set_xticklabels(category_key, fontsize=50)
plt.xticks(rotation=60, ha='right')
ax.tick_params(axis="y", labelsize=60)
ax.grid(True)
def label(numbers, fontsize, labelheight, roundnumber, axes=None):
    """Write each bar's height as a text label above the bar.

    Parameters
    ----------
    numbers : iterable
        Bar artists (for example the container returned by ``ax.bar``). Each
        item must provide ``get_height()``, ``get_x()`` and ``get_width()``.
    fontsize : int or float
        Font size of the labels.
    labelheight : int or float
        Vertical offset, in data units, added to the bar height to position
        the label.
    roundnumber : int
        Number of decimals the height is rounded to before it is printed.
    axes : optional
        Object whose ``text`` method draws the labels. When omitted, the
        notebook-level ``ax`` (as bound at call time) is used, so existing
        four-argument calls behave as before.
    """
    # The global `ax` is only looked up when no explicit axes was passed.
    target = ax if axes is None else axes
    for bar in numbers:
        height = bar.get_height()
        # Horizontal centre of the bar.
        xloc = bar.get_x() + bar.get_width() / 2
        caption = f'{round(height, roundnumber)}'
        target.text(xloc, height + labelheight, caption, ha='center', va='bottom', color="black", fontsize=fontsize)
# Print each bar's value (rounded to 0 decimals) 100 units above the bar top.
label(numbers=total_review, fontsize=40, labelheight=100, roundnumber=0)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Category vs. Total Review.png")

In [None]:
# Mean rating per category, indexed by the group keys of `apps`.
avg_rating=apps["Rating"].mean()

In [None]:
# Bar chart: mean rating per category, one bar per group in `apps`.
fig, ax = plt.subplots(figsize=(30, 10))
x_index = np.arange(len(apps["Rating"]))
avg_rating1 = ax.bar(x_index, avg_rating, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Average App Rating For Each Category', fontsize=40)
ax.set_xlabel('App Category', fontsize=30)
ax.set_ylabel('Average Rating', fontsize=30)

# Category names as x tick labels: ticks sit 0.2 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.2)
ax.set_xticklabels(category_key, fontsize=20)
plt.xticks(rotation=60, ha='right')
ax.tick_params(axis="y", labelsize=20)
ax.grid(True)

# Print each bar's value (2 decimals) just above the bar.
label(avg_rating1, 15, 0.05, 2)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# NOTE(review): "Agv" in the file name looks like a typo for "Avg"; left as is
# because other material may reference this path.
fig.savefig("Results/Category vs. Agv Rating.png")

In [None]:
# Group the collapsed-category frame by category (category names become the index).
groupby1 = cleaned_df1.groupby(by=["Category"])
# Total installs per category, divided by 10000 to match the "(in 10,000)" axis label.
install_sum1 = groupby1["Installs"].sum().div(10000)

In [None]:
# Bar chart: total installs (in units of 10,000) per category.
fig, ax = plt.subplots(figsize=(100, 50))
# Size the x positions from the series that is actually plotted. Taking the
# length from `groupby` (the group object of the summary-table section) ties
# this chart to an unrelated variable and breaks ax.bar if the two ever differ.
x_index = np.arange(len(install_sum1))
total_installs = ax.bar(x_index, install_sum1, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Total Installs For Each Category', fontsize=100)
ax.set_ylabel('Total Number of Installs (in 10,000)', fontsize=80)
ax.set_xlabel('App Category', fontsize=80)

# Category names as x tick labels: ticks sit 0.3 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.3)
ax.tick_params(axis="y", labelsize=60)
ax.set_xticklabels(category_key, fontsize=50)
plt.xticks(rotation=60, ha='right')
ax.grid(True)

# Print each bar's value (0 decimals) 100 units above the bar top.
label(total_installs, 40, 100, 0)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Category vs. Total Installs.png")

In [None]:
# Average installs per app (in units of 10,000) for every category.
# Note: this rebinds `category_count` to a per-category Series of app counts.
category_count = groupby1["App Name"].count()
average_installs = install_sum1.div(category_count)

# Bar chart with one bar per category.
fig, ax = plt.subplots(figsize=(30, 10))
x_index = np.arange(len(average_installs))
avg_installs = ax.bar(x_index, average_installs, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Average App Installs For Each Category', fontsize=40)
ax.set_xlabel('App Category', fontsize=30)
ax.set_ylabel('Average Installs (in 10,000)', fontsize=30)

# Category names as x tick labels: ticks sit 0.2 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.2)
ax.set_xticklabels(category_key, fontsize=20)
plt.xticks(rotation=60, ha='right')
ax.tick_params(axis="y", labelsize=20)
ax.grid(True)

# Print each bar's value (2 decimals) just above the bar.
label(avg_installs, 15, 0.05, 2)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Category vs. Average Installs.png")

In [None]:
# Split the apps into paid and free by price.
# Paid means Price > 0. A `!= 0` test is also True for a missing (NaN) price,
# which would count apps of unknown price as paid.
paid_app = cleaned_df1[cleaned_df1["Price"] > 0]
free_app = cleaned_df1[cleaned_df1["Price"] == 0]

# Number of apps in each group (non-null app names).
free_count = free_app["App Name"].count()
paid_count = paid_app["App Name"].count()

# Total installs in each group.
free_install = free_app["Installs"].sum()
paid_install = paid_app["Installs"].sum()

# Average installs per app in each group.
avg_install_free = free_install/free_count
avg_install_paid = paid_install/paid_count

In [None]:
#Take out Google & YouTube
# Exclude the apps whose name is exactly 'Google' or 'YouTube'.
cleaned_df3 = cleaned_df2[~cleaned_df2["App Name"].isin(['Google', 'YouTube'])]
groupby2 = cleaned_df3.groupby(["Category"])
# Total installs per category in units of 10,000.
install_sum2 = groupby2["Installs"].sum()/10000
# Divide by the app counts of the filtered frame. Counts taken from the
# unfiltered grouping would still include the removed apps in the denominator.
category_count2 = groupby2["App Name"].count()
average_installs1 = install_sum2/category_count2

In [None]:
# Bar chart: average installs per app (in units of 10,000) per category, with
# the Google and YouTube apps excluded.
fig, ax = plt.subplots(figsize=(30, 10))
x_index = np.arange(len(average_installs1))
avg_installs = ax.bar(x_index, average_installs1, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Average App Installs For Each Category (w/o Default Apps)', fontsize=40)
ax.set_xlabel('App Category', fontsize=30)
ax.set_ylabel('Average Installs (in 10,000)', fontsize=30)

# Category names as x tick labels: ticks sit 0.2 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.2)
ax.set_xticklabels(category_key, fontsize=20)
plt.xticks(rotation=60, ha='right')
ax.tick_params(axis="y", labelsize=20)
ax.grid(True)

# Print each bar's value (2 decimals) just above the bar.
label(avg_installs, 15, 0.05, 2)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Category vs. Average Installs (no Default Apps).png")

In [None]:
# Median install count per category (raw install numbers, not rescaled).
install_median1 = groupby1["Installs"].median()
# Display the resulting Series as the cell output.
install_median1

In [None]:
# Bar chart: median install count per category.
fig, ax = plt.subplots(figsize=(30, 10))
x_index = np.arange(len(install_median1))
install_med = ax.bar(x_index, install_median1, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Median Installs', fontsize=40)
ax.set_xlabel('App Category', fontsize=30)
ax.set_ylabel('Median Installs', fontsize=30)

# Category names as x tick labels: ticks sit 0.2 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.2)
ax.set_xticklabels(category_key, fontsize=20)
plt.xticks(rotation=60, ha='right')
ax.tick_params(axis="y", labelsize=20)
ax.grid(True)

# Print each bar's value (2 decimals) just above the bar.
label(install_med, 15, 0.05, 2)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Category vs. Median Installs.png")

In [None]:
# Two-bar chart: average installs per app for free apps (x=0) and paid apps (x=1).
fig, ax = plt.subplots(figsize=(5, 5))
x_index = np.arange(2)
y_index = (avg_install_free, avg_install_paid)
price_install = ax.bar(x_index, y_index, width=0.5, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Average Installs for Paid Apps and Free Apps', fontsize=15)
ax.set_xlabel('App Type', fontsize=15)
ax.set_ylabel('Average Installs', fontsize=15)

# One tick per bar, named after the app type.
ax.set_xticks(x_index)
ax.set_xticklabels(["Free App", "Paid App"], fontsize=10)

# Fixed y range; only horizontal grid lines are drawn.
ax.set_ylim(0, 820000)
ax.tick_params(axis="y", labelsize=10)
ax.yaxis.grid(True)

# Print each bar's value (0 decimals) 5000 units above the bar top.
label(price_install, 12, 5000, 0)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Price vs. Total Install.png")

GAME APP ANALYSIS

In [None]:
# Game apps are selected from cleaned_df, which still carries the individual
# game sub-categories (cleaned_df1 has them collapsed into "GAME").
# na=False: without it a missing Category yields NaN in the mask, and pandas
# refuses to index a frame with a boolean mask that contains NaN.
is_game = cleaned_df['Category'].str.contains('GAME', na=False, regex=False)
game_df = cleaned_df[is_game]
game_df.head()

In [None]:
# Per-sub-category aggregates for the game apps.
game_sub = game_df.groupby("Category")
# Group keys (sub-category names), used as x tick labels by the game charts.
game_key = game_sub.groups.keys()

# Reviews: total per sub-category, in thousands.
game_review = game_sub["Reviews"].sum().div(1000)

# Apps per sub-category (non-null app names).
game_app_count = game_sub["App Name"].count()

# Installs: total in thousands, and the average per app in raw units.
game_install_total = game_sub["Installs"].sum()
game_install_sum = game_install_total/1000
game_install_avg = game_install_total/game_app_count

In [None]:
# Bar chart: total reviews (in thousands) per game sub-category.
fig, ax = plt.subplots(figsize=(100, 50))
x_index = np.arange(len(game_sub["Rating"]))
game_total_review = ax.bar(x_index, game_review, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Total Reviews For Each Game Sub-category', fontsize=100)
ax.set_xlabel('Game App Category', fontsize=80)
ax.set_ylabel('Total Number of Reviews (in 1,000)', fontsize=80)

# Sub-category names as x tick labels: ticks sit 0.3 to the left of the bar
# centres, with the labels rotated by 60 degrees.
ax.set_xticks(x_index - 0.3)
ax.set_xticklabels(game_key, fontsize=50, rotation=60)
ax.tick_params(axis="y", labelsize=60)
ax.grid(True)

# Print each bar's value (0 decimals) 100 units above the bar top.
label(game_total_review, 60, 100, 0)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Game Category vs. Total Review.png")

In [None]:
# Mean rating per game sub-category, indexed by the group keys of `game_sub`.
game_avg_rating=game_sub["Rating"].mean()

In [None]:
# Bar chart: mean rating per game sub-category.
fig, ax = plt.subplots(figsize=(30, 10))
x_index = np.arange(len(game_sub["Rating"]))
game_avg_rating1 = ax.bar(x_index, game_avg_rating, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Average Rating For Each Game Sub-category', fontsize=35)
ax.set_xlabel('Game App Category', fontsize=30)
ax.set_ylabel('Average Rating', fontsize=30)

# Sub-category names as x tick labels: ticks sit 0.2 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.2)
ax.set_xticklabels(game_key, fontsize=20)
plt.xticks(rotation=40, ha='right')
ax.tick_params(axis="y", labelsize=20)
ax.grid(True)

# Print each bar's value (2 decimals) just above the bar.
label(game_avg_rating1, 15, 0.05, 2)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# NOTE(review): the file name says "Avg Reviw", but the preceding cell draws the
# average-rating chart; the name is left as is because other material may
# reference this path.
fig.savefig("Results/Game Category vs. Avg Reviw.png")

In [None]:
# Bar chart: total installs (in thousands) per game sub-category.
fig, ax = plt.subplots(figsize=(100, 50))
x_index = np.arange(len(game_sub["Installs"]))
total_installs_game = ax.bar(x_index, game_install_sum, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Total Installs For Each Game Sub-category', fontsize=100)
ax.set_xlabel('Game App Category', fontsize=80)
ax.set_ylabel('Total Number of Installs (in 1,000)', fontsize=80)

# Sub-category names as x tick labels: ticks sit 0.3 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.3)
ax.set_xticklabels(game_key, fontsize=50)
plt.xticks(rotation=60, ha='right')
ax.tick_params(axis="y", labelsize=60)
ax.grid(True)

# Print each bar's value (0 decimals) 100 units above the bar top.
label(total_installs_game, 50, 100, 0)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Game Category vs. Total Installs.png")

In [None]:
# Bar chart: average installs per app for each game sub-category.
fig, ax = plt.subplots(figsize=(100, 50))
x_index = np.arange(len(game_install_avg))
avg_installs_game = ax.bar(x_index, game_install_avg, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Average Installs For Each Category (Game)', fontsize=100)
ax.set_xlabel('Game App Category', fontsize=80)
ax.set_ylabel('Average Number of Installs', fontsize=80)

# Sub-category names as x tick labels: ticks sit 0.3 to the right of the bar
# centres, with the labels rotated and right-aligned.
ax.set_xticks(x_index + 0.3)
ax.set_xticklabels(game_key, fontsize=50)
plt.xticks(rotation=60, ha='right')
ax.tick_params(axis="y", labelsize=60)
ax.grid(True)

# Print each bar's value (0 decimals) 1000 units above the bar top.
label(avg_installs_game, 55, 1000, 0)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# savefig does not create the Results/ directory; it has to exist already.
fig.savefig("Results/Game Category vs. Average Installs.png")

In [None]:
# Split the game apps into paid and free by price.
# Paid means Price > 0. A `!= 0` test is also True for a missing (NaN) price,
# which would count games of unknown price as paid.
paid_game_app = game_df[game_df["Price"] > 0]
free_game_app = game_df[game_df["Price"] == 0]

# Number of game apps in each group (non-null app names).
free_game_count = free_game_app["App Name"].count()
paid_game_count = paid_game_app["App Name"].count()

# Total installs in each group.
free_game_install = free_game_app["Installs"].sum()
paid_game_install = paid_game_app["Installs"].sum()

# Average installs per app in each group.
avg_install_free_game = free_game_install/free_game_count
avg_install_paid_game = paid_game_install/paid_game_count

In [None]:
# Two-bar chart: average installs per app for free games (x=0) and paid games (x=1).
fig, ax = plt.subplots(figsize=(6, 5))
x_index = np.arange(2)
y_index = (avg_install_free_game, avg_install_paid_game)
game_price_install = ax.bar(x_index, y_index, width=0.5, color='lightblue')

# Title and axis captions.
ax.set_title('Google Play - Average Installs for Paid Apps and Free Apps (Game)', fontsize=15)
ax.set_xlabel('App Type', fontsize=15)
ax.set_ylabel('Average Installs', fontsize=15)

# One tick per bar, named after the app type.
ax.set_xticks(x_index)
ax.set_xticklabels(["Free App", "Paid App"], fontsize=10)

# Fixed y range; only horizontal grid lines are drawn.
ax.set_ylim(0, 2800000)
ax.tick_params(axis="y", labelsize=10)
ax.yaxis.grid(True)

# Print each bar's value (0 decimals) 50000 units above the bar top.
label(game_price_install, 13, 50000, 0)

In [None]:
# Write the figure held in `fig` (created by the preceding plotting cell) to disk.
# NOTE(review): the ":" in the file name is not a valid file-name character on
# Windows; confirm the target platforms before relying on this path.
fig.savefig("Results/Game: Price vs. Total Install.png")