# Imports

In [70]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Importing the Data Set and Defining its Dimensions

In [71]:
# Load the Google Play Store export into a DataFrame.
google_data = pd.read_csv('../input/google-play-store-apps/googleplaystore.csv')

# DataFrame.shape is a (rows, columns) pair; pull each dimension out by position.
nRow = google_data.shape[0]
nCol = google_data.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

# Gathering the Amount of Data Points and Variable Types

In [72]:
# Print each column's name, non-null count and dtype.
google_data.info()

# Describing the Rating Values

In [73]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# describe() only reports on the numeric columns of a mixed-type DataFrame.
google_data.describe()

# Checking for Outliers Using Boxplot and Histogram

In [74]:
# Box plot of the numeric columns, used here to spot out-of-range values.
google_data.boxplot()

In [75]:
# Histogram of each numeric column.
google_data.hist()

# Based on the range of values shown in the histogram and the boxplot, we know there is a rating value that is greater than 5. Since the maximum of the boxplot is above 17.5, there is an outlier that needs to be cleaned out of the data.

# Checking the Amount of Null Values

In [76]:
# Count the missing (NaN) entries in every column.
google_data.isnull().sum()

# Check for Rating Values Greater than 5

In [77]:
# Show every row whose rating is above the 5-star maximum.
google_data.loc[google_data['Rating'] > 5]

# Since there is a rating greater than 5, we are going to drop that app from the data set.

In [78]:
# Drop every row whose rating exceeds the 5-star maximum. Selecting by
# condition instead of a hard-coded row label keeps this correct if the CSV
# is re-exported or re-indexed, and makes the cell safe to re-run (dropping
# an already-removed label would raise KeyError).
google_data.drop(google_data[google_data.Rating > 5].index, inplace=True)

# Redisplaying the boxplot and histogram to show the updated rating values without the outlier:

In [79]:
# Box plot of the numeric columns again, after the out-of-range row was dropped.
google_data.boxplot()

In [80]:
# Histogram of the numeric columns again, after the out-of-range row was dropped.
google_data.hist()

# **Data Manipulation**

Since the "Rating" column has over 1,000 null entries, we are going to fill in these values with the median "Rating" value across all data points.

In [81]:
def calc_median(series):
    """Return a copy of *series* with missing entries replaced by its median.

    The median is computed over the non-missing values of *series*; the
    input itself is left unmodified.
    """
    median_value = series.median()
    filled = series.fillna(median_value)
    return filled

In [82]:
# Fill the missing ratings with the column median by calling the helper on
# the whole column. Series.transform() first tries the function element-wise
# (which fails on a plain float) and only then falls back to the whole-Series
# call, so the direct call gives the same result without relying on that
# fallback.
google_data['Rating'] = calc_median(google_data['Rating'])

In [83]:
# Re-count the missing entries per column after the median fill.
google_data.isnull().sum()

# As seen in the above table, there are no more null values for the "Rating" variable.

We are now going to print the mode values for the non-numeric variables, starting with "Type".

In [84]:
# mode() returns a Series because several values can tie for most frequent.
print(google_data['Type'].mode())

Next, "Current Version".

In [85]:
# Most frequent value(s) of the "Current Ver" column.
print(google_data['Current Ver'].mode())

Lastly, "Android Version".

In [86]:
# Most frequent value(s) of the "Android Ver" column.
print(google_data['Android Ver'].mode())

We are now going to fill in all the remaining "null" values with the modes for their respective variables.

In [87]:
# Replace the missing entries of each categorical column with that column's
# most frequent value. The filled column is assigned back explicitly:
# calling fillna(inplace=True) on `google_data[col]` operates on an
# intermediate Series, which pandas warns about and which leaves the
# DataFrame untouched once Copy-on-Write is enabled.
for column in ('Type', 'Current Ver', 'Android Ver'):
    # mode() may return several tied values; take the first one.
    most_common = str(google_data[column].mode().values[0])
    google_data[column] = google_data[column].fillna(most_common)

# Now, let's check if there are any more null values left.

In [88]:
# Re-count the missing entries per column after the mode fill.
google_data.isnull().sum()

# Now that there are no more null values left, we can convert the "Price", "Reviews", and "Installs" variables into numbers by stripping the non-numeric characters ("$", "+" and ",") and casting the results to numeric types.

In [89]:
# Strip the currency symbol and cast to float in one vectorised pass instead
# of per-element lambdas. astype(str) keeps the cell safe to re-run once the
# column is already numeric; regex=False makes '$' a literal character rather
# than a regex end-of-string anchor.
google_data['Price'] = (
    google_data['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)
# errors='coerce' turns any review count that cannot be parsed into NaN
# instead of raising.
google_data['Reviews'] = pd.to_numeric(google_data['Reviews'], errors='coerce')

In [90]:
# Remove the trailing '+' and the thousands separators, then cast to float,
# in one vectorised pass instead of per-element lambdas. astype(str) keeps
# the cell safe to re-run once the column is already numeric; regex=False
# makes '+' a literal character rather than a regex quantifier.
google_data['Installs'] = (
    google_data['Installs']
    .astype(str)
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Now displaying the new set of data:

In [91]:
# Preview the first 10 rows after the type conversions.
google_data.head(10)

# Re-describing the data with the new numerical values:

In [92]:
# Summary statistics again; the columns converted to numeric types above
# (Price, Reviews, Installs) are now part of the numeric summary.
google_data.describe()

# We are now going to show the average rating values for the different app categories.

In [93]:
# Group once by category; the later price and review cells reuse `group`.
group = google_data.groupby('Category')
# Use the built-in groupby mean: passing np.mean to agg() is deprecated in
# recent pandas and emits a FutureWarning, while .mean() gives the same result.
rtg = group['Rating'].mean()
print(rtg)


# Average price values:

In [94]:
# Mean price per category. The built-in groupby mean replaces agg(np.mean),
# which is deprecated in recent pandas and emits a FutureWarning.
prc = group['Price'].mean()
print(prc)

# Average review values:

In [95]:
# Mean number of reviews per category. The built-in groupby mean replaces
# agg(np.mean), which is deprecated in recent pandas and emits a FutureWarning.
rvs = group['Reviews'].mean()
print(rvs)

# Graphing the average ratings for every category:

In [96]:
plt.figure(figsize=(12, 5))
# "bo" = blue circle markers. The original combined the red format string
# "ro" with color="b"; matplotlib warns about the clash and the keyword wins,
# so the rendered colour (blue) is unchanged.
plt.plot(rtg, "bo")
# Category names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.show()

# Creating the same graph with a title and axis labels:

In [97]:
plt.figure(figsize=(12, 5))
# "bo" = blue circle markers. The original combined the red format string
# "ro" with color="b"; matplotlib warns about the clash and the keyword wins,
# so the rendered colour (blue) is unchanged.
plt.plot(rtg, "bo")
# Category names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.title("Average Ratings for Categories")
plt.xlabel('Category')
plt.ylabel('Rating')
plt.show()

# Graphing the average price for each category:

In [98]:
plt.figure(figsize=(16, 5))
# 'r--' already means a red dashed line; the extra color='r' keyword only
# triggered a "redundantly defined" warning from matplotlib.
plt.plot(prc, 'r--')
# Category names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.title('Average Price for Categories')
plt.xlabel('Category')
plt.ylabel('Price')
# Render the figure; the original called plt.plot() with no data here, which
# draws nothing and only echoes an empty list in the cell output.
plt.show()

# Lastly, plotting the average number of reviews for each category:

In [99]:
plt.figure(figsize=(16, 5))
# 'gs' = green square markers. The original combined the blue format string
# 'bs' with color='g'; matplotlib warns about the clash and the keyword wins,
# so the rendered colour (green) is unchanged.
plt.plot(rvs, 'gs')
# Category names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.title('Average Reviews for Categories')
plt.xlabel('Category')
plt.ylabel('Reviews')
# Render the figure; the original called plt.plot() with no data here, which
# draws nothing and only echoes an empty list in the cell output.
plt.show()