# Google Play Store Data Analysis (Simplified)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw Play Store export and collapse repeated app names to a single
# row each (drop_duplicates keeps the first occurrence by default).
df = pd.read_csv('googleplaystore.csv').drop_duplicates(subset='App')

In [None]:

# Data Cleaning
# Converts the text columns used by the analysis cells below into numeric
# dtypes. Run this cell once on the freshly loaded frame; it expects the raw
# string columns and will fail if re-run on already-cleaned data.

# Reviews: non-numeric entries become NaN instead of raising.
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')

# Installs: keep only rows shaped like "10,000+" (digits/commas, optional
# trailing '+'); anything else, including missing values, is dropped.
# .copy() makes the filtered frame independent so the column assignments below
# write to it directly rather than to a slice (avoids SettingWithCopyWarning).
df = df[df['Installs'].str.match(r'^[\d,]+[\+]?$', na=False)].copy()
df['Installs'] = df['Installs'].str.replace('[+,]', '', regex=True).astype(float)

# Price: strip the currency sign as a *literal* character. With regex=True
# pandas >= 2.0 interprets '$' as the end-of-string anchor, which removes
# nothing and makes the float cast fail on values such as "$4.99".
df['Price'] = df['Price'].str.replace('$', '', regex=False).astype(float)

# Size: "Varies with device" carries no number, so treat it as missing.
# A trailing 'M' is dropped and 'k' is rewritten as the exponent 'e-3', which
# puts k-suffixed values on the same scale as M-suffixed ones
# (presumably megabytes - confirm against the data dictionary).
df['Size'] = df['Size'].replace('Varies with device', np.nan)
df['Size'] = (
    df['Size']
    .str.replace('M', '', regex=False)
    .str.replace('k', 'e-3', regex=False)
    .astype(float)
)

# Rating: impute missing ratings with the column median. Assign the result
# back instead of calling fillna(inplace=True) on df['Rating']: the chained
# in-place form operates on a temporary under Copy-on-Write and would leave
# df unchanged.
df['Rating'] = df['Rating'].fillna(df['Rating'].median())


In [None]:
# 1. Most Expensive App
# Order the (App, Price) pairs from highest to lowest price and show the top row.
(
    df.loc[:, ['App', 'Price']]
    .sort_values(by='Price', ascending=False)
    .iloc[:1]
)

In [None]:
# 2. Genre with Most Apps
# value_counts() sorts by descending frequency, so the first entry is the
# most common genre together with its app count.
genre_counts = df['Genres'].value_counts()
genre_counts.iloc[:1]

In [None]:
# 3. Avg Size Free vs Paid
# Mean of the numeric Size column within each Type group; missing sizes are
# skipped by the mean.
df.groupby('Type')['Size'].agg('mean')

In [None]:
# 4. Top 5 Expensive Perfect Rating
# Restrict to apps rated exactly 5.0, then list the five highest-priced ones.
(
    df.loc[df['Rating'].eq(5.0)]
    .sort_values(by='Price', ascending=False)
    .iloc[:5]
)

In [None]:
# 5. Apps with >50K Reviews
# Number of rows whose review count is strictly above 50,000 (rows with a
# missing review count compare False and are not counted).
df.loc[df['Reviews'].gt(50000)].shape[0]

In [None]:
# 6. Avg Price by Genre & Installs
# Mean price for every (Genres, Installs) combination; only the first five
# groups are displayed to keep the output short.
(
    df.groupby(['Genres', 'Installs'])['Price']
    .agg('mean')
    .iloc[:5]
)

In [None]:
# 7. Apps Rating >4.7 & Avg Price
# Average price over the apps whose rating is strictly greater than 4.7.
df.loc[df['Rating'] > 4.7, 'Price'].mean()

In [None]:
# 8. Google Revenue (5M+ Installs)
# Estimate: sum of price x installs over apps with at least 5,000,000
# installs, scaled by 0.3 (presumably the store's 30% share - confirm).
# Both factors are taken from the same filtered frame so the product does not
# depend on index alignment between a filtered Series and the full column
# (which only worked because unmatched rows became NaN and were skipped by sum).
popular_apps = df[df['Installs'] >= 5000000]
revenue = (popular_apps['Price'] * popular_apps['Installs']).sum() * 0.3
revenue

In [None]:
# 9. Max/Min Size Free vs Paid
# Smallest and largest Size within each Type group, returned as a two-column
# table with columns 'min' and 'max'.
size_by_type = df.groupby('Type')['Size']
size_by_type.agg(['min', 'max'])

In [None]:
# 10. Correlation Matrix
# Pairwise Pearson correlation (the pandas default) between the four numeric
# columns produced by the cleaning step.
df.loc[:, ['Rating', 'Reviews', 'Size', 'Price']].corr(method='pearson')

In [None]:
# 11. Apps per Type & Content Rating
# Count rows per (Content Rating, Type) pair, then pivot Type into columns;
# combinations that never occur are shown as 0.
(
    df.groupby(['Content Rating', 'Type'])
    .size()
    .unstack(level='Type', fill_value=0)
)

In [None]:
# 12. Apps for Android 4.x
# Count apps whose 'Android Ver' string starts with "4." - i.e. the first
# (presumably minimum) version listed has major version 4. A bare substring
# search for '4' would also match a 4 anywhere else in the string (for
# instance a hypothetical "2.3.4" or "1.4"), so the match is anchored to the
# start and includes the dot. Missing values count as non-matches.
df[df['Android Ver'].str.startswith('4.', na=False)].shape[0]