In [1]:
# Import Dependencies
import pandas as pd
from sqlalchemy import create_engine
import mysql_conn

### Import Data

In [2]:
# Read in Apple store data
apple = pd.read_csv('Raw_Data/Apple/AppleStore.csv')
apple.head()

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


In [3]:
# Read in Play Store data
google = pd.read_csv('Raw_Data/Google/googleplaystore.csv')
google.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Rename Columns

In [4]:
# Rename Apple columns
a_cols = ['track_name', 'size_bytes', 'price', 'user_rating', 'cont_rating', 'prime_genre']
apple = apple[a_cols]
apple = apple.rename(columns = {'track_name':'a_name', 
                                'size_bytes':'a_size_bytes', 
                                'price':'a_price', 
                                'user_rating':'a_user_rating', 
                                'cont_rating':'a_content_rating', 
                                'prime_genre':'a_category'})
apple.head()

Unnamed: 0,a_name,a_size_bytes,a_price,a_user_rating,a_content_rating,a_category
0,PAC-MAN Premium,100788224,3.99,4.0,4+,Games
1,Evernote - stay organized,158578688,0.0,4.0,4+,Productivity
2,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,0.0,3.5,4+,Weather
3,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,0.0,4.0,12+,Shopping
4,Bible,92774400,0.0,4.5,4+,Reference


In [5]:
#google.loc[google.g_name.str.contains('eBay')]

In [6]:
# Rename Google columns
g_cols = ['App', 'Size', 'Price', 'Rating', 'Content Rating', 'Category']
google = google[g_cols]
google = google.rename(columns = {'App':'g_name', 
                        'Size':'g_size_mb', 
                        'Price':'g_price', 
                        'Rating':'g_user_rating', 
                        'Content Rating':'g_content_rating', 
                        'Category':'g_category'})
google.head()

Unnamed: 0,g_name,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
0,Photo Editor & Candy Camera & Grid & ScrapBook,19M,0,4.1,Everyone,ART_AND_DESIGN
1,Coloring book moana,14M,0,3.9,Everyone,ART_AND_DESIGN
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",8.7M,0,4.7,Everyone,ART_AND_DESIGN
3,Sketch - Draw & Paint,25M,0,4.5,Teen,ART_AND_DESIGN
4,Pixel Draw - Number Art Coloring Book,2.8M,0,4.3,Everyone,ART_AND_DESIGN


### Remove Apps that do not appear on both datasets

In [7]:
# Drop duplicates
google = google.drop_duplicates(subset = 'g_name')
apple = apple.drop_duplicates(subset = 'a_name')

In [8]:
apple_merge = apple.rename(columns = {'a_name':'name'})
google_merge = google.rename(columns = {'g_name':'name'})

In [9]:
# Remove apps that do not appear in both datasets
apps_merge = pd.merge(apple_merge, google_merge, on = 'name')

In [10]:
apps_merge.head()

Unnamed: 0,name,a_size_bytes,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
0,Bible,92774400,0.0,4.5,4+,Reference,Varies with device,0,4.7,Teen,BOOKS_AND_REFERENCE
1,Facebook,389879808,0.0,3.5,4+,Social Networking,Varies with device,0,4.1,Teen,SOCIAL
2,LinkedIn,273844224,0.0,3.5,4+,Social Networking,Varies with device,0,4.2,Everyone,SOCIAL
3,Google Earth,37214208,0.0,3.5,4+,Travel,Varies with device,0,4.3,Everyone,TRAVEL_AND_LOCAL
4,PAC-MAN,100849664,0.0,3.0,4+,Games,37M,0,4.2,Everyone,GAME


In [11]:
common_apps = list(apps_merge.name)

# Column Formatting

### Format app size columns

In [12]:
#print(apple.a_size_mb.unique())
#print(google.g_size_mb.unique())

In [13]:
# Convert Apple app size from bytes to megabytes
a_size_mb = []

for byte in list(apps_merge.a_size_bytes):
    megabyte = byte / 1024 / 1024
    a_size_mb.append(megabyte)

apps_merge.a_size_bytes = a_size_mb
apps_merge = apps_merge.rename(columns = {'a_size_bytes':'a_size_mb'})

# Round decimal places
apps_merge.a_size_mb = apps_merge.a_size_mb.round(1)

In [14]:
# Remove 'M' from Google app sizes
apps_merge.g_size_mb = apps_merge.g_size_mb.str[:-1]

In [15]:
apps_merge.head()

Unnamed: 0,name,a_size_mb,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
0,Bible,88.5,0.0,4.5,4+,Reference,Varies with devic,0,4.7,Teen,BOOKS_AND_REFERENCE
1,Facebook,371.8,0.0,3.5,4+,Social Networking,Varies with devic,0,4.1,Teen,SOCIAL
2,LinkedIn,261.2,0.0,3.5,4+,Social Networking,Varies with devic,0,4.2,Everyone,SOCIAL
3,Google Earth,35.5,0.0,3.5,4+,Travel,Varies with devic,0,4.3,Everyone,TRAVEL_AND_LOCAL
4,PAC-MAN,96.2,0.0,3.0,4+,Games,37,0,4.2,Everyone,GAME


### Format price columns

In [16]:
print(apps_merge.a_price.unique())
print(apps_merge.g_price.unique())

[ 0.    0.99  2.99  4.99  1.99  3.99 19.99  9.99 14.99  6.99  7.99  5.99]
['0' '$2.99' '$4.99' '$0.99' '$3.99' '$1.99' '$24.99' '$14.99' '$9.99'
 '$1.20' '$7.99' '$6.99' '$5.99']


In [17]:
apps_merge.loc[apps_merge.g_price.str.contains('Everyone')]

Unnamed: 0,name,a_size_mb,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category


In [18]:
# Drop rows with errors
apps_merge = apps_merge[~apps_merge.g_price.str.contains("Everyone")]

# Format data to match Apple dataset
apps_merge['g_price'] = apps_merge.g_price.str.lstrip('$')


(apps_merge.g_price.unique())

array(['0', '2.99', '4.99', '0.99', '3.99', '1.99', '24.99', '14.99',
       '9.99', '1.20', '7.99', '6.99', '5.99'], dtype=object)

In [19]:
apps_merge.head()

Unnamed: 0,name,a_size_mb,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
0,Bible,88.5,0.0,4.5,4+,Reference,Varies with devic,0,4.7,Teen,BOOKS_AND_REFERENCE
1,Facebook,371.8,0.0,3.5,4+,Social Networking,Varies with devic,0,4.1,Teen,SOCIAL
2,LinkedIn,261.2,0.0,3.5,4+,Social Networking,Varies with devic,0,4.2,Everyone,SOCIAL
3,Google Earth,35.5,0.0,3.5,4+,Travel,Varies with devic,0,4.3,Everyone,TRAVEL_AND_LOCAL
4,PAC-MAN,96.2,0.0,3.0,4+,Games,37,0,4.2,Everyone,GAME


In [20]:
apps_merge.head()

Unnamed: 0,name,a_size_mb,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
0,Bible,88.5,0.0,4.5,4+,Reference,Varies with devic,0,4.7,Teen,BOOKS_AND_REFERENCE
1,Facebook,371.8,0.0,3.5,4+,Social Networking,Varies with devic,0,4.1,Teen,SOCIAL
2,LinkedIn,261.2,0.0,3.5,4+,Social Networking,Varies with devic,0,4.2,Everyone,SOCIAL
3,Google Earth,35.5,0.0,3.5,4+,Travel,Varies with devic,0,4.3,Everyone,TRAVEL_AND_LOCAL
4,PAC-MAN,96.2,0.0,3.0,4+,Games,37,0,4.2,Everyone,GAME


### Format content rating

In [21]:
apps_merge.g_content_rating.unique()

array(['Teen', 'Everyone', 'Mature 17+', 'Everyone 10+'], dtype=object)

In [22]:
apps_merge.a_content_rating.unique()

array(['4+', '12+', '17+', '9+'], dtype=object)

In [23]:
# Create new DataFrame to identify equivalent content ratings
content_ratings = pd.DataFrame(data = {'id':[1, 2, 3, 4],  
                     'apple':['4+', '9+', '12+', '17+'], 
                     'google':['Everyone', 'Everyone 10+', 'Teen', 'Mature 17+']})
content_ratings = content_ratings.set_index('id')
content_ratings

Unnamed: 0_level_0,apple,google
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4+,Everyone
2,9+,Everyone 10+
3,12+,Teen
4,17+,Mature 17+


In [24]:
# Rename content ratings to match

# Apple
apps_merge['a_content_rating'] = apps_merge.a_content_rating.replace('4+', '1')
apps_merge['a_content_rating'] = apps_merge.a_content_rating.replace('9+', '2')
apps_merge['a_content_rating'] = apps_merge.a_content_rating.replace('12+', '3')
apps_merge['a_content_rating'] = apps_merge.a_content_rating.replace('17+', '4')

# Google
apps_merge['g_content_rating'] = apps_merge.g_content_rating.replace('Everyone', '1')
apps_merge['g_content_rating'] = apps_merge.g_content_rating.replace('Everyone 10+', '2')
apps_merge['g_content_rating'] = apps_merge.g_content_rating.replace('Teen', '3')
apps_merge['g_content_rating'] = apps_merge.g_content_rating.replace('Mature 17+', '4')

In [25]:
apps_merge.head()

Unnamed: 0,name,a_size_mb,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
0,Bible,88.5,0.0,4.5,1,Reference,Varies with devic,0,4.7,3,BOOKS_AND_REFERENCE
1,Facebook,371.8,0.0,3.5,1,Social Networking,Varies with devic,0,4.1,3,SOCIAL
2,LinkedIn,261.2,0.0,3.5,1,Social Networking,Varies with devic,0,4.2,1,SOCIAL
3,Google Earth,35.5,0.0,3.5,1,Travel,Varies with devic,0,4.3,1,TRAVEL_AND_LOCAL
4,PAC-MAN,96.2,0.0,3.0,1,Games,37,0,4.2,1,GAME


### Categories

In [31]:
apps_merge.a_category.unique()

array(['Reference', 'Social Networking', 'Travel', 'Games',
       'Food & Drink', 'Utilities', 'Finance', 'Entertainment',
       'Business', 'Shopping', 'News', 'Photo & Video', 'Productivity',
       'Navigation', 'Sports', 'Education', 'Health & Fitness',
       'Lifestyle', 'Medical', 'Weather', 'Book', 'Catalogs'],
      dtype=object)

In [32]:
apps_merge.g_category.unique()

In [41]:
apps_merge.a_category.unique()

array(['Reference', 'Social Networking', 'Travel', 'Games',
       'Food & Drink', 'Utilities', 'Finance', 'Entertainment',
       'Business', 'Shopping', 'News', 'Photo & Video', 'Productivity',
       'Navigation', 'Sports', 'Education', 'Health & Fitness',
       'Lifestyle', 'Medical', 'Weather', 'Book', 'Catalogs'],
      dtype=object)

In [None]:
categories = pd.DataFrame(data = {'id':list(range(1, 23)),  
                     'apple':['Reference', 'Social Networking', 'Travel', 'Games',
                              'Food & Drink', 'Utilities', 'Finance', 'Entertainment',
                              'Business', 'Shopping', 'News', 'Photo & Video', 'Productivity',
                              'Navigation', 'Sports', 'Education', 'Health & Fitness',
                              'Lifestyle', 'Medical', 'Weather', 'Book', 'Catalogs'], 
                     'google':['BOOKS_AND_REFERENCE', 'SOCIAL', 'TRAVEL_AND_LOCAL', 'GAME',
                               'FOOD_AND_DRINK', 'TOOLS', 'FINANCE', 'FAMILY', 'ENTERTAINMENT',
                               'PRODUCTIVITY', 'BUSINESS', 'COMMUNICATION', 'SHOPPING',
                               'NEWS_AND_MAGAZINES', 'HEALTH_AND_FITNESS', 'SPORTS',
                               'MAPS_AND_NAVIGATION', 'EDUCATION', 'LIFESTYLE', 'PHOTOGRAPHY',
                               'MEDICAL', 'WEATHER', 'VIDEO_PLAYERS', 'ART_AND_DESIGN']})
categories = content_ratings.set_index('id')
categories

In [42]:
apps_merge.a_category.unique()

array(['Reference', 'Social Networking', 'Travel', 'Games',
       'Food & Drink', 'Utilities', 'Finance', 'Entertainment',
       'Business', 'Shopping', 'News', 'Photo & Video', 'Productivity',
       'Navigation', 'Sports', 'Education', 'Health & Fitness',
       'Lifestyle', 'Medical', 'Weather', 'Book', 'Catalogs'],
      dtype=object)

In [74]:
apps_merge.loc[apps_merge['a_category'] == 'Travel']

Unnamed: 0,name,a_size_mb,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
3,Google Earth,35.5,0.0,3.5,1,Travel,Varies with devic,0,4.3,1,TRAVEL_AND_LOCAL
29,DB Navigator,80.9,0.0,3.5,1,Travel,20,0,4.0,1,MAPS_AND_NAVIGATION
30,Southwest Airlines,84.7,0.0,3.0,1,Travel,8.3,0,3.9,1,TRAVEL_AND_LOCAL
41,Uber,270.7,0.0,3.0,1,Travel,Varies with devic,0,4.2,1,MAPS_AND_NAVIGATION
46,Fly Delta,134.3,0.0,3.0,1,Travel,46,0,3.7,1,TRAVEL_AND_LOCAL
52,Airbnb,232.5,0.0,4.0,1,Travel,Varies with devic,0,4.4,1,TRAVEL_AND_LOCAL
78,United Airlines,134.4,0.0,2.5,1,Travel,80,0,3.5,1,TRAVEL_AND_LOCAL
219,Google Street View,91.5,0.0,4.0,1,Travel,Varies with devic,0,4.2,1,TRAVEL_AND_LOCAL
243,SNCF,116.5,0.0,1.5,1,Travel,Varies with devic,0,3.4,1,TRAVEL_AND_LOCAL


In [75]:
apps_merge.loc[apps_merge['g_category'] == 'TRAVEL_AND_LOCAL']

Unnamed: 0,name,a_size_mb,a_price,a_user_rating,a_content_rating,a_category,g_size_mb,g_price,g_user_rating,g_content_rating,g_category
3,Google Earth,35.5,0.0,3.5,1,Travel,Varies with devic,0,4.3,1,TRAVEL_AND_LOCAL
30,Southwest Airlines,84.7,0.0,3.0,1,Travel,8.3,0,3.9,1,TRAVEL_AND_LOCAL
46,Fly Delta,134.3,0.0,3.0,1,Travel,46,0,3.7,1,TRAVEL_AND_LOCAL
52,Airbnb,232.5,0.0,4.0,1,Travel,Varies with devic,0,4.4,1,TRAVEL_AND_LOCAL
78,United Airlines,134.4,0.0,2.5,1,Travel,80,0,3.5,1,TRAVEL_AND_LOCAL
219,Google Street View,91.5,0.0,4.0,1,Travel,Varies with devic,0,4.2,1,TRAVEL_AND_LOCAL
243,SNCF,116.5,0.0,1.5,1,Travel,Varies with devic,0,3.4,1,TRAVEL_AND_LOCAL


In [52]:
#apps_merge.loc[~(apps_merge['a_category'] == apps_merge['g_category'])]

# Create connection to database

In [None]:
connection_string = (
    f"root:{mysql_conn.password}@localhost/my_db")
engine = create_engine(f'mysql://{connection_string}')

In [None]:
df.to_sql(
    name='df', con=engine, 
    if_exists='append', index=True)