In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.neural_network import MLPClassifier

In [3]:
appstore = pd.read_csv('AppleStore.csv')
appstore = appstore.dropna()

In [4]:
appstore.shape

(7197, 17)

In [5]:
appstore.head()

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


In [6]:
appstore.drop(['Unnamed: 0', 'id', 'track_name'], axis=1, inplace=True)

In [7]:
# Print out column names and # of unique values in each categorical variable
appstore.select_dtypes(include=['object']).nunique()

currency          1
ver            1590
cont_rating       4
prime_genre      23
dtype: int64

In [8]:
# Clean up content rating column and convert to numeric
appstore['cont_rating'] = appstore['cont_rating'].str.replace('+', '')
appstore['cont_rating'] = appstore['cont_rating'].astype('float64')

In [9]:
# Drop currency because there is only 1 value,
# and drop ver since it has 1590
appstore.drop(['currency', 'ver'], axis=1, inplace=True)

In [10]:
appstore.describe()

Unnamed: 0,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,cont_rating,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
count,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0,7197.0
mean,199134500.0,1.726218,12892.91,460.373906,3.526956,3.253578,7.093094,37.361817,3.7071,5.434903,0.993053
std,359206900.0,5.833006,75739.41,3920.455183,1.517948,1.809363,4.329046,3.737715,1.986005,7.919593,0.083066
min,589824.0,0.0,0.0,0.0,0.0,0.0,4.0,9.0,0.0,0.0,0.0
25%,46922750.0,0.0,28.0,1.0,3.5,2.5,4.0,37.0,3.0,1.0,1.0
50%,97153020.0,0.0,300.0,23.0,4.0,4.0,4.0,37.0,5.0,1.0,1.0
75%,181924900.0,1.99,2793.0,140.0,4.5,4.5,9.0,38.0,5.0,8.0,1.0
max,4025970000.0,299.99,2974676.0,177050.0,5.0,5.0,17.0,47.0,5.0,75.0,1.0


In [17]:
Y = appstore['prime_genre']

X = appstore.drop(['prime_genre'], axis=1)

In [12]:
X.head()

Unnamed: 0,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,100788224,3.99,21292,26,4.0,4.5,4.0,Games,38,5,10,1
1,158578688,0.0,161065,26,4.0,3.5,4.0,Productivity,37,5,23,1
2,100524032,0.0,188583,2822,3.5,4.5,4.0,Weather,37,5,3,1
3,128512000,0.0,262241,649,4.0,4.5,12.0,Shopping,37,5,9,1
4,92774400,0.0,985920,5320,4.5,5.0,4.0,Reference,37,5,45,1


In [13]:
X.dtypes

size_bytes            int64
price               float64
rating_count_tot      int64
rating_count_ver      int64
user_rating         float64
user_rating_ver     float64
cont_rating         float64
prime_genre          object
sup_devices.num       int64
ipadSc_urls.num       int64
lang.num              int64
vpp_lic               int64
dtype: object

In [25]:
# Establish and fit the model, with a single 100 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(100,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [27]:
mlp.score(X, Y)

0.5357787967208559

In [26]:
Y.value_counts()/len(Y)

Games                0.536612
Entertainment        0.074337
Education            0.062943
Photo & Video        0.048492
Utilities            0.034459
Health & Fitness     0.025010
Productivity         0.024733
Social Networking    0.023204
Lifestyle            0.020008
Music                0.019175
Shopping             0.016952
Sports               0.015840
Book                 0.015562
Finance              0.014450
Travel               0.011255
News                 0.010421
Weather              0.010004
Reference            0.008893
Food & Drink         0.008754
Business             0.007920
Navigation           0.006392
Medical              0.003196
Catalogs             0.001389
Name: prime_genre, dtype: float64

In [21]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5, n_jobs=-2)

array([0.53383978, 0.53457815, 0.01041667, 0.53835425, 0.54023793])