# Objective

The objective for this assignment will be to create a simple neural network model to see how accurately we can predict the aggregate user rating of an app on the App Store based on the information provided in our dataset. 

We will then be comparing these results to a random forest classifier to weigh the pros and cons of the complexity and accuracy between the two models.

In [1]:
# Basic import statements
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Machine Learning import statements
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# import the data
appstore = pd.read_csv('AppleStore.csv')
appstore = appstore.dropna()

In [4]:
# Print the size of the dataset
appstore.shape

(7197, 17)

In [5]:
# Preview the data
appstore.head()

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


In [6]:
# Drop columns with no predictive power
appstore.drop(['Unnamed: 0', 'id', 'track_name'], axis=1, inplace=True)

In [7]:
# Print out column names and # of unique values in each categorical variable
appstore.select_dtypes(include=['object']).nunique()

currency          1
ver            1590
cont_rating       4
prime_genre      23
dtype: int64

In [8]:
# Clean up content rating column and convert to numeric
appstore['cont_rating'] = appstore['cont_rating'].str.replace('+', '')
appstore['cont_rating'] = appstore['cont_rating'].astype('float64')

In [9]:
# Drop currency because there is only 1 value,
# and drop ver since it has 1590
appstore.drop(['currency', 'ver'], axis=1, inplace=True)

In [10]:
# Create target variable
Y = appstore['prime_genre']

# Create finalized dataframe
X = appstore.drop(['prime_genre'], axis=1)

In [11]:
# Preview the finalized dataframe
X.head()

Unnamed: 0,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,cont_rating,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,100788224,3.99,21292,26,4.0,4.5,4.0,38,5,10,1
1,158578688,0.0,161065,26,4.0,3.5,4.0,37,5,23,1
2,100524032,0.0,188583,2822,3.5,4.5,4.0,37,5,3,1
3,128512000,0.0,262241,649,4.0,4.5,12.0,37,5,9,1
4,92774400,0.0,985920,5320,4.5,5.0,4.0,37,5,45,1


In [12]:
# Divide the dataset into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify = Y, random_state=0)

In [13]:
y_test.shape

(2160,)

# Neural Network Model

In [14]:
# Establish and fit the model, with a 100 x 10 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(100,10))
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [15]:
# Attempt at implementing gridsearch to mlp

# # Setting up parameters for the random forest
# param_grid = {'hidden_layer_size' : [(100,1), (200,1), (100, 10)],
#               'max_iter': [100, 200, 400],
#               'alpha': [.00001, .0001, .001]}

# # Run grid search to find ideal parameters
# mlp_grid = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-2)

# # Fit the best parameters to our model
# mlp_grid.fit(X_train, y_train)

# # Return model scores
# print(mlp_grid.best_params_)
# print(mlp_grid.best_score_)

In [16]:
mlp.score(X_train, y_train)

0.5366289458010721

In [17]:
y_train.value_counts()/len(y_train)

Games                0.536629
Entertainment        0.074251
Education            0.062934
Photo & Video        0.048442
Utilities            0.034544
Health & Fitness     0.025015
Productivity         0.024816
Social Networking    0.023228
Lifestyle            0.020052
Music                0.019257
Shopping             0.016875
Sports               0.015882
Book                 0.015485
Finance              0.014493
Travel               0.011316
News                 0.010324
Weather              0.009927
Reference            0.008934
Food & Drink         0.008735
Business             0.007941
Navigation           0.006353
Medical              0.003176
Catalogs             0.001390
Name: prime_genre, dtype: float64

In [18]:
mlp_test = cross_val_score(mlp, X_test, y_test, cv=5, n_jobs=-2)
print('Cross Validation Accuracy Scores - Test Set: {:.5f}(+/- {:.2f})'.format(mlp_test.mean(), mlp_test.std()*2))



Cross Validation Accuracy Scores - Test Set: 0.22861(+/- 0.49)


# Random Forest Model

In [19]:
# Constructing the shape of our decision tree
rfc = RandomForestClassifier(n_jobs=-2)

# Fit model to data
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
# Setting up parameters for the random forest
param_grid = {'max_features': ['sqrt'],
              'n_estimators' : [100, 200, 400],
              'max_depth' : [4, 8, 12],
              'min_samples_leaf' : [4, 8, 12]}

# Run grid search to find ideal parameters
rfc_grid = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-2)

# Fit the best parameters to our model
rfc_grid.fit(X_train, y_train)

# Return model scores
print(rfc_grid.best_params_)
print(rfc_grid.best_score_)

{'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'n_estimators': 400}
0.5874528489180068


In [21]:
cv_rfc_test = cross_val_score(rfc_grid, X_test, y_test, cv=5, n_jobs=-2)
print('Cross Validation Accuracy Scores - Test Set: {:.5f}(+/- {:.2f})'.format(cv_rfc_test.mean(), cv_rfc_test.std()*2))



Cross Validation Accuracy Scores - Test Set: 0.55937(+/- 0.02)
