### Libraries

In [231]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder


from warnings import filterwarnings

pd.set_option('display.float_format', lambda num:'%1.3f'%num)
pd.set_option('display.max_columns', 99)
filterwarnings('ignore')

# Data Description

Feature | Description
:---|:---
app_id |The unique application id.
category | The category under which app is categorized on the store.
reviews| The number of reviews received on the store.
size| Size of the app available for download (in KB/MB)
installs| The number of people who had installed this app atleast once.
price| The price of the app (in US $)
suitable_for| Rating given to app based on the usage and content.
last_update| When was the app updated last time by the developers.
latest_ver| The latest version of the app available for download.
popularity| User popularity (High/Low)

# Data Wrangling & Visualization 




In [232]:
train = pd.read_csv("train.csv")

In [233]:
train.head()

Unnamed: 0,app_id,category,reviews,size,installs,price,suitable_for,last_update,latest_ver,popularity
0,330090,PERSONALIZATION,4,511k,50+,0,Everyone,"December 31, 2016",1.4,High
1,226147,GAME,568391,5.2M,"5,000,000+",0,Teen,"July 1, 2014",4.3.1,High
2,107000,FAMILY,144,70M,"1,000+",$2.99,Teen,"January 26, 2018",1.0.0,High
3,217582,FAMILY,1499466,96M,"10,000,000+",0,Teen,"July 24, 2018",1.25.0,High
4,370113,DATING,84,4.5M,"1,000+",0,Mature 17+,"July 6, 2018",8.2,High


In [234]:
#Explore columns
train.columns

Index(['app_id', 'category', 'reviews', 'size', 'installs', 'price',
       'suitable_for', 'last_update', 'latest_ver', 'popularity'],
      dtype='object')

In [235]:
#Description
train.describe()

Unnamed: 0,app_id,reviews
count,1975.0,1975.0
mean,490685.787,198816.163
std,288098.937,1494090.662
min,1160.0,1.0
25%,239249.5,33.0
50%,481867.0,516.0
75%,737373.5,19122.0
max,999218.0,44891723.0


# Data pre-processing

In [236]:
train['size'] = train['size'].replace({'k': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)

In [237]:
train['installs']=train['installs'].str.replace(',','')

In [238]:
train['installs']=train['installs'].str.replace('+','')

In [239]:
train["installs"] = pd.to_numeric(train["installs"])

In [240]:
train['price']=train['price'].str.replace('$','')

In [241]:
train["price"] = pd.to_numeric(train["price"])

In [242]:
train['price']

0      0.000
1      0.000
2      2.990
3      0.000
4      0.000
        ... 
1970   0.000
1971   0.000
1972   1.990
1973   0.000
1974   0.000
Name: price, Length: 1975, dtype: float64

In [243]:
train

Unnamed: 0,app_id,category,reviews,size,installs,price,suitable_for,last_update,latest_ver,popularity
0,330090,PERSONALIZATION,4,511000,50,0.000,Everyone,"December 31, 2016",1.4,High
1,226147,GAME,568391,5200000,5000000,0.000,Teen,"July 1, 2014",4.3.1,High
2,107000,FAMILY,144,70000000,1000,2.990,Teen,"January 26, 2018",1.0.0,High
3,217582,FAMILY,1499466,96000000,10000000,0.000,Teen,"July 24, 2018",1.25.0,High
4,370113,DATING,84,4500000,1000,0.000,Mature 17+,"July 6, 2018",8.2,High
...,...,...,...,...,...,...,...,...,...,...
1970,823213,GAME,3883589,57000000,100000000,0.000,Everyone,"July 26, 2018",2.21.1,High
1971,911217,FAMILY,5898,50000000,100000,0.000,Everyone,"August 1, 2017",3.3.8.03082017,High
1972,369343,FAMILY,16,8900000,500,1.990,Everyone,"May 9, 2017",1.0,High
1973,468527,HEALTH_AND_FITNESS,9612,3500000,100000,0.000,Everyone,"May 18, 2018",1.8.12,High


In [244]:
X = train.drop(['category','app_id','popularity', 'latest_ver', 'last_update'], axis=1)
y = train['popularity']

In [245]:
X

Unnamed: 0,reviews,size,installs,price,suitable_for
0,4,511000,50,0.000,Everyone
1,568391,5200000,5000000,0.000,Teen
2,144,70000000,1000,2.990,Teen
3,1499466,96000000,10000000,0.000,Teen
4,84,4500000,1000,0.000,Mature 17+
...,...,...,...,...,...
1970,3883589,57000000,100000000,0.000,Everyone
1971,5898,50000000,100000,0.000,Everyone
1972,16,8900000,500,1.990,Everyone
1973,9612,3500000,100000,0.000,Everyone


In [246]:
y

0       High
1       High
2       High
3       High
4       High
        ... 
1970    High
1971    High
1972    High
1973    High
1974    High
Name: popularity, Length: 1975, dtype: object

# Splitting Dataset into Trainng and Validation dataset

In [247]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state=42)

In [248]:
X_train

Unnamed: 0,reviews,size,installs,price,suitable_for
1249,11677,19000000,1000000,0.000,Everyone
1556,3048,24000000,500000,0.000,Teen
1617,25,46000000,5000,0.000,Everyone
1567,3,4800000,100,0.000,Everyone
1027,705,1800000,10000,0.000,Everyone
...,...,...,...,...,...
1130,1680,4000000,100000,0.000,Everyone
1294,663,25000000,50000,0.000,Everyone
860,5509,3200000,100000,0.000,Everyone
1459,349,11000000,50000,0.000,Everyone


In [249]:
X_test

Unnamed: 0,reviews,size,installs,price,suitable_for
1825,445756,16000000,10000000,0.000,Everyone
1735,23,4000000,5000,0.000,Everyone
678,54,20000000,5000,0.000,Everyone
351,29155,83000000,1000000,0.000,Everyone
1791,14,2900000,500,0.990,Everyone
...,...,...,...,...,...
1146,175,4300000,10000,0.000,Everyone
1204,23,17000000,10000,0.000,Everyone
286,1475,34000000,10000,4.990,Teen
1936,197,9800000,10000,0.000,Everyone


In [250]:
y_train

1249    High
1556     Low
1617    High
1567    High
1027    High
        ... 
1130     Low
1294    High
860     High
1459    High
1126    High
Name: popularity, Length: 1382, dtype: object

# Encoding Independent and Dependent Variable

In [253]:
#Encoding Independent Variable
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [4])], remainder='passthrough')
X_train_ohe = np.array(ct.fit_transform(X_train))

In [254]:
X_train_ohe

array([[0.0e+00, 1.0e+00, 0.0e+00, ..., 1.9e+07, 1.0e+06, 0.0e+00],
       [0.0e+00, 0.0e+00, 0.0e+00, ..., 2.4e+07, 5.0e+05, 0.0e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 4.6e+07, 5.0e+03, 0.0e+00],
       ...,
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 3.2e+06, 1.0e+05, 0.0e+00],
       [0.0e+00, 1.0e+00, 0.0e+00, ..., 1.1e+07, 5.0e+04, 0.0e+00],
       [0.0e+00, 0.0e+00, 0.0e+00, ..., 7.8e+07, 5.0e+05, 0.0e+00]])

In [255]:
X_test_ohe = np.array(ct.transform(X_test))

In [256]:
# Encoding the Dependent Variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_ohe = le.fit_transform(y_train)

In [257]:
y_test_ohe = le.transform(y_test)

In [229]:
y_test_ohe

array([0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,

# Training model with Decision Tree Classifier

In [258]:
# Training the Decision Tree Classification model on the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train_ohe, y_train_ohe)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

# Validation set results

In [260]:
y_pred = classifier.predict(X_test_ohe)

In [261]:
y_pred

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [262]:
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test_ohe.reshape(len(y_test),1)),1))

[[0 0]
 [0 1]
 [0 1]
 ...
 [0 0]
 [0 1]
 [0 0]]


# Confusion matrix

In [263]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test_ohe, y_pred)
print(cm)
accuracy_score(y_test_ohe, y_pred)

[[377  65]
 [102  49]]


0.718381112984823

# Loading Test data

In [None]:
#Loading Test data
test_data=pd.read_csv('test.csv')
test_data.head()

# Test data pre-processing

In [266]:
test_data['size'] = test_data['size'].replace({'k': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)

In [267]:
test_data['installs']=test_data['installs'].str.replace(',','')

In [268]:
test_data['installs']=test_data['installs'].str.replace('+','')

In [269]:
test_data["installs"] = pd.to_numeric(test_data["installs"])

In [270]:
test_data['price']=test_data['price'].str.replace('$','')

In [271]:
test_data["price"] = pd.to_numeric(test_data["price"])

In [272]:
test_data.head()

Unnamed: 0,app_id,category,reviews,size,installs,price,suitable_for,last_update,latest_ver
0,144236,TOOLS,262076,1200000,10000000,0.0,Everyone,"January 4, 2018",1.0.2
1,881323,PHOTOGRAPHY,12204,6800000,1000000,0.0,Everyone,"March 14, 2017",1.5.2.21
2,765524,TOOLS,632,8199999,500000,0.0,Everyone,"May 23, 2017",1.0
3,390533,BUSINESS,29,16000000,500,0.0,Everyone,"July 5, 2018",2.2.1335
4,817383,ART_AND_DESIGN,192,6000000,10000,0.0,Everyone,"April 25, 2018",1.5


In [274]:
Xtest = test_data.drop(['category','app_id', 'latest_ver', 'last_update'], axis=1)

In [275]:
Xtest

Unnamed: 0,reviews,size,installs,price,suitable_for
0,262076,1200000,10000000,0.000,Everyone
1,12204,6800000,1000000,0.000,Everyone
2,632,8199999,500000,0.000,Everyone
3,29,16000000,500,0.000,Everyone
4,192,6000000,10000,0.000,Everyone
...,...,...,...,...,...
1013,357944,12000000,10000000,0.000,Everyone
1014,125,4300000,10000,0.000,Everyone
1015,315,14000000,10000,0.000,Everyone
1016,11,3500000,500,0.000,Mature 17+


In [295]:
app_id = test_data['app_id']

In [296]:
app_id

0       144236
1       881323
2       765524
3       390533
4       817383
         ...  
1013    990872
1014    850376
1015    983504
1016    886153
1017    541500
Name: app_id, Length: 1018, dtype: int64

# Encoding Test data set

In [289]:
Xtest_ohe = np.array(ct.transform(Xtest))

# Predicting on test set

In [291]:
y_pred_test = classifier.predict(Xtest_ohe)

In [292]:
y_pred_test

array([0, 0, 0, ..., 1, 0, 1])

In [301]:
ypred = pd.DataFrame(y_pred_test, columns=['popularity'])

In [305]:
ypred

Unnamed: 0,popularity
0,0
1,0
2,0
3,1
4,0
...,...
1013,0
1014,0
1015,1
1016,0


In [307]:
ypred_og = le.inverse_transform(y_pred_test)

In [308]:
ypred_og

array(['High', 'High', 'High', ..., 'Low', 'High', 'Low'], dtype=object)

In [309]:
ypred_df = pd.DataFrame(ypred_og, columns=['popularity'])

In [310]:
ypred_df

Unnamed: 0,popularity
0,High
1,High
2,High
3,Low
4,High
...,...
1013,High
1014,High
1015,Low
1016,High


In [313]:
app_id_df = pd.DataFrame(app_id, columns=['app_id'])

In [315]:
app_id_df

Unnamed: 0,app_id
0,144236
1,881323
2,765524
3,390533
4,817383
...,...
1013,990872
1014,850376
1015,983504
1016,886153


In [318]:
submission = [app_id_df, ypred_df]

In [319]:
submission

[      app_id
 0     144236
 1     881323
 2     765524
 3     390533
 4     817383
 ...      ...
 1013  990872
 1014  850376
 1015  983504
 1016  886153
 1017  541500
 
 [1018 rows x 1 columns],
      popularity
 0          High
 1          High
 2          High
 3           Low
 4          High
 ...         ...
 1013       High
 1014       High
 1015        Low
 1016       High
 1017        Low
 
 [1018 rows x 1 columns]]

In [321]:
submission_df = app_id_df.join(ypred_df)

In [322]:
submission_df

Unnamed: 0,app_id,popularity
0,144236,High
1,881323,High
2,765524,High
3,390533,Low
4,817383,High
...,...,...
1013,990872,High
1014,850376,High
1015,983504,Low
1016,886153,High


In [323]:
#Submission
submission_df.to_csv('submissions.csv',index=False)