In [3]:
#imports
%matplotlib inline
from os import listdir
import os.path
import calendar
from os.path import isfile, join
import sys
import subprocess
import sys, traceback
import urllib3
import json
import numpy as np
import pandas as pd
import shutil
import nltk
import csv
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import random
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import time
import spacy
import re
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [4]:
# Fetch NLTK's stop-word corpus and keep the English entries in a set,
# which the cleaning helpers use for membership tests.
nltk.download('stopwords')
stop_words = {*stopwords.words("english")}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michellepetersen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
#Text Cleaning
# Only tokenization is needed, so a blank English pipeline is sufficient.
# The earlier `spacy.load('en')` call threw its result away (the loaded model
# was never used) and fails on spaCy 3+, where the 'en' shortcut no longer
# exists, so it has been removed.
from spacy.lang.en import English
parser = English()

def tokenize_text(text):
    """Tokenize ``text`` with the notebook-level spaCy ``parser``.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by the
    placeholder ``'URL'``, tokens starting with ``'@'`` are replaced by
    ``'SCREEN_NAME'``, and every other token is returned lower-cased.
    """
    def normalise(tok):
        # Same precedence as the checks are listed in the docstring.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [7]:
# Download NLTK's WordNet corpus. In this notebook it backs the two lemma
# helpers: `wn.morphy` (get_lemma) and `WordNetLemmatizer` (get_lemma2),
# which reduce a word to its root form.
nltk.download('wordnet')

def get_lemma(word):
    """Return WordNet's ``morphy`` form of ``word``, or ``word`` unchanged when ``morphy`` yields ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

def get_lemma2(word):
    """Lemmatize ``word`` using a newly constructed NLTK ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michellepetersen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#Prepare the text for topic modelling
def prepare_text_for_lda(text):
    """Turn raw text into the lemmatized token list fed to the topic model.

    The text is tokenized with ``tokenize_text``; tokens of four characters
    or fewer and tokens found in ``stop_words`` are discarded, and each
    surviving token is lemmatized with ``get_lemma2``.
    """
    return [
        get_lemma2(tok)
        for tok in tokenize_text(text)
        if len(tok) > 4 and tok not in stop_words
    ]

In [9]:
def build_token_list(description):
    """Return the LDA-ready tokens for ``description`` (thin wrapper over ``prepare_text_for_lda``)."""
    return prepare_text_for_lda(description)

In [10]:
def compute_top_common_words(df, column, k):
    """Return the set of the ``k`` most frequent words found in ``df[column]``.

    Each cell of ``df[column]`` must be an iterable of words (a list or set of
    tokens). The ranked ``(word, count)`` pairs are printed so ``k`` can be
    tuned by inspecting the tail of the list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tokenized descriptions.
    column : str
        Name of the column whose cells are iterables of words.
    k : int
        Number of top-ranked words to keep.

    Returns
    -------
    set
        The ``k`` most common words; an empty set when the column holds no
        words (previously this case raised ``ValueError`` while unpacking).
    """
    # Flatten every row's words into one stream.
    all_words = [word for item in df[column] for word in item]

    # Frequency distribution over all words; len(fdist) is the number of unique words.
    fdist = FreqDist(all_words)

    # Rank once and reuse the result for both the report and the return value.
    top_k_counts = fdist.most_common(k)
    # NOTE(review): the "Min Hits" label actually reports k, the number of words requested.
    print("\n{} Stats: Total:{} Min Hits:{} Words:{}".format(column, len(fdist), k, top_k_counts))

    return {word for word, _count in top_k_counts}

def keep_top_k_words(text, vocabulary=None):
    """Keep only the words of ``text`` that belong to ``vocabulary``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one description.
    vocabulary : container of str, optional
        Words to keep. When omitted, the notebook-level
        ``top_words_across_apps_unique`` set is used, which must already have
        been computed (this was the only behaviour before the parameter existed).

    Returns
    -------
    list of str
        The retained words, in their original order.
    """
    if vocabulary is None:
        vocabulary = top_words_across_apps_unique
    return [word for word in text if word in vocabulary]
    
def keep_unique_words(text, vocabulary=None):
    """Keep only the words of ``text`` that are NOT in ``vocabulary``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one description.
    vocabulary : container of str, optional
        Words to discard. When omitted, the notebook-level
        ``top_words_across_apps_unique`` set is used, which must already have
        been computed (this was the only behaviour before the parameter existed).

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    if vocabulary is None:
        vocabulary = top_words_across_apps_unique
    return [word for word in text if word not in vocabulary]

In [11]:
#Filter description
import html
def build_word_list(desc):
    """Scrub an app description and return it as one space-joined string.

    Non-ASCII characters are dropped, the text is split on single spaces,
    entries found in ``stop_words`` and empty entries are skipped, HTML
    entities are unescaped, and selected punctuation is replaced or removed.
    """
    # Replacements applied before stripping, in this exact order.
    before_strip = (
        ('?', ' '), (')', ' '), ('(', ' '), ('*', ''),
        ('.', ' '), ('&amp;', ' '), (',', ' '),
    )
    # Replacements applied after stripping.
    after_strip = (('-', ''), ('!', ''))

    ascii_desc = desc.encode('ascii', 'ignore').decode('ascii')

    cleaned = []
    for raw in ascii_desc.split(" "):
        if raw in stop_words or not raw:
            continue
        word = html.unescape(raw)
        for old, new in before_strip:
            word = word.replace(old, new)
        word = word.strip()
        for old, new in after_strip:
            word = word.replace(old, new)
        cleaned.append(word)

    # scrubbed words for this description
    return " ".join(cleaned)

In [130]:
# Load application category data
# Context managers close the files once parsed (the handles were previously left open).
with open('play_app_id_to_category.json') as appcatfile:
    play_cat_info = json.load(appcatfile)

#Load ios application category data
with open('ios_app_id_to_category.json') as iosappcatfile:
    ios_cat_info = json.load(iosappcatfile)

#Load test.ai application category data
#testaiappcatfile = open('testai_category_mappings.json')
#testai_cat_info = json.loads(testaiappcatfile.read())

JSONDecodeError: Expecting property name enclosed in double quotes: line 7 column 9 (char 155)

In [131]:
def get_corrs(df):
    """Return the pairwise column correlations of ``df`` as a dict.

    Keys are ``(row_label, column_label)`` tuples taken from ``df.corr()``.
    Only the strictly lower triangle keeps its correlation value; the
    diagonal and the upper triangle are set to zero.
    """
    corr = df.corr()
    strictly_lower = pd.DataFrame(
        np.tril(corr.values, k=-1),
        index=corr.index,
        columns=corr.columns,
    )
    return strictly_lower.stack().to_dict()

In [83]:
#open play store data 
with open('play_app_store.json') as json_data:
    play_store_data = json.load(json_data)

# Column order of the resulting frame.
PLAY_COLUMNS = ["app_id","category","subcategory","description","installs","found_position","name","rating",
                "updated","dayofweek","tokens","num_tokens","token_len",
                "unique_tokens","num_unique_tokens","unique_token_len"]

# Categories excluded from the analysis.
PLAY_CATEGORY_BLACKLIST = ["LIBRARIES_AND_DEMO", "GAME_ARCADE", "GAME_ACTION", "GAME_CASUAL",
                           "GAME_PUZZLE", "GAME_ROLE_PLAYING", "GAME_CASINO", "GAME_ADVENTURE",
                           "GAME_BOARD", "GAME_SIMULATION", "GAME_SPORTS", "GAME_MUSIC",
                           "GAME_STRATEGY"]

# Collect one record per app and build the frame in a single step. This
# replaces the pre-sized 3500-row frame filled through .iloc, which raised
# IndexError as soon as more than 3500 apps qualified. As a side benefit the
# numeric columns get numeric dtypes instead of `object`.
play_records = []
for category in play_store_data.keys():
    for app in play_store_data[category]:
        # Only apps with a known category mapping and a description are analysed.
        if app['app_id'] not in play_cat_info['categories'] or 'description' not in app:
            continue

        update_time = pd.to_datetime(app['updated'])
        word_list = build_word_list(app['description'])
        token_list = build_token_list(word_list)
        unique_tokens = set(token_list)

        play_records.append({
            "app_id": app['app_id'],
            "category": app['category'],
            # A missing sub-category stays None, so the row is removed by dropna() below
            # (same outcome as leaving the cell unset in the pre-sized frame).
            "subcategory": play_cat_info['subcategories'].get(app['app_id']),
            "description": word_list,
            "installs": int(app['installs'].replace(',','')),
            "found_position": app['found_position'],
            "name": app['name'],
            "rating": float(app['rating']),
            "updated": update_time,
            "dayofweek": calendar.day_name[update_time.weekday()],
            "tokens": token_list,
            "num_tokens": len(token_list),
            # Total number of characters over all tokens / over the distinct tokens.
            "token_len": sum(len(token) for token in token_list),
            "unique_tokens": unique_tokens,
            "num_unique_tokens": len(unique_tokens),
            "unique_token_len": sum(len(token) for token in unique_tokens),
        })

# Kept for compatibility: number of rows that were filled.
play_df_loc = len(play_records)
play_app_df = pd.DataFrame(play_records, columns=PLAY_COLUMNS)

# Drop null rows
play_app_df = play_app_df.dropna(axis=0)
#Clean out categories that we want to blacklist
play_app_df = play_app_df[~play_app_df['category'].isin(PLAY_CATEGORY_BLACKLIST)]
play_app_df.tail()

Unnamed: 0,app_id,category,subcategory,description,installs,found_position,name,rating,updated,dayofweek,tokens,num_tokens,token_len,unique_tokens,num_unique_tokens,unique_token_len
3315,ipnossoft.rma.premium,HEALTH_AND_FITNESS,yogameditation,control sleep Discover power sounds fall aslee...,100000,27,Relax Melodies P: Sleep Sounds,4.8,2018-01-19 00:00:00,Friday,"[control, sleep, discover, power, sound, aslee...",118,860,"{asleep, mashable, isochronic, meditationsyoul...",88,678
3316,com.excelatlife.motivation,MEDICAL,yogameditation,n relax variety methods No additional subscrip...,100000,28,Qi Gong Meditation Relaxation,4.3,2014-09-26 00:00:00,Friday,"[relax, variety, method, additional, subscript...",235,1797,"{contains, without, cabin, providing, therapy,...",154,1199
3317,com.mobureau.android.mychakra,HEALTH_AND_FITNESS,yogameditation,This free version features Ads To buy Ad Free...,500000,29,My Chakra Meditation,4.4,2018-04-10 00:00:00,Tuesday,"[version, feature, version, please, amazon, st...",78,546,"{mandala, please, vitalize, store, attention, ...",47,329
3318,com.hivebrain.andrewjohnson.relaxlite,HEALTH_AND_FITNESS,yogameditation,headphones drift deep relaxation This applicat...,100000,30,Relax with Andrew Johnson Lite,4.3,2012-06-19 00:00:00,Tuesday,"[headphone, drift, relaxation, application, gu...",14,104,"{stress, hypnosis, intended, application, guid...",14,104
3319,ipnossoft.rma.free,HEALTH_AND_FITNESS,keepcalm,ks Relax Melodies #1 app sleep relaxation rega...,5000000,32,Relax Melodies: Sleep Sounds,4.5,2018-07-23 00:00:00,Monday,"[relax, melody, sleep, relaxation, regain, con...",97,746,"{bird, relax, insomnia, airplane, white, voice...",78,638


In [123]:
#open ios store data 
with open('ios_app_store.json') as json_data:
    ios_store_data = json.load(json_data)

# Column order of the resulting frame.
IOS_COLUMNS = ["ios_app_id","ios_category","ios_found_position","ios_name","ios_num_ratings","ios_rating",
               "ios_origin_list","ios_store_url","ios_version","ios_website"]

# Origin lists excluded from the analysis.
IOS_ORIGIN_BLACKLIST = ["paidIpadApplications", "freeIpadApplications"]

# Create dataframe for iOS store application data.
# Records are collected first and the frame is built once, replacing the
# pre-sized 20000-row frame filled through .iloc, which raised IndexError
# when the store dump held more than 20000 entries.
ios_records = []
for app in ios_store_data:
    ios_records.append({
        "ios_app_id": app['app_id'],
        "ios_category": app['category'],
        "ios_found_position": app['found_position'],
        "ios_name": app['name'],
        "ios_num_ratings": float(app['num_ratings']),
        "ios_rating": float(app['rating']),
        "ios_origin_list": app['origin_list'],
        "ios_store_url": app['store_url'],
        "ios_version": app['version'],
        "ios_website": app['website'],
    })

# Kept for compatibility: number of rows that were filled.
ios_df_loc = len(ios_records)
ios_app_df = pd.DataFrame(ios_records, columns=IOS_COLUMNS)

# Drop null rows
ios_app_df = ios_app_df.dropna(axis=0)
ios_app_df = ios_app_df[ios_app_df['ios_category'] != "Games"]
ios_app_df = ios_app_df[~ios_app_df['ios_origin_list'].isin(IOS_ORIGIN_BLACKLIST)]
# Row count before and after removing repeated app ids.
print(len(ios_app_df))
ios_app_df = ios_app_df.drop_duplicates(subset=['ios_app_id'])
print(len(ios_app_df))
ios_app_df.tail()

11451


Unnamed: 0,ios_app_id,ios_category,ios_found_position,ios_name,ios_num_ratings,ios_rating,ios_origin_list,ios_store_url,ios_version,ios_website
19303,1061132313,Books,193,CHOMP by Christoph Niemann,347,4.7,paidApplications,https://itunes.apple.com/us/app/id1061132313,1.4,http://www.foxandsheep.com
19304,552498441,Books,194,"Great Pumpkin, Charlie Brown",17,4.1,paidApplications,https://itunes.apple.com/us/app/id552498441,1.5,http://www.loudcrow.com/its-the-great-pumpkin-...
19305,325486873,Books,196,Santa Biblia Version Reina Valera (con audio),17,2.7,paidApplications,https://itunes.apple.com/us/app/id325486873,11,http://www.palreader.com/en/contact.html
19306,399196568,Books,197,Home Library,18,1.8,paidApplications,https://itunes.apple.com/us/app/id399196568,11.0.1,http://www.myhomelibrary.net
19307,878704422,Books,199,Transformers Rescue Bots: Sky Forest Rescue,8,3.0,paidApplications,https://itunes.apple.com/us/app/id878704422,2.0,http://playdatedigital.com/


In [128]:
# De-duplicate the iOS apps by app id, printing the row count before and
# after. The step is idempotent, so re-running this cell is harmless.
print(ios_app_df.shape[0])
ios_app_df = ios_app_df.drop_duplicates(subset='ios_app_id')
print(ios_app_df.shape[0])
ios_app_df.tail()

11451
9089


Unnamed: 0,ios_app_id,ios_category,ios_found_position,ios_name,ios_num_ratings,ios_rating,ios_origin_list,ios_store_url,ios_version,ios_website
19303,1061132313,Books,193,CHOMP by Christoph Niemann,347,4.7,paidApplications,https://itunes.apple.com/us/app/id1061132313,1.4,http://www.foxandsheep.com
19304,552498441,Books,194,"Great Pumpkin, Charlie Brown",17,4.1,paidApplications,https://itunes.apple.com/us/app/id552498441,1.5,http://www.loudcrow.com/its-the-great-pumpkin-...
19305,325486873,Books,196,Santa Biblia Version Reina Valera (con audio),17,2.7,paidApplications,https://itunes.apple.com/us/app/id325486873,11,http://www.palreader.com/en/contact.html
19306,399196568,Books,197,Home Library,18,1.8,paidApplications,https://itunes.apple.com/us/app/id399196568,11.0.1,http://www.myhomelibrary.net
19307,878704422,Books,199,Transformers Rescue Bots: Sky Forest Rescue,8,3.0,paidApplications,https://itunes.apple.com/us/app/id878704422,2.0,http://playdatedigital.com/


In [None]:
# Categories ranked by how many apps they contain (top 50)
category_df = (
    ios_app_df.groupby(['ios_category'])
    .size()
    .nlargest(50)
    .reset_index(name='App Count')
)
print('\nTop Categories by Application Count:\n', category_df)

# Categories ranked by total number of ratings (possibly a proxy for installs)
category_df = (
    ios_app_df.pivot_table(index='ios_category', values="ios_num_ratings", aggfunc='sum')
    .sort_values(by=['ios_num_ratings', 'ios_category'], ascending=False)
)
print('\nTop Categories by Number of Ratings:\n', category_df.head(50))

# Most frequent (category, origin_list) pairs, listed alphabetically
most_popular_combo = (
    ios_app_df.groupby(['ios_category','ios_origin_list'])
    .size()
    .nlargest(150)
    .reset_index(name='App Count')
    .sort_values(by=['ios_category', 'ios_origin_list', 'App Count'], ascending=True)
)
print('\nTop category and origin_list by app count:\n', most_popular_combo)

# (category, origin_list) pairs ranked by total number of ratings
category_origin_df = (
    ios_app_df.pivot_table(index=['ios_category', 'ios_origin_list'], values='ios_num_ratings', aggfunc='sum')
    .sort_values(by=['ios_num_ratings', 'ios_category', 'ios_origin_list'], ascending=False)
)
print('\nTop category and origin_list by Number ratings:\n', category_origin_df.head(150))

# Apps found in the first 25 positions, one row per (name, category),
# exported as JSON lines.
top_position_mask = ios_app_df['ios_found_position'] <= 25
most_popular_apps = ios_app_df.loc[top_position_mask]
print(len(most_popular_apps))
most_popular_apps = most_popular_apps.drop_duplicates(subset=['ios_name','ios_category'])
print(len(most_popular_apps))
out = most_popular_apps.to_json(orient='records', lines=True)
with open('most_popular_apps.json', 'w') as f:
    f.write(out)
#most_popular_apps = most_popular_apps.groupby(['category','ios_found_position']).size().nlargest(2000).reset_index(name='App Count')

In [151]:
#Analyze subcategories
# Count the "Travel" apps found within the first 50 positions.
sub_df = ios_app_df[ios_app_df['ios_category'].eq("Travel")]
sub_df = sub_df[sub_df['ios_found_position'].le(50)]
len(sub_df)
#sub_df.sort_values(by=['ios_origin_list', 'ios_found_position', 'ios_name'], inplace=True, ascending=True)
#print('\nTop category and origin_list by app count:\n', most_popular_combo)

229

In [112]:
def plot_distribution(df, column, y_max):
    """Print summary statistics of ``df[column]`` and plot its density histogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the values to plot.
    column : str
        Name of a numeric column of ``df``; also used as the x-axis label.
    y_max : float
        Height of the vertical line that marks the average.

    Raises
    ------
    ValueError
        If the column holds no values.
    """
    list_values = list(df[column])
    if not list_values:
        raise ValueError("cannot plot the distribution of an empty column: " + column)

    # Computed once and reused for the report, the ticks and the axis limits.
    mean_value = np.average(list_values)
    min_value = min(list_values)
    max_value = max(list_values)

    print("average " + column + ":", mean_value,
      "\nminimum " + column + ":", min_value,
      "\nmaximum "+ column + ":", max_value)

    #Plot a histogram of the values
    num_bins = 500
    fig, ax = plt.subplots(figsize=(12,6))

    # the histogram of the data
    ax.hist(list_values, num_bins, density=True)
    ax.set_xlabel(column , fontsize=15)
    ax.set_ylabel('Density Frequency', fontsize=15)
    # Eight log-spaced tick positions between 50 and the maximum (the axis itself stays linear).
    ax.set_xticks(np.logspace(start=np.log10(50),stop=np.log10(max_value),num=8, base=10.0))
    ax.set_xlim(0, max_value)
    # Vertical marker at the average, drawn from 0 up to y_max.
    ax.plot([mean_value, mean_value], [0.0, y_max], '-',
        label="average " + column)
    ax.legend()
    # Turn the grid on explicitly. The code used to call ax.grid() twice with
    # no arguments, which toggles visibility twice and only left the grid on
    # because the seaborn style had enabled it beforehand.
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Distribution, across all Play store apps, of the summed character length
# of the distinct tokens in each description
plot_distribution(df=play_app_df, column="unique_token_len", y_max=0.002)

In [None]:
# Distribution, across all Play store apps, of the summed character length
# of all tokens in each description
plot_distribution(df=play_app_df, column="token_len", y_max=0.002)

In [None]:
# Distribution, across all Play store apps, of the number of tokens per description
plot_distribution(df=play_app_df, column="num_tokens", y_max=0.010)

In [None]:
# Distribution, across all Play store apps, of the number of distinct tokens per description
plot_distribution(df=play_app_df, column="num_unique_tokens", y_max=0.02)

In [None]:
# Categories ranked by how many apps they contain (top 50)
category_df = (
    play_app_df.groupby(['category'])
    .size()
    .nlargest(50)
    .reset_index(name='App Count')
)
print('\nTop Categories by Application Count:\n', category_df)

# Categories ranked by total installs
category_df = (
    play_app_df.pivot_table(index='category', values='installs', aggfunc='sum')
    .sort_values(by=['installs', 'category'], ascending=False)
)
print('\nTop Categories by Number Installs:\n', category_df.head(50))

# Subcategories ranked by how many apps they contain (top 50)
most_popular_combo = (
    play_app_df.groupby(['subcategory'])
    .size()
    .nlargest(50)
    .reset_index(name='App Count')
)
print('\nTop Subcategories by Application Count:\n', most_popular_combo)

# Subcategories ranked by total installs
subcategory_df = (
    play_app_df.pivot_table(index='subcategory', values='installs', aggfunc='sum')
    .sort_values(by=['installs', 'subcategory'], ascending=False)
)
print('\nTop Subcategories by Number Installs:\n', subcategory_df.head(65))

In [125]:
#Analyze subcategories
# Total installs per subcategory within the PRODUCTIVITY category.
sub_df = play_app_df[play_app_df['category'] == "PRODUCTIVITY"]
category_df = (
    sub_df.pivot_table(index=['subcategory'], values='installs', aggfunc='sum')
    .sort_values(by=['installs', 'subcategory'], ascending=False)
)
print(sub_df['subcategory'].unique())
print('\nTop Categories by Number Installs:\n', category_df.head(60))

['alarmclock' 'blogging' 'holdthatthought' 'browsers' 'businessowners'
 'calculator' 'widgets' 'managetime' 'productivity' 'classroom'
 'classroomtools' 'cloudstorage' 'doityourself' 'drinkguides' 'engagement'
 'filemanagers' 'flashlight' 'topfree' 'getenergyback' 'homesweethome'
 'inbox' 'keyboards' 'pdftogo' 'speeduptasks' 'stylehome' 'topgrossing'
 'vpn' 'wifi']

Top Categories by Number Installs:
                    installs
subcategory                
productivity     9640000000
businessowners   1514100000
holdthatthought  1299500000
cloudstorage      613100000
topfree           500000000
keyboards         316050000
classroomtools    200100000
filemanagers      189100000
inbox             115100000
managetime         61200000
widgets            38110000
speeduptasks       26500000
alarmclock         20000000
calculator         18300000
pdftogo            12000000
wifi               10000000
blogging            8050000
vpn                 5000000
flashlight          5000000
styleho

In [127]:
#Analyze subcategories
# Installs of every app in the "chat" subcategory, grouped by category.
sub_df = play_app_df[play_app_df['subcategory'] == "chat"]
category_df = (
    sub_df.pivot_table(index=['category', 'app_id'], values='installs', aggfunc='sum')
    .sort_values(by=['category','installs'], ascending=False)
)
print('\nSubcategories by Number Installs:\n', category_df.head(60))


Subcategories by Number Installs:
                                                         installs
category      app_id                                            
TOOLS         com.riteshsahu.SMSBackupRestore           20000000
SOCIAL        com.badoo.mobile                         400000000
              com.gogii.textplus                        30000000
              com.taggedapp                             30000000
              com.enflick.android.TextNow               20000000
              com.pinger.textfree                       20000000
              me.dingtone.app.im                        20000000
              com.pinger.textfree.call                  15000000
              sh.whisper                                10000000
COMMUNICATION com.facebook.orca                       4000000000
              com.google.android.talk                 4000000000
              com.skype.raider                        4000000000
              com.whatsapp                            

In [82]:
# Total installs for every (category, subcategory) pair, largest first
category_df = (
    play_app_df.pivot_table(index=['category', 'subcategory'], values='installs', aggfunc='sum')
    .sort_values(by=['installs', 'subcategory', 'category'], ascending=False)
)
print('\nTop Categories by Number Installs:\n', category_df.head(60))

# Display most frequent combination of category and subcategory
#most_popular_combo = play_app_df.groupby(['category', 'subcategory']).size().nlargest(50).reset_index(name='App Count')
#print('\nTop Fifty Most Popular category and subcategory by app count:\n', most_popular_combo)


Top Categories by Number Installs:
                                                installs
category            subcategory                        
COMMUNICATION       chat                    24492000000
SOCIAL              socialapps               9683000000
PRODUCTIVITY        productivity             9640000000
MUSIC_AND_AUDIO     musicisyourlife          3000060000
TRAVEL_AND_LOCAL    findhome                 3000000000
PHOTOGRAPHY         photoedit                2510000000
                    sharemoments             2051000000
VIDEO_PLAYERS       sharemoments             2000000000
COMMUNICATION       browsers                 1940100000
PRODUCTIVITY        businessowners           1514100000
NEWS_AND_MAGAZINES  socialapps               1500000000
PRODUCTIVITY        holdthatthought          1299500000
COMMUNICATION       inbox                    1184510000
SOCIAL              topgrossing              1100000000
TRAVEL_AND_LOCAL    maps                     1025000000
NEWS_AND_MA

In [61]:
# Top 4000 words across all Play store apps, counting every occurrence of a
# token in every description
top_words_across_apps = compute_top_common_words(df=play_app_df, column="tokens", k=4000)




In [63]:
# Top 3500 words across all Play store apps, counting each word at most once
# per app (uses the per-app sets of distinct tokens)
top_words_across_apps_unique = compute_top_common_words(df=play_app_df, column="unique_tokens", k=3500)




In [47]:
# Count how many Play store apps were last updated on each weekday.
play_app_df['dayofweek'].value_counts()
# Observation from the recorded output: Monday and Tuesday are the most
# common days to update an app.

Monday       633
Tuesday      619
Thursday     583
Friday       538
Wednesday    503
Sunday       181
Saturday     175
Name: dayofweek, dtype: int64

In [91]:
#Analyze subcategories
# Restrict the Play store apps to the "chat" subcategory and report how many there are.
sub_df = play_app_df.loc[play_app_df['subcategory'] == "chat"]
print("Starting Length: {}".format(len(sub_df)))

# choose a number of tokens so that we get about 10 hits
top_words_in_sub = compute_top_common_words(sub_df, "tokens", 200)

# --- Disabled experiment: the remaining lines are commented out and do not run. ---
# They sketch the intended pipeline: filter tokens, split train/test, train LDA,
# then inspect topic distributions.
# NOTE(review): the disabled print below refers to `chat_df`, which is not
# defined anywhere in the visible notebook — confirm before re-enabling.

#sub_df['filtered_tokens'] = sub_df['tokens'].apply(keep_top_k_words)
#sub_df['num_filtered_tokens'] = sub_df['filtered_tokens'].apply(len)

#sub_df['remaining_tokens'] = sub_df['tokens'].apply(keep_unique_words)
#sub_df['num_remaining_tokens'] = sub_df['remaining_tokens'].apply(len)
#sub_df.head()

# drop out descriptions that are smaller than 20 tokens
# NOTE(review): the disabled filter below uses a threshold of 10, not 20.
#sub_df.loc[sub_df['num_filtered_tokens'] >= 10]
#print("\nLength After: {}".format(len(chat_df)))

#train_df, test_df = create_train_test_datasets(sub_df)
#dictionary, corpus, lda = train_lda(train_df, 10, 20)

#for i in range(0, lda.num_topics):
#     print(i, lda.show_topic(topicid=i, topn=7), "\n")
        
# we need to use nested list comprehension here
# this may take 1-2 minutes...
#doc_topic_dist = np.array([[tup[1] for tup in lst] for lst in lda[corpus]])
#doc_topic_dist.shape

#print_train_app_description (train_df)
#print_test_app_description (test_df, train_df, doc_topic_dist)

Starting Length: 99
-0.12063163934432528
0.01695594103445285
-0.30340141477801297
-0.05851548331518697
-0.007621823022224475
-0.04857712762324355
-0.048491272412650416
-0.0004638697703883004
-0.030652710275381894

tokens Stats: Total:1148 Min Hits:200 Words:[('message', 385), ('call', 338), ('phone', 294), ('video', 244), ('group', 209), ('friend', 203), ('number', 186), ('calling', 155), ('voice', 153), ('messaging', 145), ('photo', 138), ('sticker', 110), ('share', 101), ('feature', 97), ('international', 96), ('contact', 95), ('people', 95), ('texting', 88), ('family', 87), ('device', 87), ('chat', 80), ('android', 79), ('messenger', 76), ('charge', 61), ('mobile', 61), ('whatsapp', 60), ('user', 59), ('start', 57), ('world', 56), ('support', 55), ('picture', 54), ('backup', 54), ('emoji', 51), ('viber', 50), ('skype', 50), ('unlimited', 49), ('anyone', 48), ('receive', 46), ('google', 46), ('using', 45), ('connect', 45), ('network', 44), ('easily', 44), ('conversation', 43), ('touc

In [67]:
def jensen_shannon(query, matrix):
    """Jensen-Shannon distance between ``query`` and every row of ``matrix``.

    Despite being used for similarity ranking, the value returned is a
    *distance*: 0 means identical distributions, larger means less similar.

    Parameters
    ----------
    query : numpy.ndarray
        Topic distribution of one app description (assumed 1-D, length T).
    matrix : numpy.ndarray
        Topic distributions of the corpus (assumed shape ``(M, T)``, one row
        per app description).

    Returns
    -------
    numpy.ndarray
        Array of length M holding the distance from ``query`` to each row.
    """
    # lets keep with the p,q notation
    p = query[None,:].T # column vector, broadcasts against every document
    q = matrix.T # one column per document
    m = 0.5*(p + q)
    divergence = 0.5*(entropy(p,m) + entropy(q,m))
    # Rounding can push a near-zero divergence slightly below zero; sqrt would
    # then give NaN, and argsort would rank that (nearly identical) document last.
    return np.sqrt(np.maximum(divergence, 0.0))

def get_most_similar_documents(query,matrix,k=10):
    """Return the positional indices of the ``k`` corpus rows closest to ``query``.

    Closeness is the Jensen-Shannon distance computed by ``jensen_shannon``;
    indices are ordered from the smallest distance upwards.
    """
    distances = jensen_shannon(query, matrix)
    ranked = np.argsort(distances)
    return ranked[:k]

In [964]:
def create_train_test_datasets(df):
    """Split ``df`` into random train (~90%) and test (~10%) frames.

    Rows whose ``filtered_tokens`` value is not a ``list`` are discarded and
    the ``updated`` column is dropped before splitting. Both steps were
    previously evaluated but their results thrown away, so they had no effect.
    The caller's frame is no longer modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``filtered_tokens`` and ``updated``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train_df`` and ``test_df``, each with a fresh 0..n-1 index.
    """
    # make sure all tokenized items are lists
    is_token_list = df['filtered_tokens'].map(type) == list
    df = df[is_token_list].drop('updated', axis=1).reset_index(drop=True)

    # create a mask of binary values: True (about 90% of rows) selects training rows
    msk = np.random.rand(len(df)) < 0.9
    train_df = df[msk].reset_index(drop=True)
    test_df = df[~msk].reset_index(drop=True)
    print("LDA Len: {} Train Len: {} Test Len: {} \n".format(len(df),len(train_df),len(test_df)))
    return train_df, test_df

In [924]:
def train_lda(data, num_topics, chunk_size, passes=15):
    """Train a gensim LDA model on ``data['filtered_tokens']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``filtered_tokens`` column holds one token list per document.
    num_topics : int
        Number of topics to fit.
    chunk_size : int
        Number of documents per training chunk (gensim ``chunksize``).
    passes : int, optional
        Number of passes over the corpus. Defaults to 15, the value that was
        hard-coded before; several passes let the distributions stabilize on
        a small dataset.

    Returns
    -------
    (dictionary, corpus, lda)
        The gensim ``Dictionary``, the bag-of-words corpus, and the trained model.
    """
    dictionary = corpora.Dictionary(data['filtered_tokens'])
    corpus = [dictionary.doc2bow(doc) for doc in data['filtered_tokens']]
    t1 = time.time()
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    # (neither alpha nor eta is passed here, so gensim's defaults apply)
    lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                   chunksize=chunk_size, minimum_probability=0.0, passes=passes)
    t2 = time.time()
    # Report the size of the data actually trained on (this used to read an
    # undefined global, `app_df_lda`, and raised NameError).
    print("Time to train LDA model on ", len(data), "descriptions: ", (t2-t1)/60, "min")
    return dictionary,corpus,lda

In [925]:
# select an app at random from train_df
def print_train_app_description (train_df):
    """Pick a random training app, plot its topic distribution and print its top topics.

    Relies on the notebook-level ``dictionary`` and ``lda`` objects returned
    by ``train_lda``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame with ``filtered_tokens``, ``app_id`` and ``name`` columns.
    """
    random_app_index = np.random.randint(len(train_df))
    bow = dictionary.doc2bow(train_df.iloc[random_app_index, train_df.columns.get_loc("filtered_tokens")])
    print(random_app_index)
    print(train_df.iloc[random_app_index, train_df.columns.get_loc("app_id")])
    print(train_df.iloc[random_app_index, train_df.columns.get_loc("name")])

    # get the topic contributions for the document chosen at random above
    doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=bow)])

    # bar plot of topic distribution for this document
    fig, ax = plt.subplots(figsize=(12,6))

    ax.bar(np.arange(len(doc_distribution)), doc_distribution)
    ax.set_xlabel('Topic ID', fontsize=15)
    ax.set_ylabel('Topic Contribution', fontsize=15)
    # The title used to reference an undefined name (`random_article_index`) and raised NameError.
    ax.set_title("Topic Distribution for App Description " + str(random_app_index), fontsize=20)
    # NOTE(review): ticks at 10, 20, ..., 100 look sized for a 100-topic model — confirm
    # they suit the number of topics actually trained.
    ax.set_xticks(np.linspace(10,100,10))
    fig.tight_layout()
    plt.show()

    # print the five most contributing topics and their words
    for i in doc_distribution.argsort()[-5:][::-1]:
        print(i, lda.show_topic(topicid=i, topn=10), "\n")

In [981]:
# select and article at random from test_df
def print_test_app_description(test_df, train_df, doc_topic_dist):
    """Pick a random unseen app from ``test_df``, plot its topics and look up similar training apps.

    Relies on the notebook-level globals ``dictionary``, ``lda`` and
    ``get_most_similar_documents``; all must already exist when this is called.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Held-out apps; must contain ``filtered_tokens``, ``app_id`` and ``name``.
    train_df : pandas.DataFrame
        Training apps from which the similar apps are selected.
    doc_topic_dist
        Passed straight through to ``get_most_similar_documents``.
        NOTE(review): presumably the per-document topic distributions of
        ``train_df`` -- confirm against that helper.

    Returns
    -------
    pandas.DataFrame
        Up to five rows of ``train_df`` whose index is among the ids returned by
        ``get_most_similar_documents``. (Previously this frame was computed and
        then discarded.)
    """
    random_app_index = np.random.randint(len(test_df))
    # Fixed: columns are now looked up on test_df itself. The original resolved
    # column positions on train_df and applied them to test_df, which reads the
    # wrong cell whenever the two frames have different column layouts.
    app_row = test_df.iloc[random_app_index]

    new_bow = dictionary.doc2bow(app_row["filtered_tokens"])
    print(random_app_index)
    print(app_row["app_id"])
    print(app_row["name"])

    # NOTE(review): treating array position as topic id assumes
    # get_document_topics returns every topic in order -- confirm.
    new_doc_distribution = np.array([prob for _, prob in lda.get_document_topics(bow=new_bow)])

    # Same visual analysis as for training documents, on this unseen document.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(np.arange(len(new_doc_distribution)), new_doc_distribution)
    ax.set_xlabel('Topic ID', fontsize=15)
    ax.set_ylabel('Topic Contribution', fontsize=15)
    ax.set_title("Topic Distribution for an Unseen Application", fontsize=20)
    # NOTE(review): ticks are hard-coded to 10..100; presumably sized for a
    # 100-topic model -- confirm against num_topics.
    ax.set_xticks(np.linspace(10, 100, 10))
    fig.tight_layout()
    plt.show()

    # Print the top 5 contributing topics and their words.
    for topic_id in new_doc_distribution.argsort()[-5:][::-1]:
        print(topic_id, lda.show_topic(topicid=topic_id, topn=10), "\n")

    most_sim_ids = get_most_similar_documents(new_doc_distribution, doc_topic_dist)
    # NOTE(review): this matches ids against train_df's index labels; if the
    # helper returns positional indices and train_df is not 0..n-1 indexed,
    # the wrong rows are selected -- confirm.
    most_similar_df = train_df[train_df.index.isin(most_sim_ids)]
    # Fixed: a bare ``most_similar_df.head(5)`` inside a function displays
    # nothing; return it so the notebook can render or reuse the result.
    return most_similar_df.head(5)

In [None]:
# Build the "most popular" subsets of ios_app_df, one per chart (origin list),
# and export each subset plus the merged views as JSON Lines files.
#
# The public variable names below (including their inconsistent capitalisation)
# are kept as-is in case other cells refer to them.

MIN_NUM_RATINGS = 300.0       # an app needs at least this many ratings
MIN_RATING = 3.0              # ... and at least this average rating
PAID_CHART_MAX_POSITION = 25  # revenue / paid charts: keep positions <= 25
FREE_CHART_MAX_POSITION = 50  # free charts: keep positions <= 50


def _write_json_lines(apps_df, path):
    """Write ``apps_df`` to ``path`` as JSON Lines (one record per line)."""
    with open(path, 'w') as f:
        f.write(apps_df.to_json(orient='records', lines=True))


def _select_popular_apps(apps_df, origin_list, max_position,
                         min_num_ratings=MIN_NUM_RATINGS, min_rating=MIN_RATING):
    """Filter ``apps_df`` down to the popular apps of one chart.

    Steps (the row count is printed after each one): keep rows whose
    ``ios_origin_list`` equals ``origin_list``, drop duplicate ``ios_app_id``
    rows, require ``ios_num_ratings >= min_num_ratings``, require
    ``ios_rating >= min_rating``, require ``ios_found_position <= max_position``.

    Returns the filtered DataFrame.
    """
    selected = apps_df.loc[apps_df['ios_origin_list'] == origin_list]
    print(len(selected))
    selected = selected.drop_duplicates(subset=['ios_app_id'])
    print(len(selected))
    selected = selected.loc[selected['ios_num_ratings'] >= min_num_ratings]
    print(len(selected))
    selected = selected.loc[selected['ios_rating'] >= min_rating]
    print(len(selected))
    selected = selected.loc[selected['ios_found_position'] <= max_position]
    print(len(selected))
    return selected


def _merge_unique_apps(frames):
    """Concatenate ``frames`` and keep the first row per ``ios_app_id``.

    Prints the row count before and after de-duplication.
    """
    merged = pd.concat(frames, ignore_index=True)
    print(len(merged))
    merged = merged.drop_duplicates(subset=['ios_app_id'])
    print(len(merged))
    return merged


print("Apps by Revenue")
# Fixed: this chart used a strict ``> 300`` ratings filter while every other
# chart used ``>= 300``; it now goes through the same helper as the rest.
Most_popular_rev_apps = _select_popular_apps(ios_app_df, 'appsByRevenue', PAID_CHART_MAX_POSITION)
_write_json_lines(Most_popular_rev_apps, 'most_popular_revenue_apps.json')

print("Apps Paid")
Most_popular_paid_apps = _select_popular_apps(ios_app_df, 'paidApplications', PAID_CHART_MAX_POSITION)
_write_json_lines(Most_popular_paid_apps, 'most_popular_paid_apps.json')

print("Ipad Paid")
# Fixed: one of the progress prints for this chart reported the length of
# Most_popular_paid_apps instead of Most_ipad_paid_apps.
Most_ipad_paid_apps = _select_popular_apps(ios_app_df, 'paidIpadApplications', PAID_CHART_MAX_POSITION)
_write_json_lines(Most_ipad_paid_apps, 'most_popular_paid_ipad_apps.json')

print("Merged Paid")
# Fixed: previously referenced ``most_popular_rev_apps`` (lower-case), which is
# never defined in this cell and raises NameError on a fresh kernel.
df_row_paid_merged = _merge_unique_apps([Most_popular_rev_apps, Most_popular_paid_apps])
_write_json_lines(df_row_paid_merged, 'most_popular_paid_merged_apps.json')

print("Apps Free")
most_popular_free_apps = _select_popular_apps(ios_app_df, 'freeApplications', FREE_CHART_MAX_POSITION)
_write_json_lines(most_popular_free_apps, 'most_popular_free_apps.json')

print("Apps Free Ipad")
most_popular_free_ipad_apps = _select_popular_apps(ios_app_df, 'freeIpadApplications', FREE_CHART_MAX_POSITION)
_write_json_lines(most_popular_free_ipad_apps, 'most_popular_free_ipad_apps.json')

print("Merged Free")
df_row_free_merged = _merge_unique_apps([most_popular_free_apps, most_popular_free_ipad_apps])
_write_json_lines(df_row_free_merged, 'most_popular_free_merged_apps.json')

print("Merged All")
# Concatenation order is preserved from the original cell because
# drop_duplicates keeps the first occurrence of each ios_app_id.
df_row_merged = _merge_unique_apps([
    Most_popular_rev_apps,        # fixed: was the undefined ``most_popular_rev_apps``
    Most_ipad_paid_apps,
    most_popular_free_ipad_apps,
    most_popular_free_apps,
    Most_popular_paid_apps,
])
# Every input was already filtered on these thresholds, so the two filters
# below should not remove anything; they are kept as a sanity check and to
# keep the printed row counts in the same shape as before.
df_row_merged = df_row_merged.loc[df_row_merged['ios_num_ratings'] >= MIN_NUM_RATINGS]
print(len(df_row_merged))
df_row_merged = df_row_merged.loc[df_row_merged['ios_rating'] >= MIN_RATING]
print(len(df_row_merged))
_write_json_lines(df_row_merged, 'most_popular_apps.json')
    
# Category frequencies within the merged "most popular" set (after munging).
# nlargest(150) already caps the result at 150 rows.
most_popular_combo = (
    df_row_merged
    .groupby(['ios_category'])
    .size()
    .nlargest(150)
    .reset_index(name='App Count')
)
print('Top category by app count: ', most_popular_combo)

# Category frequencies across the full app table (top 50).
category_df = (
    ios_app_df
    .groupby(['ios_category'])
    .size()
    .nlargest(50)
    .reset_index(name='App Count')
)
print('Top Categories by Application Count: ', category_df)

# Total number of ratings per category (possibly a proxy for installs),
# largest first, ties broken by category name in descending order.
ratings_by_category = ios_app_df.pivot_table(
    index='ios_category',
    values="ios_num_ratings",
    aggfunc='sum',
)
category_df = ratings_by_category.sort_values(
    by=['ios_num_ratings', 'ios_category'],
    ascending=False,
)
print('Top Categories by Number of Ratings: ', category_df.head(50))

# The 150 most frequent (category, origin_list) pairs, then ordered
# alphabetically by category and origin list for easier reading.
category_origin_counts = (
    ios_app_df
    .groupby(['ios_category', 'ios_origin_list'])
    .size()
    .nlargest(150)
    .reset_index(name='App Count')
)
most_popular_combo = category_origin_counts.sort_values(
    by=['ios_category', 'ios_origin_list', 'App Count'],
    ascending=True,
)
print('Top category and origin_list by app count: ', most_popular_combo)
