In [433]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

In [434]:
# Load the App Store games dataset; the relative path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer='appstore_games.csv')

In [450]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,URL,ID,Name,Subtitle,Icon URL,Average User Rating,User Rating Count,Price,In-app Purchases,Description,...,isSports & Activities,isGaming,isComics & Cartoons,isAnimals & Nature,isPeople,isKids & Family,isArt,isPlaces & Objects,isWeather,isMagazines & Newspapers
0,https://apps.apple.com/us/app/sudoku/id284921427,284921427,Sudoku,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,4.0,3553.0,2.99,,"Join over 21,000,000 of our fans and download ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://apps.apple.com/us/app/reversi/id284926400,284926400,Reversi,,https://is4-ssl.mzstatic.com/image/thumb/Purpl...,3.5,284.0,1.99,,"The classic game of Reversi, also known as Oth...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://apps.apple.com/us/app/morocco/id284946595,284946595,Morocco,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,3.0,8376.0,0.0,,Play the classic strategy game Othello (also k...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://apps.apple.com/us/app/sudoku-free/id28...,285755462,Sudoku (Free),,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,3.5,190394.0,0.0,,"Top 100 free app for over a year.\nRated ""Best...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,https://apps.apple.com/us/app/senet-deluxe/id2...,285831220,Senet Deluxe,,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,3.5,28.0,2.99,,"""Senet Deluxe - The Ancient Game of Life and A...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Adding a column with the length of the description
Just the raw text is less helpful as a feature, and I think its length (in words) is the best way of turning it into a number.

The same counting is also applied to the number of languages and the number of in-app purchases.

In [436]:
def getLen(n):
    """Count the whitespace-separated tokens in a text cell.

    Parameters
    ----------
    n : str or missing value
        Cell content. Anything that is not a string (NaN, None, ...) is
        treated as missing.

    Returns
    -------
    int
        Number of tokens produced by ``str.split()``; 0 for missing cells.
    """
    # A type check (rather than comparing str(n) with 'nan') also covers
    # None / pd.NA and does not mistake the literal text "nan" for a gap.
    if not isinstance(n, str):
        return 0
    return len(n.split())


In [437]:
# Derive one token-count column per text column: (new column, source column).
for countColumn, textColumn in (
    ('Description Length', 'Description'),
    ('Num Languages', 'Languages'),
    ('Num In-App', 'In-app Purchases'),
):
    data[countColumn] = data[textColumn].apply(getLen)

### Adding a column with the most expensive in-app purchase

In [438]:
def getMax(n):
    """Return the highest price listed in an in-app purchases cell.

    Parameters
    ----------
    n : str or missing value
        Comma-separated prices such as ``'9.99, 49.99, 3.99'``. Anything that
        is not a string (NaN, None, ...) is treated as missing.

    Returns
    -------
    float or int
        The largest price as a float, or 0 when the cell is missing or
        contains no prices.

    Raises
    ------
    ValueError
        If a non-empty entry cannot be parsed as a number.
    """
    if not isinstance(n, str):
        return 0
    # Split on the bare comma (float() ignores surrounding spaces) so that
    # '1.99,0.99' parses too; blank pieces such as a trailing comma are skipped.
    prices = [float(piece) for piece in n.split(',') if piece.strip()]
    return max(prices, default=0)

In [439]:
# Highest listed in-app purchase price per game, computed by getMax.
inAppPurchases = data['In-app Purchases']
data['Max In-App'] = inAppPurchases.apply(getMax)

### One-hot encoding for genres

In [440]:
# Map each primary-genre label to an integer code.
enc_pgenre = preprocessing.LabelEncoder()
encoded_pgenre = enc_pgenre.fit_transform(data['Primary Genre'])

# One-hot encode the integer codes into a dense array (one row per game).
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
try:
    onehot_pgenre = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_pgenre = preprocessing.OneHotEncoder(sparse=False)
transformed_pgenre = onehot_pgenre.fit_transform(encoded_pgenre.reshape(-1,1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [441]:
def getIndexOneHot(col):
    """Return the elements of ``col`` as a new list, in iteration order."""
    return [entry for entry in col]
    
# One plain list per row of the one-hot matrix.
ohGenres = [getIndexOneHot(row) for row in transformed_pgenre]

In [442]:
# Distinct primary genres present in the data.
keys = data['Primary Genre'].unique()

# Push every distinct genre through both fitted encoders and record which
# one-hot column is set for it.
genreCodes = enc_pgenre.transform(keys).reshape(-1,1)
values = [list(row).index(1) for row in onehot_pgenre.transform(genreCodes)]

# genre name -> one-hot column position
genreDict = {genre: position for genre, position in zip(keys, values)}

# Independent copy of the same mapping under a second name.
preGenreDict = dict(genreDict)

In [443]:
def multipleHotIncode(genresList, genreIndex=None):
    """Multi-hot encode a comma-separated list of genre names.

    Parameters
    ----------
    genresList : str or missing value
        Genre names separated by commas, e.g. ``'Games, Strategy, Puzzle'``.
        Anything that is not a string (NaN, None, ...) is treated as a cell
        with no genres.
    genreIndex : dict, optional
        Mapping of genre name -> vector position. Defaults to the module-level
        ``genreDict``. Positions are expected to be ``0 .. len(genreIndex) - 1``.

    Returns
    -------
    numpy.ndarray
        Float vector with 1 at the position of every listed genre. Its length
        equals the size of the mapping after this call, so vectors returned by
        earlier calls can be shorter than later ones.

    Side effects
    ------------
    A genre that is not yet in the mapping is registered in place with the
    next free position.
    """
    index = genreDict if genreIndex is None else genreIndex
    out = np.zeros(len(index))
    # A missing cell has no genres: return the all-zero vector instead of
    # failing on `.split`.
    if not isinstance(genresList, str):
        return out
    for genre in genresList.split(','):
        # Strip so that 'A,B' and 'A, B' name the same genres; skip blanks.
        genre = genre.strip()
        if not genre:
            continue
        if genre in index:
            out[index[genre]] = 1
        else:
            # Unseen genre: claim the next position and grow the vector by one.
            index[genre] = len(index)
            out = np.append(out, 1)
    return out
    

In [444]:
# Encode every game's genre string as a multi-hot vector.
genreStrings = data['Genres']
mhGenres = genreStrings.apply(multipleHotIncode)

In [445]:
# Column names for the multi-hot frame: entry i is the genre whose position
# in genreDict is i, prefixed with 'is'.
genreAt = {position: name for name, position in genreDict.items()}
genreList = ['is' + genreAt[i] for i in range(len(genreDict))]

# Same ordering rule for the primary-genre one-hot columns.
primaryGenreAt = {position: name for name, position in preGenreDict.items()}
primaryGenreList = ['primarilyIs' + primaryGenreAt[i] for i in range(len(preGenreDict))]

In [446]:
# Multi-hot genre frame: one row per game, missing entries replaced with 0.
mhRows = list(mhGenres)
mhGenresDF = pd.DataFrame(mhRows, columns=genreList)
mhGenresDF = mhGenresDF.fillna(0)

# Primary-genre one-hot frame, built the same way.
ohRows = list(ohGenres)
ohGenresDF = pd.DataFrame(ohRows, columns=primaryGenreList)
ohGenresDF = ohGenresDF.fillna(0)

In [447]:
# Append the primary-genre and multi-hot genre columns to the right of the existing ones.
data = pd.concat(objs=[data, ohGenresDF, mhGenresDF], axis='columns')

In [448]:
# Lift the column-display limit inside this block only, then print the frame
# as plain text.
with pd.option_context('display.max_columns', None):  # more options can be specified also
    print(data)

                                                     URL          ID  \
0       https://apps.apple.com/us/app/sudoku/id284921427   284921427   
1      https://apps.apple.com/us/app/reversi/id284926400   284926400   
2      https://apps.apple.com/us/app/morocco/id284946595   284946595   
3      https://apps.apple.com/us/app/sudoku-free/id28...   285755462   
4      https://apps.apple.com/us/app/senet-deluxe/id2...   285831220   
...                                                  ...         ...   
17002  https://apps.apple.com/us/app/stack-puzzle-ris...  1474626442   
17003  https://apps.apple.com/us/app/eachother/id1474...  1474919257   
17004  https://apps.apple.com/us/app/rabbit-vs-tortoi...  1474962324   
17005  https://apps.apple.com/us/app/fatall/id1474963671  1474963671   
17006  https://apps.apple.com/us/app/the-three-kingdo...  1475076711   

                            Name                        Subtitle  \
0                         Sudoku                             NaN   

In [449]:
# Bare expression as the last line of the cell: the notebook renders the frame.
data

Unnamed: 0,URL,ID,Name,Subtitle,Icon URL,Average User Rating,User Rating Count,Price,In-app Purchases,Description,...,isSports & Activities,isGaming,isComics & Cartoons,isAnimals & Nature,isPeople,isKids & Family,isArt,isPlaces & Objects,isWeather,isMagazines & Newspapers
0,https://apps.apple.com/us/app/sudoku/id284921427,284921427,Sudoku,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,4.0,3553.0,2.99,,"Join over 21,000,000 of our fans and download ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://apps.apple.com/us/app/reversi/id284926400,284926400,Reversi,,https://is4-ssl.mzstatic.com/image/thumb/Purpl...,3.5,284.0,1.99,,"The classic game of Reversi, also known as Oth...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://apps.apple.com/us/app/morocco/id284946595,284946595,Morocco,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,3.0,8376.0,0.00,,Play the classic strategy game Othello (also k...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://apps.apple.com/us/app/sudoku-free/id28...,285755462,Sudoku (Free),,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,3.5,190394.0,0.00,,"Top 100 free app for over a year.\nRated ""Best...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,https://apps.apple.com/us/app/senet-deluxe/id2...,285831220,Senet Deluxe,,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,3.5,28.0,2.99,,"""Senet Deluxe - The Ancient Game of Life and A...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17002,https://apps.apple.com/us/app/stack-puzzle-ris...,1474626442,Stack Puzzle : Rise Tower,"Blast the cubes, solve puzzle!",https://is5-ssl.mzstatic.com/image/thumb/Purpl...,,,0.00,,"The goal is very simple, move the square horiz...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17003,https://apps.apple.com/us/app/eachother/id1474...,1474919257,EachOther,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,,,0.00,,Collect a score while you play!!\n\nBy linking...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17004,https://apps.apple.com/us/app/rabbit-vs-tortoi...,1474962324,Rabbit Vs Tortoise,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,,,0.00,,"""Rabbit Vs Tortoise is chess type cool simple ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17005,https://apps.apple.com/us/app/fatall/id1474963671,1474963671,FaTaLL,Most fun game!!!,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,,,0.00,"9.99, 49.99, 3.99",Upgrade your character and use your skills to ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
