In [593]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn.model_selection as sk
sb.set()

In [594]:
googleplay = pd.read_csv('C:/Users/davyn/OneDrive/Desktop/SC1015 mini-project/googleplay2/Google-Playstore.csv')

In [595]:
googleplay.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312944 entries, 0 to 2312943
Data columns (total 24 columns):
 #   Column             Dtype  
---  ------             -----  
 0   App Name           object 
 1   App Id             object 
 2   Category           object 
 3   Rating             float64
 4   Rating Count       float64
 5   Installs           object 
 6   Minimum Installs   float64
 7   Maximum Installs   int64  
 8   Free               bool   
 9   Price              float64
 10  Currency           object 
 11  Size               object 
 12  Minimum Android    object 
 13  Developer Id       object 
 14  Developer Website  object 
 15  Developer Email    object 
 16  Released           object 
 17  Last Updated       object 
 18  Content Rating     object 
 19  Privacy Policy     object 
 20  Ad Supported       bool   
 21  In App Purchases   bool   
 22  Editors Choice     bool   
 23  Scraped Time       object 
dtypes: bool(4), float64(4), int64(1), object(15)
memor

## Dropping unneccessary Columns (e.g. Developer ID, Email, etc.)

In [596]:
googleplay = googleplay[['Category','Rating','Rating Count','Installs', 'Free', 'Price', 'Size', 'Last Updated', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Editors Choice']]
googleplay.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312944 entries, 0 to 2312943
Data columns (total 12 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Category          object 
 1   Rating            float64
 2   Rating Count      float64
 3   Installs          object 
 4   Free              bool   
 5   Price             float64
 6   Size              object 
 7   Last Updated      object 
 8   Content Rating    object 
 9   Ad Supported      bool   
 10  In App Purchases  bool   
 11  Editors Choice    bool   
dtypes: bool(4), float64(3), object(5)
memory usage: 150.0+ MB


## Removing all rows with null values 

In [597]:
googleplayclean = googleplay.dropna()

# Removing all apps with a rating of 0, to avoid using apps that have 0 outreach and/or are brand new

In [598]:
googleplayclean = googleplayclean[(googleplayclean['Rating'] > 0)]

## Cleaning the Dataset, so that only Game Categories Remain

In [599]:
games = ['Word', 'Strategy', 'Card', 'Board', 'Trivia', 'Racing', 'Role Playing', 'Adventure', 'Action', 'Simulation', 'Arcade', 'Puzzle', 'Casual']

In [600]:
googleplayclean = googleplayclean[googleplayclean['Category'].isin(games)]

In [601]:
print(googleplayclean['Category'].value_counts())

Puzzle          31939
Arcade          30315
Casual          28969
Simulation      19130
Action          18113
Adventure       15733
Role Playing     8239
Racing           7539
Trivia           7204
Board            7142
Card             6162
Strategy         5966
Word             5859
Name: Category, dtype: int64


# Clean 'Size' column, convert all to kilobytes 

In [602]:
googleplayclean['Size'].value_counts()

Varies with device    9808
19M                   3575
18M                   3493
23M                   3480
25M                   3432
                      ... 
531k                     1
281k                     1
239M                     1
559k                     1
870k                     1
Name: Size, Length: 1117, dtype: int64

In [603]:
googleplayclean.drop(googleplayclean[googleplayclean['Size'] == 'Varies with device'].index, inplace=True)

In [604]:
def convert_size(x): #convert to kilobytes
    unit = x[-1]
    value = x[0:-1]
    if(unit == 'M'):
        value = value.replace(',', '')
        # value = float(x[0:-1])
        value = float(value)
        value = value*1024
        
    elif(unit == 'k'):
        # value = x[0:-1]
        value = value.replace(',', '')
        value = float(value)
    
    return value

In [605]:
googleplayclean['Size'] = googleplayclean.apply(lambda row : convert_size(row['Size']), axis = 1)

In [606]:
googleplayclean['Size']

10         52224.0
22         16384.0
25         25600.0
43         26624.0
69         39936.0
            ...   
2312846    29696.0
2312866    22528.0
2312932    37888.0
2312939    78848.0
2312943     5324.8
Name: Size, Length: 182502, dtype: object

# Cleaning the 'Installs' Column

In [607]:
installs_0 = ['0+', '1+', '5+', '10+', '50+', '100+', '500+', '1,000+', '5,000+', '10,000+', '50,000+', '100,000+']
googleplayclean.loc[~googleplayclean['Installs'].isin(installs_0), 'Installs'].value_counts()

1,000,000+        10269
500,000+           6999
10,000,000+        2285
5,000,000+         2249
50,000,000+         299
100,000,000+        170
500,000,000+         10
1,000,000,000+        1
Name: Installs, dtype: int64

# Turning Installs into a "High" or "Low" Variable

In [608]:
googleplayclean.loc[~googleplayclean['Installs'].isin(installs_0), 'Installs'] = '>=500000'
googleplayclean.loc[googleplayclean['Installs'].isin(installs_0), 'Installs'] = '<500000'


In [609]:
googleplayclean['Installs'].value_counts()

<500000     160220
>=500000     22282
Name: Installs, dtype: int64

# Cleaning the 'Ratings' Column

In [610]:
googleplayclean.describe()[['Rating']]

Unnamed: 0,Rating
count,182502.0
mean,4.099288
std,0.619297
min,1.0
25%,3.8
50%,4.2
75%,4.5
max,5.0


In [611]:
ratings_0 = [4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0]
ratings_1 = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9]

In [612]:
googleplayclean.loc[googleplayclean['Rating'].isin(ratings_0), 'Rating'] = '>=4.0'
googleplayclean.loc[googleplayclean['Rating'].isin(ratings_1), 'Rating'] = '<4.0'

In [613]:
googleplayclean['Rating'].value_counts()

>=4.0    116328
<4.0      66174
Name: Rating, dtype: int64

# Convert 'Last Updated' into datetime

In [614]:
from datetime import datetime
googleplayclean['Last Updated'] = googleplayclean.apply(lambda row : datetime.strptime(row['Last Updated'], '%b %d, %Y'), axis = 1)
print(googleplayclean['Last Updated'])

10        2020-07-30
22        2020-05-11
25        2018-03-26
43        2021-05-01
69        2021-05-24
             ...    
2312846   2020-08-15
2312866   2016-04-08
2312932   2017-03-06
2312939   2021-06-01
2312943   2019-08-19
Name: Last Updated, Length: 182502, dtype: datetime64[ns]


# Creating a "Success" column, based of Installs and Rating


In [615]:
googleplayclean["Success"] = googleplayclean["Rating"] + googleplayclean["Installs"]

In [616]:
success_0 = ['>=4.0>=500000']
googleplayclean.loc[~googleplayclean['Success'].isin(success_0), 'Success'] = False
googleplayclean.loc[googleplayclean['Success'].isin(success_0), 'Success'] = True
googleplayclean['Success'].value_counts()

False    168140
True      14362
Name: Success, dtype: int64

In [617]:
googleplayclean['Success'].astype(bool)

10         False
22         False
25         False
43         False
69         False
           ...  
2312846    False
2312866    False
2312932     True
2312939    False
2312943    False
Name: Success, Length: 182502, dtype: bool

In [618]:
googleplayclean.to_csv("googleplayclean.csv")