# 2.1- Data Preparation - Train.csv

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("datasets/TrainDataAnalysis.csv")

In [3]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,review_group
0,GPS Status & Toolbox,TRAVEL_AND_LOCAL,1.0,149723,4.1M,"10,000,000+",Free,0,Everyone,Travel & Local,"June 13, 2018",8.1.171,4.0 and up,[26432+)
1,Fish|Hunt FL,LIFESTYLE,0.0,853,34M,"100,000+",Free,0,Everyone,Lifestyle,"March 22, 2018",3.2.0,4.1 and up,"[23, 889)"
2,Allrecipes Dinner Spinner,FOOD_AND_DRINK,1.0,61881,Varies with device,"5,000,000+",Free,0,Everyone,Food & Drink,"April 10, 2018",Varies with device,Varies with device,[26432+)
3,Don't touch my phone,TOOLS,0.0,21943,2.3M,"5,000,000+",Free,0,Everyone,Tools,"October 19, 2016",30,2.3 and up,"[889, 26432)"
4,BM SPM Practice,FAMILY,1.0,6,9.2M,"1,000+",Free,0,Everyone,Education,"January 24, 2018",1.0,4.1 and up,"[0, 23)"


## Feature Engineering content

* Delete `App` column, because is different in each row
* Change `Size` column values to millions or thousands
* Delete `Install` signature of + 
* Delete `$` symbol to Price
* Dummy encode `Categories, Content Rating, Genres`, because they don't have any range as ordinal data
* Last updated change to one column with year, one column with month, one column with day
* Delete `Current Ver`, because don't give any value
* In `Android Ver` edit number deleting the word "and up", and deleting the outliers with forma "-"
* Boolean values for `Type`
* Give an ordinal number to `review_group` as range of importance
* Delete `App, Size, Installs, Price, Last Updated, LastUpdatedNew, Current Ver, Android Ver, review_group` because is already encoded
* Impute Null values
* Normalization, Standarization???

### 1. Personalized Changes

#### 1.1 Changing Size format
From values with M and k, to floats

In [4]:
def change_M_k(df, new_column_name, column_name):
    df[new_column_name] = df[column_name].apply(lambda x: 
                         float(x.split("M")[0]) / 100 if x[-1] == "M" else 
                         float(x.split("k")[0]) / 100 if x[-1] == "k" else
                         np.nan
                    )
    return df

In [5]:
df = change_M_k(df, "SizeNew", "Size")

In [6]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,review_group,SizeNew
0,GPS Status & Toolbox,TRAVEL_AND_LOCAL,1.0,149723,4.1M,"10,000,000+",Free,0,Everyone,Travel & Local,"June 13, 2018",8.1.171,4.0 and up,[26432+),0.041
1,Fish|Hunt FL,LIFESTYLE,0.0,853,34M,"100,000+",Free,0,Everyone,Lifestyle,"March 22, 2018",3.2.0,4.1 and up,"[23, 889)",0.34
2,Allrecipes Dinner Spinner,FOOD_AND_DRINK,1.0,61881,Varies with device,"5,000,000+",Free,0,Everyone,Food & Drink,"April 10, 2018",Varies with device,Varies with device,[26432+),
3,Don't touch my phone,TOOLS,0.0,21943,2.3M,"5,000,000+",Free,0,Everyone,Tools,"October 19, 2016",30,2.3 and up,"[889, 26432)",0.023
4,BM SPM Practice,FAMILY,1.0,6,9.2M,"1,000+",Free,0,Everyone,Education,"January 24, 2018",1.0,4.1 and up,"[0, 23)",0.092


#### 1.2 Changing installs "+" format for integers

In [7]:
def delimiter_format(df, new_column_name, column_name):
    df[new_column_name] = df[column_name].apply(lambda x: x.split("+")[0] if x[-1] == "+" else np.nan)
    df[new_column_name] = df[new_column_name].apply(lambda x: int(x.replace(',', '')))
    
    return df

In [8]:
df = delimiter_format(df, 'InstallsNew', 'Installs')

In [9]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,review_group,SizeNew,InstallsNew
0,GPS Status & Toolbox,TRAVEL_AND_LOCAL,1.0,149723,4.1M,"10,000,000+",Free,0,Everyone,Travel & Local,"June 13, 2018",8.1.171,4.0 and up,[26432+),0.041,10000000
1,Fish|Hunt FL,LIFESTYLE,0.0,853,34M,"100,000+",Free,0,Everyone,Lifestyle,"March 22, 2018",3.2.0,4.1 and up,"[23, 889)",0.34,100000
2,Allrecipes Dinner Spinner,FOOD_AND_DRINK,1.0,61881,Varies with device,"5,000,000+",Free,0,Everyone,Food & Drink,"April 10, 2018",Varies with device,Varies with device,[26432+),,5000000
3,Don't touch my phone,TOOLS,0.0,21943,2.3M,"5,000,000+",Free,0,Everyone,Tools,"October 19, 2016",30,2.3 and up,"[889, 26432)",0.023,5000000
4,BM SPM Practice,FAMILY,1.0,6,9.2M,"1,000+",Free,0,Everyone,Education,"January 24, 2018",1.0,4.1 and up,"[0, 23)",0.092,1000


#### 1.3 Changing symbol "$" in Pricing

In [10]:
def delete_price_symbol(df, new_column_name, column_name):
    df[new_column_name] = df[column_name].apply(lambda x: float(x.split("$")[1]) if x[0] == "$" else 0)
    return df

In [11]:
df = delete_price_symbol(df, "PriceNew", "Price")

In [12]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,review_group,SizeNew,InstallsNew,PriceNew
0,GPS Status & Toolbox,TRAVEL_AND_LOCAL,1.0,149723,4.1M,"10,000,000+",Free,0,Everyone,Travel & Local,"June 13, 2018",8.1.171,4.0 and up,[26432+),0.041,10000000,0.0
1,Fish|Hunt FL,LIFESTYLE,0.0,853,34M,"100,000+",Free,0,Everyone,Lifestyle,"March 22, 2018",3.2.0,4.1 and up,"[23, 889)",0.34,100000,0.0
2,Allrecipes Dinner Spinner,FOOD_AND_DRINK,1.0,61881,Varies with device,"5,000,000+",Free,0,Everyone,Food & Drink,"April 10, 2018",Varies with device,Varies with device,[26432+),,5000000,0.0
3,Don't touch my phone,TOOLS,0.0,21943,2.3M,"5,000,000+",Free,0,Everyone,Tools,"October 19, 2016",30,2.3 and up,"[889, 26432)",0.023,5000000,0.0
4,BM SPM Practice,FAMILY,1.0,6,9.2M,"1,000+",Free,0,Everyone,Education,"January 24, 2018",1.0,4.1 and up,"[0, 23)",0.092,1000,0.0


#### 1.4 Encoding Dates, separating each year, month and weekday

In [13]:
def change_date_format(df, new_column_name, column_name):
    df[new_column_name] = pd.to_datetime(df[column_name])
    df["{}Year".format(new_column_name)] = df[new_column_name].dt.year
    df["{}Month".format(new_column_name)] = df[new_column_name].dt.month
    df["{}Day".format(new_column_name)] = df[new_column_name].dt.day
    return df

In [14]:
df = change_date_format(df, "LastUpdatedNew", "Last Updated")

In [15]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,...,Current Ver,Android Ver,review_group,SizeNew,InstallsNew,PriceNew,LastUpdatedNew,LastUpdatedNewYear,LastUpdatedNewMonth,LastUpdatedNewDay
0,GPS Status & Toolbox,TRAVEL_AND_LOCAL,1.0,149723,4.1M,"10,000,000+",Free,0,Everyone,Travel & Local,...,8.1.171,4.0 and up,[26432+),0.041,10000000,0.0,2018-06-13,2018,6,13
1,Fish|Hunt FL,LIFESTYLE,0.0,853,34M,"100,000+",Free,0,Everyone,Lifestyle,...,3.2.0,4.1 and up,"[23, 889)",0.34,100000,0.0,2018-03-22,2018,3,22
2,Allrecipes Dinner Spinner,FOOD_AND_DRINK,1.0,61881,Varies with device,"5,000,000+",Free,0,Everyone,Food & Drink,...,Varies with device,Varies with device,[26432+),,5000000,0.0,2018-04-10,2018,4,10
3,Don't touch my phone,TOOLS,0.0,21943,2.3M,"5,000,000+",Free,0,Everyone,Tools,...,30,2.3 and up,"[889, 26432)",0.023,5000000,0.0,2016-10-19,2016,10,19
4,BM SPM Practice,FAMILY,1.0,6,9.2M,"1,000+",Free,0,Everyone,Education,...,1.0,4.1 and up,"[0, 23)",0.092,1000,0.0,2018-01-24,2018,1,24


#### 1.5 Converting semversion to a float

In [16]:
def semversion_to_number(df, new_column_name, column_name):
    ## Find rare characters
    df[new_column_name] = df[column_name].apply(lambda x: 
                            x.split("W")[0] if x[-8] == "W" else
                            x.split("-")[0] if x[-7] == "-" else
                            x.split(" and up")[0] if x[-1] == "p" else
                            str(0)) ## change later this cero for NaN
    
    ## Find records with 3 poinbt version, like 4.0.2 and replace for float with first version
    df[new_column_name] = df[new_column_name].apply(lambda x: 
                            float(x.split(".")[0]) if x.count('.') == 2 else
                            float(x))
    
    ## Change zeros for NaNs
    df[new_column_name] = df[new_column_name].apply(lambda x: 
                            np.nan if x == 0 else
                            x)
    
    return df

In [17]:
df = semversion_to_number(df, "AndroidVerNew", "Android Ver")

In [18]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,...,Android Ver,review_group,SizeNew,InstallsNew,PriceNew,LastUpdatedNew,LastUpdatedNewYear,LastUpdatedNewMonth,LastUpdatedNewDay,AndroidVerNew
0,GPS Status & Toolbox,TRAVEL_AND_LOCAL,1.0,149723,4.1M,"10,000,000+",Free,0,Everyone,Travel & Local,...,4.0 and up,[26432+),0.041,10000000,0.0,2018-06-13,2018,6,13,4.0
1,Fish|Hunt FL,LIFESTYLE,0.0,853,34M,"100,000+",Free,0,Everyone,Lifestyle,...,4.1 and up,"[23, 889)",0.34,100000,0.0,2018-03-22,2018,3,22,4.1
2,Allrecipes Dinner Spinner,FOOD_AND_DRINK,1.0,61881,Varies with device,"5,000,000+",Free,0,Everyone,Food & Drink,...,Varies with device,[26432+),,5000000,0.0,2018-04-10,2018,4,10,
3,Don't touch my phone,TOOLS,0.0,21943,2.3M,"5,000,000+",Free,0,Everyone,Tools,...,2.3 and up,"[889, 26432)",0.023,5000000,0.0,2016-10-19,2016,10,19,2.3
4,BM SPM Practice,FAMILY,1.0,6,9.2M,"1,000+",Free,0,Everyone,Education,...,4.1 and up,"[0, 23)",0.092,1000,0.0,2018-01-24,2018,1,24,4.1


#### 1.6 Encoding review groups to oridinal numbers

In [19]:
def from_review_group_to_oridinal(df, new_column_name, column_name):
    df[new_column_name] = df[column_name].apply(
        lambda x: 0 if x == '[0, 23)' else 
        1 if x == '[23, 889)' else
        2 if x == '[889, 26432)' else
        3
    )
    return df

In [20]:
df = from_review_group_to_oridinal(df, "ReviewGroupNew", "review_group")

In [21]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,...,review_group,SizeNew,InstallsNew,PriceNew,LastUpdatedNew,LastUpdatedNewYear,LastUpdatedNewMonth,LastUpdatedNewDay,AndroidVerNew,ReviewGroupNew
0,GPS Status & Toolbox,TRAVEL_AND_LOCAL,1.0,149723,4.1M,"10,000,000+",Free,0,Everyone,Travel & Local,...,[26432+),0.041,10000000,0.0,2018-06-13,2018,6,13,4.0,3
1,Fish|Hunt FL,LIFESTYLE,0.0,853,34M,"100,000+",Free,0,Everyone,Lifestyle,...,"[23, 889)",0.34,100000,0.0,2018-03-22,2018,3,22,4.1,1
2,Allrecipes Dinner Spinner,FOOD_AND_DRINK,1.0,61881,Varies with device,"5,000,000+",Free,0,Everyone,Food & Drink,...,[26432+),,5000000,0.0,2018-04-10,2018,4,10,,3
3,Don't touch my phone,TOOLS,0.0,21943,2.3M,"5,000,000+",Free,0,Everyone,Tools,...,"[889, 26432)",0.023,5000000,0.0,2016-10-19,2016,10,19,2.3,2
4,BM SPM Practice,FAMILY,1.0,6,9.2M,"1,000+",Free,0,Everyone,Education,...,"[0, 23)",0.092,1000,0.0,2018-01-24,2018,1,24,4.1,0


### 2- Generic Changes

#### 2.1 Null values imputation

**Note:** Please note here that this data is for a competition, so I don't need to worry about data leakage

In [22]:
def null_exploration(df):
    ## to explore nulls
    print(df.isnull().sum())
    print(df.isnull().mean())

In [23]:
null_exploration(df)

App                      0
Category                 0
Rating                   0
Reviews                  0
Size                     0
Installs                 0
Type                     0
Price                    0
Content Rating           0
Genres                   0
Last Updated             0
Current Ver              0
Android Ver              0
review_group             0
SizeNew                728
InstallsNew              0
PriceNew                 0
LastUpdatedNew           0
LastUpdatedNewYear       0
LastUpdatedNewMonth      0
LastUpdatedNewDay        0
AndroidVerNew          584
ReviewGroupNew           0
dtype: int64
App                    0.000000
Category               0.000000
Rating                 0.000000
Reviews                0.000000
Size                   0.000000
Installs               0.000000
Type                   0.000000
Price                  0.000000
Content Rating         0.000000
Genres                 0.000000
Last Updated           0.000000
Current Ver   

In [24]:
def null_imputation(column_name):
    df[column_name].fillna(df[column_name].mean(), inplace=True)
    return df

In [25]:
df = null_imputation("SizeNew")

In [26]:
df = null_imputation("AndroidVerNew")

#### 2.2 OneStepFeatEng

In [27]:
#### One step feature engineering

class OneStepFeatEng():
    def __init__(self, df):
        self.df = df
        
    # Drop columns
    def drop_column(self, column_names):
        self.df.drop(column_names, axis=1, inplace=True)
        return self.df.head()
    
    ## Separated cols
    def separated_cols(self, target_var):
        target_var = target_var
        features = [x for x in list(self.df.columns) if x != target_var]
        
        cat_cols = [column_name for column_name in self.df.columns if self.df[column_name].dtypes=='O']
        num_cols = [column_name for column_name in self.df.columns if self.df[column_name].dtypes!='O']
        
        return features, cat_cols, num_cols
    
    # Boolean encoding
    def boolean_encoding(self, column_name, true_val):
        self.df[column_name] = self.df[column_name].apply(lambda x: 1 if x == true_val else 0)
        return self.df.head()
    
    # Final columns
    def final_columns(self):
        return self.df.columns
    
    # Final dataframe
    def final_dataframe(self):
        return self.df

In [28]:
## Instance
X = OneStepFeatEng(df)

In [29]:
X.drop_column(['App', 'Size', 'Installs', 'Price', 'Last Updated', 'LastUpdatedNew', 'Current Ver', 
               'Android Ver', 'review_group'])

Unnamed: 0,Category,Rating,Reviews,Type,Content Rating,Genres,SizeNew,InstallsNew,PriceNew,LastUpdatedNewYear,LastUpdatedNewMonth,LastUpdatedNewDay,AndroidVerNew,ReviewGroupNew
0,TRAVEL_AND_LOCAL,1.0,149723,Free,Everyone,Travel & Local,0.041,10000000,0.0,2018,6,13,4.0,3
1,LIFESTYLE,0.0,853,Free,Everyone,Lifestyle,0.34,100000,0.0,2018,3,22,4.1,1
2,FOOD_AND_DRINK,1.0,61881,Free,Everyone,Food & Drink,0.372354,5000000,0.0,2018,4,10,3.831783,3
3,TOOLS,0.0,21943,Free,Everyone,Tools,0.023,5000000,0.0,2016,10,19,2.3,2
4,FAMILY,1.0,6,Free,Everyone,Education,0.092,1000,0.0,2018,1,24,4.1,0


In [30]:
X.boolean_encoding("Type", "Paid")

Unnamed: 0,Category,Rating,Reviews,Type,Content Rating,Genres,SizeNew,InstallsNew,PriceNew,LastUpdatedNewYear,LastUpdatedNewMonth,LastUpdatedNewDay,AndroidVerNew,ReviewGroupNew
0,TRAVEL_AND_LOCAL,1.0,149723,0,Everyone,Travel & Local,0.041,10000000,0.0,2018,6,13,4.0,3
1,LIFESTYLE,0.0,853,0,Everyone,Lifestyle,0.34,100000,0.0,2018,3,22,4.1,1
2,FOOD_AND_DRINK,1.0,61881,0,Everyone,Food & Drink,0.372354,5000000,0.0,2018,4,10,3.831783,3
3,TOOLS,0.0,21943,0,Everyone,Tools,0.023,5000000,0.0,2016,10,19,2.3,2
4,FAMILY,1.0,6,0,Everyone,Education,0.092,1000,0.0,2018,1,24,4.1,0


In [31]:
X.final_dataframe().Type.value_counts()

0    5323
1     465
Name: Type, dtype: int64

In [32]:
features, cat_cols, num_cols = X.separated_cols('Rating')

In [33]:
X.final_dataframe().head()

Unnamed: 0,Category,Rating,Reviews,Type,Content Rating,Genres,SizeNew,InstallsNew,PriceNew,LastUpdatedNewYear,LastUpdatedNewMonth,LastUpdatedNewDay,AndroidVerNew,ReviewGroupNew
0,TRAVEL_AND_LOCAL,1.0,149723,0,Everyone,Travel & Local,0.041,10000000,0.0,2018,6,13,4.0,3
1,LIFESTYLE,0.0,853,0,Everyone,Lifestyle,0.34,100000,0.0,2018,3,22,4.1,1
2,FOOD_AND_DRINK,1.0,61881,0,Everyone,Food & Drink,0.372354,5000000,0.0,2018,4,10,3.831783,3
3,TOOLS,0.0,21943,0,Everyone,Tools,0.023,5000000,0.0,2016,10,19,2.3,2
4,FAMILY,1.0,6,0,Everyone,Education,0.092,1000,0.0,2018,1,24,4.1,0


In [34]:
features

['Category',
 'Reviews',
 'Type',
 'Content Rating',
 'Genres',
 'SizeNew',
 'InstallsNew',
 'PriceNew',
 'LastUpdatedNewYear',
 'LastUpdatedNewMonth',
 'LastUpdatedNewDay',
 'AndroidVerNew',
 'ReviewGroupNew']

In [35]:
cat_cols

['Category', 'Content Rating', 'Genres']

In [36]:
num_cols

['Rating',
 'Reviews',
 'Type',
 'SizeNew',
 'InstallsNew',
 'PriceNew',
 'LastUpdatedNewYear',
 'LastUpdatedNewMonth',
 'LastUpdatedNewDay',
 'AndroidVerNew',
 'ReviewGroupNew']

In [37]:
X.final_dataframe().to_csv("datasets/TrainFeatureEngined.csv", index=False)

In [38]:
X.final_dataframe()

Unnamed: 0,Category,Rating,Reviews,Type,Content Rating,Genres,SizeNew,InstallsNew,PriceNew,LastUpdatedNewYear,LastUpdatedNewMonth,LastUpdatedNewDay,AndroidVerNew,ReviewGroupNew
0,TRAVEL_AND_LOCAL,1.0,149723,0,Everyone,Travel & Local,0.041000,10000000,0.00,2018,6,13,4.000000,3
1,LIFESTYLE,0.0,853,0,Everyone,Lifestyle,0.340000,100000,0.00,2018,3,22,4.100000,1
2,FOOD_AND_DRINK,1.0,61881,0,Everyone,Food & Drink,0.372354,5000000,0.00,2018,4,10,3.831783,3
3,TOOLS,0.0,21943,0,Everyone,Tools,0.023000,5000000,0.00,2016,10,19,2.300000,2
4,FAMILY,1.0,6,0,Everyone,Education,0.092000,1000,0.00,2018,1,24,4.100000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5783,COMMUNICATION,1.0,69119316,0,Everyone,Communication,0.372354,1000000000,0.00,2018,8,3,3.831783,3
5784,HEALTH_AND_FITNESS,0.0,44,0,Everyone,Health & Fitness,0.490000,1000,0.00,2018,8,1,4.300000,1
5785,WEATHER,1.0,3005,1,Everyone,Weather,0.260000,10000,2.99,2018,8,4,4.000000,2
5786,FAMILY,1.0,708,0,Everyone,Education,0.330000,50000,0.00,2018,2,12,3.000000,1
