In [71]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [72]:
# Read the CSV file into a DataFrame and preview its first five rows
# (five is the default row count of DataFrame.head).
DATASET_PATH = "appsgoogleplay.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Last Updated
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0.0,"January 7, 2018"
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0.0,"January 15, 2018"
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0.0,"August 1, 2018"
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,"50,000,000+",Free,0.0,"June 8, 2018"
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0.0,"June 20, 2018"


In [73]:
# Positional split: every column except the last one forms the feature matrix
# X, and the last column alone is the target y (these are passed to
# train_test_split as X, y).
# NOTE(review): per the column listing, the last column is "Last Updated" —
# confirm that this is really the intended target.
X = df.iloc[:,:-1] # Features
y = df.iloc[:,-1] # Target

In [74]:
# Hold out 30% of the rows as the test set. The fixed random_state pins the
# shuffle so that the split is identical after every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [75]:
# Report the shape of the full dataset, then of each train/test partition.
partition_report = (
    ("Dimensi X_train\t:", X_train),
    ("Dimensi X_test\t:", X_test),
    ("Dimensi y_train\t:", y_train),
    ("Dimensi y_test\t:", y_test),
)

print("Dimensi data\t:", df.shape)
print()
for caption, partition in partition_report:
    print(caption, partition.shape)

Dimensi data	: (9659, 9)

Dimensi X_train	: (6761, 8)
Dimensi X_test	: (2898, 8)
Dimensi y_train	: (6761,)
Dimensi y_test	: (2898,)


In [76]:
from sklearn.preprocessing import MinMaxScaler

In [77]:
# Scaler for min-max normalization; constructed with default settings, under
# which each feature is rescaled to the [0, 1] range.
min_max_scaler = MinMaxScaler()

In [78]:
# Work on a copy so that df itself is not modified by the normalization demo.
to_be_normalized = df.copy()
# Display the available column labels.
to_be_normalized.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Last Updated'],
      dtype='object')

In [79]:
# Fit the min-max scaler on the selected columns and transform them in one
# step; the result keeps the column order given in the list below.
columns_to_scale = ["Rating", "Reviews", "Size"]
x_scaler = min_max_scaler.fit_transform(to_be_normalized[columns_to_scale])

In [80]:
# Wrap the scaled array in a DataFrame for inspection. No column labels are
# passed, so the columns are numbered 0, 1, 2; they correspond to "Rating",
# "Reviews" and "Size", in the order those columns were given to the scaler.
df_normalized = pd.DataFrame(x_scaler)
df_normalized

Unnamed: 0,0,1,2
0,0.775,2.034333e-06,0.190
1,0.725,1.237233e-05,0.140
2,0.925,1.119651e-03,0.087
3,0.875,2.759067e-03,0.250
4,0.825,1.237233e-05,0.028
...,...,...,...
9654,0.875,4.861927e-07,0.530
9655,1.000,5.117818e-08,0.036
9656,,3.838364e-08,0.095
9657,0.875,1.458578e-06,


In [81]:
# Show the per-column minimum and maximum of the normalized data, separated
# by one blank line.
column_minima = df_normalized.min()
column_maxima = df_normalized.max()
print("Nilai min:", column_minima, "", "Nilai max:", column_maxima, sep="\n")

Nilai min:
0    0.0
1    0.0
2    0.0
dtype: float64

Nilai max:
0    1.0
1    1.0
2    1.0
dtype: float64


In [82]:
from sklearn.preprocessing import StandardScaler

In [83]:
# Scaler for standardization, constructed with default settings.
std_scaler = StandardScaler()

In [84]:
# Standardize the same three columns on a fresh copy of df.
to_be_standardized = df.copy()
standardization_input = to_be_standardized[["Rating", "Reviews", "Size"]]
standardized_data = std_scaler.fit_transform(standardization_input)

# np.nanstd is called without an axis, so this is a single standard deviation
# pooled over every non-NaN entry of the array, not one value per column.
pooled_std = np.nanstd(standardized_data)
print("Nilai standar deviasi setelah scaling:", pooled_std)

Nilai standar deviasi setelah scaling: 1.0


In [85]:
# Wrap the standardized array in a DataFrame for inspection. No column labels
# are passed, so the columns are numbered 0, 1, 2; they correspond to
# "Rating", "Reviews" and "Size", in the order given to the scaler.
df_standardized = pd.DataFrame(standardized_data)
df_standardized

Unnamed: 0,0,1,2
0,-0.136497,-0.118191,-0.063929
1,-0.509219,-0.117749,-0.293011
2,0.981671,-0.070490,-0.535839
3,0.608949,-0.000518,0.210970
4,0.236226,-0.117749,-0.806156
...,...,...,...
9654,0.608949,-0.118257,1.493831
9655,1.540755,-0.118275,-0.769503
9656,,-0.118276,-0.499185
9657,0.608949,-0.118215,


In [86]:
from sklearn.impute import SimpleImputer

In [89]:
# Imputer that replaces missing values with the mean of the column it is fitted on.
imputer_mean = SimpleImputer(strategy="mean")

In [91]:
# Count the missing values in each column before imputation.
df.isna().sum()

App                0
Category           0
Rating          1463
Reviews            0
Size            1227
Installs           0
Type               0
Price              0
Last Updated       0
dtype: int64

In [92]:
# Fill the missing values of both columns with their respective column means.
# A single fit over the two columns yields the same imputed values as two
# separate fits (the mean is computed per column), assigns an (n, 2) result to
# exactly two columns instead of pushing an (n, 1) array into a single column,
# and leaves imputer_mean fitted on both columns rather than only the last one.
columns_to_impute = ["Rating", "Size"]
df[columns_to_impute] = imputer_mean.fit_transform(df[columns_to_impute])

In [93]:
# Re-count the missing values per column to confirm the imputation filled them.
df.isna().sum()

App             0
Category        0
Rating          0
Reviews         0
Size            0
Installs        0
Type            0
Price           0
Last Updated    0
dtype: int64

In [94]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [95]:
# Append copies of the first three rows (which creates duplicate records),
# renumbering the index, then count the duplicated rows.
# Note: re-running this cell appends three more rows each time.
first_three_rows = df.head(3)
df = pd.concat([df, first_three_rows], ignore_index=True)
df.duplicated().sum()

3

In [96]:
# Display the rows flagged as duplicates of an earlier row.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Last Updated
9659,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0.0,"January 7, 2018"
9660,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0.0,"January 15, 2018"
9661,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0.0,"August 1, 2018"


In [97]:
# Remove duplicated rows (the first occurrence is kept) by rebinding df rather
# than mutating it in place, so the cell gives the same result when re-run.
# The index is renumbered 0..n-1 so that a later index-based join with a
# freshly built DataFrame (default RangeIndex) lines up row for row.
df = df.drop_duplicates().reset_index(drop=True)

In [98]:
# Re-count duplicated rows to confirm they were removed.
df.duplicated().sum()

0

In [99]:
# Inspect the data type of each column.
df.dtypes

App              object
Category         object
Rating          float64
Reviews           int64
Size            float64
Installs         object
Type             object
Price           float64
Last Updated     object
dtype: object

In [100]:
# Cast the "Reviews" column to the generic object dtype, then show the
# resulting column dtypes.
# NOTE(review): "Reviews" was int64 before this cast; object dtype gives up
# vectorized numeric operations on it — confirm this is only meant as a
# demonstration of dtype conversion.
df["Reviews"] = df["Reviews"].astype("object")
df.dtypes

App              object
Category         object
Rating          float64
Reviews          object
Size            float64
Installs         object
Type             object
Price           float64
Last Updated     object
dtype: object

In [101]:
from sklearn.preprocessing import OneHotEncoder

In [102]:
# One-hot encoder that returns a dense array instead of a sparse matrix.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old name was removed in 1.4, so try the current name first and fall
# back to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder = OneHotEncoder(sparse=False)

In [103]:
# One-hot encode the "Category" column (one output column per category).
category_enc = encoder.fit_transform(df[["Category"]])
# Reuse df's index for the encoded frame: the encoded rows are in the same
# order as df's rows, and the join that follows matches rows by index label,
# so a default 0..n-1 index would misalign whenever df's index has gaps.
category_enc = pd.DataFrame(category_enc, index=df.index)

In [104]:
# Attach the one-hot columns to df. DataFrame.join matches rows by index
# label, so this relies on df and category_enc sharing the same index.
# Note: re-running this cell joins columns whose labels already exist in df.
df = df.join(category_enc)
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Last Updated,0,...,23,24,25,26,27,28,29,30,31,32
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.100000,159,19.000000,"10,000+",Free,0.0,"January 7, 2018",1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Coloring book moana,ART_AND_DESIGN,3.900000,967,14.000000,"500,000+",Free,0.0,"January 15, 2018",1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.700000,87510,8.700000,"5,000,000+",Free,0.0,"August 1, 2018",1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.500000,215644,25.000000,"50,000,000+",Free,0.0,"June 8, 2018",1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.300000,967,2.800000,"100,000+",Free,0.0,"June 20, 2018",1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9654,Sya9a Maroc - FR,FAMILY,4.500000,38,53.000000,"5,000+",Free,0.0,"July 25, 2017",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9655,Fr. Mike Schmitz Audio Teachings,FAMILY,5.000000,4,3.600000,100+,Free,0.0,"July 6, 2018",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9656,Parkinson Exercices FR,MEDICAL,4.173243,3,9.500000,"1,000+",Free,0.0,"January 20, 2017",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9657,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.500000,114,20.395327,"1,000+",Free,0.0,"January 19, 2015",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
