## Import Library

In [130]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

## Import Dataset

In [133]:
df = pd.read_csv('ab_data.csv')

## Quick Look

In [136]:
df.shape

(294478, 5)

In [138]:
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [140]:
df.shape

(294478, 5)

### Sample Size using Slovin

In [143]:
N = 200000000
e = 0.05/100

In [145]:
slovin = N/(1+N * e**2)

In [147]:
slovin

3921568.6274509802

In [149]:
data = pd.read_csv('iris.csv')

In [151]:
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [153]:
data['Species'].value_counts() #jumlah jenis2 spesis bunga iris

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [155]:
#menghitung korelas antar kolom bunga
data = data.drop(['Id'], axis=1)#ilangin kolom id dulu

In [157]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [163]:
#coefficient correlation
# Check data types
print(data.dtypes)

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object


In [167]:
# Select only numerical columns
numerical_data = data.select_dtypes(include=[np.number])

In [173]:
# Calculate correlation
correlation_matrix = numerical_data.corr()
round(correlation_matrix, 2)#dibulatkan 2 angka belakang koma

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
SepalLengthCm,1.0,-0.11,0.87,0.82
SepalWidthCm,-0.11,1.0,-0.42,-0.36
PetalLengthCm,0.87,-0.42,1.0,0.96
PetalWidthCm,0.82,-0.36,0.96,1.0


### Sample 400 observation

In [None]:
df_sampel1 = df.sample(n=400)

In [None]:
df_sampel1['group'].value_counts()

treatment    209
control      191
Name: group, dtype: int64

In [None]:
df_sampel1['user_id'].head()

134688    709772
213261    809991
268608    803829
236035    687310
82085     922934
Name: user_id, dtype: int64

### Sample using Same Randomness

In [None]:
df.sample(n=400,random_state = 123).head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
260162,847857,2017-01-18 15:15:02.551941,control,old_page,0
37456,920163,2017-01-10 20:42:09.498343,control,old_page,0
80272,806917,2017-01-06 12:05:56.049996,control,old_page,0
278131,662772,2017-01-08 04:36:26.397852,control,old_page,0
270691,911648,2017-01-08 17:21:28.253382,control,old_page,0


### Sample 30% from dataset

In [None]:
df.sample(frac=0.3)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
186583,893743,2017-01-11 04:39:37.172078,control,old_page,0
256031,853511,2017-01-03 05:00:45.694946,control,old_page,0
283006,814654,2017-01-05 21:04:18.990880,treatment,new_page,0
254110,751639,2017-01-08 14:59:31.537678,control,old_page,0
181889,665122,2017-01-06 17:44:30.179297,control,old_page,0
...,...,...,...,...,...
52803,670220,2017-01-06 11:44:33.323235,control,old_page,0
26239,888236,2017-01-23 05:55:08.816955,treatment,new_page,0
47497,845510,2017-01-03 16:58:13.046340,control,old_page,0
77750,653130,2017-01-22 18:56:55.587907,treatment,new_page,0


# Sampling for Each Group / Stratified Random Sampling

In [None]:
df.groupby(['group'],as_index=False).count()

Unnamed: 0,group,user_id,timestamp,landing_page,converted
0,control,147202,147202,147202,147202
1,treatment,147276,147276,147276,147276


In [None]:
df_sample2 = df.groupby(['group']).apply(lambda x: x.sample(n=200,random_state=123))

In [None]:
df_sample2

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,group,landing_page,converted
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,95574,704344,2017-01-08 06:33:15.620318,control,old_page,0
control,282637,903218,2017-01-07 16:40:31.904242,control,old_page,0
control,201262,724634,2017-01-05 18:38:31.257679,control,old_page,0
control,93315,750623,2017-01-21 19:20:32.814948,control,old_page,0
control,16163,651056,2017-01-04 03:17:39.846424,control,old_page,0
control,132758,690994,2017-01-05 01:15:53.625539,control,old_page,0
control,127331,741562,2017-01-10 02:43:45.374197,control,old_page,0
control,64040,837026,2017-01-04 13:16:48.025956,control,old_page,0
control,288521,877825,2017-01-22 13:57:36.086248,control,old_page,0
control,7842,645554,2017-01-21 09:43:14.063473,control,old_page,0


In [None]:
df_sample2['group'].value_counts()

control      200
treatment    200
Name: group, dtype: int64