In [12]:
# Langkah 1: Import library yang dibutuhkan
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [13]:
# Step 2: Load the dataset from the CSV file into a DataFrame
file_path = 'user_behavior_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [14]:
# Step 3: Preview the first 5 rows to inspect the structure of the dataset
# (a single print call; sep="\n" puts the table on the line after the heading)
print("Lima baris pertama dataset:", df.head(), sep="\n")

Lima baris pertama dataset:
   User ID    Device Model Operating System  App Usage Time (min/day)  \
0        1  Google Pixel 5          Android                       393   
1        2       OnePlus 9          Android                       268   
2        3    Xiaomi Mi 11          Android                       154   
3        4  Google Pixel 5          Android                       239   
4        5       iPhone 12              iOS                       187   

   Screen On Time (hours/day)  Battery Drain (mAh/day)  \
0                         6.4                     1872   
1                         4.7                     1331   
2                         4.0                      761   
3                         4.8                     1676   
4                         4.3                     1367   

   Number of Apps Installed  Data Usage (MB/day)  Age  Gender  \
0                        67                 1122   40    Male   
1                        42                  944   47 

In [15]:
# Step 4: Show general information about the dataset
# (column names, non-null counts, dtypes and memory usage)
print("\nInformasi dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emitted a stray "None" line after the report.
df.info()


Informasi dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User ID                     700 non-null    int64  
 1   Device Model                700 non-null    object 
 2   Operating System            700 non-null    object 
 3   App Usage Time (min/day)    700 non-null    int64  
 4   Screen On Time (hours/day)  700 non-null    float64
 5   Battery Drain (mAh/day)     700 non-null    int64  
 6   Number of Apps Installed    700 non-null    int64  
 7   Data Usage (MB/day)         700 non-null    int64  
 8   Age                         700 non-null    int64  
 9   Gender                      700 non-null    object 
 10  User Behavior Class         700 non-null    int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 60.3+ KB
None


In [16]:
# Step 5: Count the missing values in every column
missing_per_column = df.isna().sum()
print("\nJumlah missing values per kolom:", missing_per_column, sep="\n")


Jumlah missing values per kolom:
User ID                       0
Device Model                  0
Operating System              0
App Usage Time (min/day)      0
Screen On Time (hours/day)    0
Battery Drain (mAh/day)       0
Number of Apps Installed      0
Data Usage (MB/day)           0
Age                           0
Gender                        0
User Behavior Class           0
dtype: int64


In [17]:
# Step 6: Fill in missing values, if there are any
# The columns to impute are derived from the data instead of being hard-coded:
# the previous guard looked for a column named 'age', but the dataset's column
# is 'Age', so the imputation could never run.
# Numeric columns are filled with their median, all other columns with their
# most frequent value. When nothing is missing this loop does nothing.
for column in df.columns[df.isna().any()]:
    if pd.api.types.is_numeric_dtype(df[column]):
        fill_value = df[column].median()
    else:
        most_frequent = df[column].mode()
        if most_frequent.empty:
            # Column holds no observed values at all; nothing to impute from.
            continue
        fill_value = most_frequent.iloc[0]
    # Assign the result back rather than calling fillna(inplace=True) on a
    # column selection: that chained in-place form is deprecated in recent
    # pandas and does not update `df` when Copy-on-Write is enabled.
    df[column] = df[column].fillna(fill_value)

In [18]:
# Step 7: Convert categorical (object-dtype) columns to integer codes
# One LabelEncoder is fitted per column and kept in `label_encoders`, keyed by
# column name, so the fitted encoder for each column stays available.
label_encoders = {}
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    le = LabelEncoder().fit(df[column])
    label_encoders[column] = le
    df[column] = le.transform(df[column])

In [19]:
# Step 8: Standardize the numeric feature columns (zero mean, unit variance)
# The class label must keep its original values, so it is excluded from the
# columns handed to the scaler; previously every numeric column was scaled,
# which turned the target 'User Behavior Class' into float z-scores.
target_column = 'User Behavior Class'
numerical_features = (
    df.select_dtypes(include=[np.number])
    .columns
    .drop([target_column], errors='ignore')  # tolerate a frame without the target
)
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics leak into the scaling. Fitting on the
# training portion only would require reordering the cells.
# NOTE(review): 'User ID' is numeric and is still scaled here; it looks like a
# row identifier rather than a feature -- confirm whether it should be kept.
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [20]:
# Step 9: Separate the data into features (X) and target (y)
# The target is selected by name rather than by position ("last column"), so
# the separation stays correct even if the column order of the frame changes.
target_column = 'User Behavior Class'
X = df.drop(columns=[target_column])
y = df[target_column]
# NOTE(review): X still contains 'User ID', which looks like a row identifier
# rather than a predictive feature -- confirm whether it should be dropped
# before modelling.

In [21]:
# Step 10: Split the data into training and testing sets (70% / 30%)
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [22]:
# Step 11: Report the shape (rows, columns) of the training and testing features
for label, feature_frame in (
    ("\nUkuran data training:", X_train),
    ("Ukuran data testing:", X_test),
):
    print(label, feature_frame.shape)


Ukuran data training: (490, 10)
Ukuran data testing: (210, 10)


In [23]:
# Step 12: Collect the preprocessing result in a single DataFrame again
# by placing the features (X) and the target (y) side by side, column-wise.
df_processed = pd.concat(objs=[X, y], axis="columns")

In [25]:
# Step 13: Save the processed dataset to a CSV file
# index=False keeps the DataFrame's row index out of the written file.
processed_file_path = 'dataset_preprocessed.csv'
df_processed.to_csv(path_or_buf=processed_file_path, index=False)

In [26]:
# Step 14: Preview the first 5 rows of the processed dataset
# (a single print call; sep="\n" puts the table on the line after the heading)
print("\nLima baris pertama dari dataset yang sudah diproses:", df_processed.head(), sep="\n")


Lima baris pertama dari dataset yang sudah diproses:
    User ID  Device Model  Operating System  App Usage Time (min/day)  \
0 -1.729578     -1.420106         -0.513359                  0.688256   
1 -1.724629     -0.720547         -0.513359                 -0.017668   
2 -1.719681      0.678573         -0.513359                 -0.661471   
3 -1.714732     -1.420106         -0.513359                 -0.181443   
4 -1.709783      1.378133          1.947953                 -0.475107   

   Screen On Time (hours/day)  Battery Drain (mAh/day)  \
0                    0.367626                 0.423726   
1                   -0.186771                -0.237198   
2                   -0.415053                -0.933550   
3                   -0.154160                 0.184279   
4                   -0.317218                -0.193218   

   Number of Apps Installed  Data Usage (MB/day)       Age    Gender  \
0                  0.606096             0.300405  0.126383  0.960769   
1             