In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import time
from google.colab import drive

drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/Ecommerce_Consumer_Behavior_Analysis_Data.csv"
df = pd.read_csv(file_path)

print("Informasi Dataset:")
print(df.info())
print("\n5 Data Pertama:")
print(df.head())
print("\nStatistik Deskriptif:")
print(df.describe())
print("\nJumlah Nilai Unik per Kolom:")
print(df.nunique())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 28 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Customer_ID                            1000 non-null   object 
 1   Age                                    1000 non-null   int64  
 2   Gender                                 1000 non-null   object 
 3   Income_Level                           1000 non-null   object 
 4   Marital_Status                         1000 non-null   object 
 5   Education_Level                        1000 non-null   object 
 6   Occupation                             1000 non-null   object 
 7   Location                               1000 non-null   object 
 8   Purchase_Category                      1000 non-null   object

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import time
from google.colab import drive

drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/Ecommerce_Consumer_Behavior_Analysis_Data.csv"
df = pd.read_csv(file_path)

le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

X = df.drop('Purchase_Intent', axis=1)
y = df['Purchase_Intent']

scaler = StandardScaler()
X = scaler.fit_transform(X)

def run_svm_experiments(X, y, kernel_type, params=None, n_experiments=5):
    results = {
        '80_20': {'precision': [], 'recall': [], 'f1': [], 'accuracy': [], 'time': []},
        '70_30': {'precision': [], 'recall': [], 'f1': [], 'accuracy': [], 'time': []},
        'kfold': {'precision': [], 'recall': [], 'f1': [], 'accuracy': [], 'time': []}
    }

    for i in range(n_experiments):
        start_time = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

        if kernel_type == 'rbf':
            C = params.get('C', 1.0) if params else 1.0
            gamma = params.get('gamma', 'scale') if params else 'scale'
            svm = SVC(kernel='rbf', C=C, gamma=gamma)
        elif kernel_type == 'poly':
            C = params.get('C', 1.0) if params else 1.0
            degree = params.get('degree', 3) if params else 3
            svm = SVC(kernel='poly', degree=degree, C=C)
        else:
            C = params.get('C', 1.0) if params else 1.0
            svm = SVC(kernel='linear', C=C)

        svm.fit(X_train, y_train)
        y_pred = svm.predict(X_test)

        results['80_20']['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        results['80_20']['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        results['80_20']['f1'].append(f1_score(y_test, y_pred, average='weighted'))
        results['80_20']['accuracy'].append(accuracy_score(y_test, y_pred))
        results['80_20']['time'].append(time.time() - start_time)

        start_time = time.time()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)

        svm.fit(X_train, y_train)
        y_pred = svm.predict(X_test)

        results['70_30']['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        results['70_30']['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        results['70_30']['f1'].append(f1_score(y_test, y_pred, average='weighted'))
        results['70_30']['accuracy'].append(accuracy_score(y_test, y_pred))
        results['70_30']['time'].append(time.time() - start_time)

        start_time = time.time()
        kf = KFold(n_splits=3, shuffle=True, random_state=i)

        precision, recall, f1, accuracy = [], [], [], []

        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            svm.fit(X_train, y_train)
            y_pred = svm.predict(X_test)

            precision.append(precision_score(y_test, y_pred, average='weighted'))
            recall.append(recall_score(y_test, y_pred, average='weighted'))
            f1.append(f1_score(y_test, y_pred, average='weighted'))
            accuracy.append(accuracy_score(y_test, y_pred))

        results['kfold']['precision'].append(np.mean(precision))
        results['kfold']['recall'].append(np.mean(recall))
        results['kfold']['f1'].append(np.mean(f1))
        results['kfold']['accuracy'].append(np.mean(accuracy))
        results['kfold']['time'].append(time.time() - start_time)

    return results

def validate_results_structure(results, kernel_name):
    if results is None:
        print(f"Hasil {kernel_name} adalah None")
        return False

    required_splits = ['80_20', '70_30', 'kfold']
    required_metrics = ['precision', 'recall', 'f1', 'accuracy', 'time']

    for split in required_splits:
        if split not in results:
            print(f"Split {split} tidak ditemukan dalam hasil {kernel_name}")
            return False

        for metric in required_metrics:
            if metric not in results[split]:
                print(f"Metric {metric} tidak ditemukan dalam {split} untuk {kernel_name}")
                return False
            elif len(results[split][metric]) == 0:
                print(f"Metric {metric} dalam {split} untuk {kernel_name} kosong")
                return False

    return True

def create_results_df(results, kernel_name):
    n_experiments = len(results['80_20']['precision'])
    df = pd.DataFrame(index=range(n_experiments))

    for split_type in ['80_20', '70_30', 'kfold']:
        for metric in ['precision', 'recall', 'f1', 'accuracy', 'time']:
            col_name = f"{split_type}_{metric}"
            df[col_name] = results[split_type][metric]

    df['kernel'] = kernel_name
    df['experiment'] = range(1, n_experiments + 1)

    return df

rbf_params = {'C': 100, 'gamma': 0.1}
poly_params = {'degree': 3, 'C': 1.0}

print("Running Linear Kernel Experiments...")
linear_results = run_svm_experiments(X, y, 'linear')

print("\nRunning RBF Kernel Experiments...")
rbf_results = run_svm_experiments(X, y, 'rbf', rbf_params)

print("\nRunning Polynomial Kernel Experiments...")
poly_results = run_svm_experiments(X, y, 'poly', poly_params)

dfs = []

if linear_results is not None and validate_results_structure(linear_results, 'Linear'):
    linear_df = create_results_df(linear_results, 'Linear')
    dfs.append(linear_df)
else:
    print("Hasil Linear tidak valid, dilewati")

if rbf_results is not None and validate_results_structure(rbf_results, 'RBF'):
    rbf_df = create_results_df(rbf_results, 'RBF')
    dfs.append(rbf_df)
else:
    print("Hasil RBF tidak valid, dilewati")

if poly_results is not None and validate_results_structure(poly_results, 'Polynomial'):
    poly_df = create_results_df(poly_results, 'Polynomial')
    dfs.append(poly_df)
else:
    print("Hasil Polynomial tidak valid, dilewati")

if dfs:
    all_results = pd.concat(dfs)

    all_results.to_csv('svm_experiment_results.csv', index=False)

    summary = all_results.groupby('kernel').agg({
        '80_20_precision': 'mean',
        '80_20_recall': 'mean',
        '80_20_f1': 'mean',
        '80_20_accuracy': 'mean',
        '80_20_time': 'mean',
        '70_30_precision': 'mean',
        '70_30_recall': 'mean',
        '70_30_f1': 'mean',
        '70_30_accuracy': 'mean',
        '70_30_time': 'mean',
        'kfold_precision': 'mean',
        'kfold_recall': 'mean',
        'kfold_f1': 'mean',
        'kfold_accuracy': 'mean',
        'kfold_time': 'mean'
    }).reset_index()

    print("\nHASIL EKSPERIMEN :")
    print("=============================================")

    print("\nHasil Detail Setiap Eksperimen:")
    print(all_results.to_markdown(index=False))

    print("\nRingkasan Hasil Rata-rata:")
    print(summary.to_markdown(index=False))

    print("\nFormat Dalam Bentuk Tabel:")
    print("""
| Data Latih:Data Uji | Algoritma | Rata-Rata Precision | Rata-Rata Recall | Rata-Rata F1-Score | Rata-Rata Akurasi | Rata-Rata Waktu Pelatihan |
|---------------------|-----------|----------------------|------------------|--------------------|-------------------|---------------------------|""")

    for _, row in summary.iterrows():
        print(f"| 80:20 | SVM {row['kernel']} | {row['80_20_precision']:.4f} | {row['80_20_recall']:.4f} | {row['80_20_f1']:.4f} | {row['80_20_accuracy']:.4f} | {row['80_20_time']:.4f} |")
        print(f"| 70:30 | SVM {row['kernel']} | {row['70_30_precision']:.4f} | {row['70_30_recall']:.4f} | {row['70_30_f1']:.4f} | {row['70_30_accuracy']:.4f} | {row['70_30_time']:.4f} |")
        print(f"| K-Fold | SVM {row['kernel']} | {row['kfold_precision']:.4f} | {row['kfold_recall']:.4f} | {row['kfold_f1']:.4f} | {row['kfold_accuracy']:.4f} | {row['kfold_time']:.4f} |")

else:
    print("Tidak ada hasil eksperimen yang valid untuk diproses")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Running Linear Kernel Experiments...

Running RBF Kernel Experiments...

Running Polynomial Kernel Experiments...

HASIL EKSPERIMEN :

Hasil Detail Setiap Eksperimen:
|   80_20_precision |   80_20_recall |   80_20_f1 |   80_20_accuracy |   80_20_time |   70_30_precision |   70_30_recall |   70_30_f1 |   70_30_accuracy |   70_30_time |   kfold_precision |   kfold_recall |   kfold_f1 |   kfold_accuracy |   kfold_time | kernel     |   experiment |
|------------------:|---------------:|-----------:|-----------------:|-------------:|------------------:|---------------:|-----------:|-----------------:|-------------:|------------------:|---------------:|-----------:|-----------------:|-------------:|:-----------|-------------:|
|          0.244446 |          0.225 |   0.227363 |            0.225 |    0.299281  |          0.245263 |       0.233333 |   0.231858 |     

In [21]:
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

X = df.drop('Purchase_Intent', axis=1)
y = df['Purchase_Intent']

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [22]:
def run_svm_experiments(X, y, kernel_type, params=None, n_experiments=5):
    results = {
        '80_20': {'precision': [], 'recall': [], 'f1': [], 'accuracy': [], 'time': []},
        '70_30': {'precision': [], 'recall': [], 'f1': [], 'accuracy': [], 'time': []},
        'kfold': {'precision': [], 'recall': [], 'f1': [], 'accuracy': [], 'time': []}
    }

    try:
        for i in range(n_experiments):
            # 80%:20% split
            start_time = time.time()
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

            # Inisialisasi model
            if kernel_type == 'rbf':
                C = params.get('C', 1.0) if params else 1.0
                gamma = params.get('gamma', 'scale') if params else 'scale'
                svm = SVC(kernel='rbf', C=C, gamma=gamma)
            elif kernel_type == 'poly':
                C = params.get('C', 1.0) if params else 1.0
                degree = params.get('degree', 3) if params else 3
                svm = SVC(kernel='poly', degree=degree, C=C)
            else:  # linear
                C = params.get('C', 1.0) if params else 1.0
                svm = SVC(kernel='linear', C=C)

            # Latih model dan hitung metrik
            svm.fit(X_train, y_train)
            y_pred = svm.predict(X_test)

            # Simpan semua metrik untuk 80_20
            results['80_20']['precision'].append(precision_score(y_test, y_pred, average='weighted'))
            results['80_20']['recall'].append(recall_score(y_test, y_pred, average='weighted'))
            results['80_20']['f1'].append(f1_score(y_test, y_pred, average='weighted'))
            results['80_20']['accuracy'].append(accuracy_score(y_test, y_pred))
            results['80_20']['time'].append(time.time() - start_time)

            # [Lanjutkan untuk 70_30 dan kfold dengan cara yang sama]

        return results

    except Exception as e:
        print(f"Error dalam run_svm_experiments: {str(e)}")
        return None

In [23]:
def create_results_df(results, kernel_name):
    if results is None:
        print(f"Peringatan: Hasil untuk kernel {kernel_name} adalah None")
        return pd.DataFrame()

    # Cari panjang data yang diharapkan (jumlah eksperimen)
    n_experiments = 0
    for split_type in ['80_20', '70_30', 'kfold']:
        if split_type in results and 'precision' in results[split_type]:
            n_experiments = len(results[split_type]['precision'])
            break

    if n_experiments == 0:
        print(f"Peringatan: Tidak ada data eksperimen untuk kernel {kernel_name}")
        return pd.DataFrame()

    # Buat DataFrame dengan index yang sesuai
    df = pd.DataFrame(index=range(n_experiments))

    for split_type in ['80_20', '70_30', 'kfold']:
        if split_type not in results:
            print(f"Peringatan: {split_type} tidak ditemukan dalam hasil")
            continue

        for metric in ['precision', 'recall', 'f1', 'accuracy', 'time']:
            col_name = f"{split_type}_{metric}"

            if metric in results[split_type]:
                df[col_name] = results[split_type][metric]
            else:
                print(f"Peringatan: {metric} tidak ditemukan dalam {split_type}")
                df[col_name] = np.nan  # Isi dengan NaN jika data tidak ada

    df['kernel'] = kernel_name
    df['experiment'] = range(1, n_experiments + 1)

    return df

In [24]:
def validate_results_structure(results, kernel_name):
    if results is None:
        print(f"Hasil {kernel_name} adalah None")
        return False

    required_splits = ['80_20', '70_30', 'kfold']
    required_metrics = ['precision', 'recall', 'f1', 'accuracy', 'time']

    valid = True

    for split in required_splits:
        if split not in results:
            print(f"Split {split} tidak ditemukan dalam hasil {kernel_name}")
            valid = False
            continue

        for metric in required_metrics:
            if metric not in results[split]:
                print(f"Metric {metric} tidak ditemukan dalam {split} untuk {kernel_name}")
                valid = False
            elif len(results[split][metric]) == 0:
                print(f"Metric {metric} dalam {split} untuk {kernel_name} kosong")
                valid = False

    return valid

# Sebelum membuat DataFrame, validasi struktur hasil
if linear_results is not None and validate_results_structure(linear_results, 'Linear'):
    linear_df = create_results_df(linear_results, 'Linear')
    dfs.append(linear_df)
else:
    print("Hasil Linear tidak valid, dilewati")

if rbf_results is not None and validate_results_structure(rbf_results, 'RBF'):
    rbf_df = create_results_df(rbf_results, 'RBF')
    dfs.append(rbf_df)
else:
    print("Hasil RBF tidak valid, dilewati")

if poly_results is not None and validate_results_structure(poly_results, 'Polynomial'):
    poly_df = create_results_df(poly_results, 'Polynomial')
    dfs.append(poly_df)
else:
    print("Hasil Polynomial tidak valid, dilewati")

Metric precision dalam 70_30 untuk Linear kosong
Metric recall dalam 70_30 untuk Linear kosong
Metric f1 dalam 70_30 untuk Linear kosong
Metric accuracy dalam 70_30 untuk Linear kosong
Metric time dalam 70_30 untuk Linear kosong
Metric precision dalam kfold untuk Linear kosong
Metric recall dalam kfold untuk Linear kosong
Metric f1 dalam kfold untuk Linear kosong
Metric accuracy dalam kfold untuk Linear kosong
Metric time dalam kfold untuk Linear kosong
Hasil Linear tidak valid, dilewati
Metric precision dalam 70_30 untuk RBF kosong
Metric recall dalam 70_30 untuk RBF kosong
Metric f1 dalam 70_30 untuk RBF kosong
Metric accuracy dalam 70_30 untuk RBF kosong
Metric time dalam 70_30 untuk RBF kosong
Metric precision dalam kfold untuk RBF kosong
Metric recall dalam kfold untuk RBF kosong
Metric f1 dalam kfold untuk RBF kosong
Metric accuracy dalam kfold untuk RBF kosong
Metric time dalam kfold untuk RBF kosong
Hasil RBF tidak valid, dilewati
Metric precision dalam 70_30 untuk Polynomial k

In [26]:
def create_results_df(results, kernel_name):
    if results is None:
        print(f"Peringatan: Hasil untuk kernel {kernel_name} adalah None")
        return pd.DataFrame()

    # Cari panjang data yang diharapkan (jumlah eksperimen)
    n_experiments = 0
    for split_type in ['80_20', '70_30', 'kfold']:
        if split_type in results and 'precision' in results[split_type]:
            n_experiments = len(results[split_type]['precision'])
            break

    if n_experiments == 0:
        print(f"Peringatan: Tidak ada data eksperimen untuk kernel {kernel_name}")
        return pd.DataFrame()

    # Buat DataFrame dengan index yang sesuai
    df = pd.DataFrame(index=range(n_experiments))

    for split_type in ['80_20', '70_30', 'kfold']:
        if split_type not in results:
            print(f"Peringatan: {split_type} tidak ditemukan dalam hasil")
            continue

        for metric in ['precision', 'recall', 'f1', 'accuracy', 'time']:
            col_name = f"{split_type}_{metric}"

            if metric in results[split_type]:
                df[col_name] = results[split_type][metric]
            else:
                print(f"Peringatan: {metric} tidak ditemukan dalam {split_type}")
                df[col_name] = np.nan  # Isi dengan NaN jika data tidak ada

    df['kernel'] = kernel_name
    df['experiment'] = range(1, n_experiments + 1)

    return df

In [27]:
def create_results_df(results, kernel_name):
    df = pd.DataFrame()

    for split_type in ['80_20', '70_30', 'kfold']:
        for metric in ['precision', 'recall', 'f1', 'accuracy', 'time']:
            col_name = f"{split_type}_{metric}"
            df[col_name] = results[split_type][metric]

    df['kernel'] = kernel_name
    df['experiment'] = range(1, len(df)+1)

    return df

# Membuat DataFrame untuk semua kernel
linear_df = create_results_df(linear_results, 'Linear')
rbf_df = create_results_df(rbf_results, 'RBF')
poly_df = create_results_df(poly_results, 'Polynomial')

# Menggabungkan semua hasil
all_results = pd.concat([linear_df, rbf_df, poly_df])

# Menyimpan hasil ke CSV
all_results.to_csv('svm_experiment_results.csv', index=False)

# Menampilkan hasil
print("\nHasil Eksperimen:")
print(all_results)

# Menghitung rata-rata untuk setiap kernel dan split
summary = all_results.groupby('kernel').agg({
    '80_20_precision': 'mean',
    '80_20_recall': 'mean',
    '80_20_f1': 'mean',
    '80_20_accuracy': 'mean',
    '80_20_time': 'mean',
    '70_30_precision': 'mean',
    '70_30_recall': 'mean',
    '70_30_f1': 'mean',
    '70_30_accuracy': 'mean',
    '70_30_time': 'mean',
    'kfold_precision': 'mean',
    'kfold_recall': 'mean',
    'kfold_f1': 'mean',
    'kfold_accuracy': 'mean',
    'kfold_time': 'mean'
}).reset_index()

print("\nRangkuman Hasil:")
print(summary)

ValueError: Length of values (0) does not match length of index (5)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import time

# Memuat dataset Anda (ganti 'dataset.csv' dengan nama file dataset Anda)
data = pd.read_csv('dataset.csv')

# Memisahkan fitur dan target
X = data.drop('target_column', axis=1)  # Ganti 'target_column' dengan nama kolom target
y = data['target_column']

# Normalisasi data
scaler = StandardScaler()
X = scaler.fit_transform(X)