<a href="https://colab.research.google.com/github/pandu1992/PraktikumDataScience/blob/main/Praktikum_Data_Science_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Exercise 1: data exploration and preprocessing
import pandas as pd
import numpy as np

# Online-retail CSV hosted on GitHub ("?raw=true" requests the raw file)
url = "https://github.com/databricks/Spark-The-Definitive-Guide/blob/master/data/retail-data/all/online-retail-dataset.csv?raw=true"
data = pd.read_csv(url)

# Preview the first five rows
print("Dataframe awal:", data.head(), sep="\n")

# Summary statistics as reported by DataFrame.describe()
print("\nDeskripsi Statistik:", data.describe(), sep="\n")

# Number of missing values in each column
print("\nCek nilai yang hilang:", data.isna().sum(), sep="\n")

# New feature: value of each row (quantity times unit price)
data['TotalPurchase'] = data['Quantity'].mul(data['UnitPrice'])
print(
    "\nData dengan fitur baru 'TotalPurchase':",
    data[['Quantity', 'UnitPrice', 'TotalPurchase']].head(),
    sep="\n",
)

# One-hot encode the categorical 'Country' column; drop_first=True omits the
# indicator column of the first category
data_dummies = pd.get_dummies(data, drop_first=True, columns=['Country'])
print("\nDataset setelah preprocessing:", data_dummies.head(), sep="\n")


Dataframe awal:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom  

Deskripsi Statistik:
            Quantity      UnitPrice     CustomerID
count  541909.000000  541909.000000  406829.000000
mean        9.552250       4.611114   15287.690570
std       218.081158  

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Features: Quantity, UnitPrice and every column whose name contains 'Country'
# (the one-hot indicator columns). Target: TotalPurchase.
# NOTE(review): TotalPurchase is built as Quantity * UnitPrice during
# preprocessing, so the target is a deterministic function of two of the
# features -- confirm this is the intended setup for the exercise.
X = data_dummies[['Quantity', 'UnitPrice', *data_dummies.filter(like='Country').columns]]
y = data_dummies["TotalPurchase"]

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit an ordinary least-squares regressor on the training portion
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows with mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error (MSE) untuk Linear Regression: {mse}")


Mean Squared Error (MSE) untuk Linear Regression: 50451.5360739058


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Binary target: 1 when TotalPurchase is above 1000, otherwise 0
y_binary = data_dummies["TotalPurchase"].gt(1000).astype(int)

# 80/20 train/test split of the same feature matrix X, fixed seed
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_binary, random_state=42, test_size=0.2
)

# K-Nearest Neighbors classifier voting over the 5 closest training rows
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_clf, y_train_clf)

# Predict on the held-out rows, then compute accuracy and the confusion matrix
y_pred_knn = knn_model.predict(X_test_clf)
accuracy_knn = accuracy_score(y_test_clf, y_pred_knn)
conf_matrix_knn = confusion_matrix(y_test_clf, y_pred_knn)

print(f"Accuracy untuk KNN: {accuracy_knn}")
print(f"Confusion Matrix untuk KNN:\n{conf_matrix_knn}")


Accuracy untuk KNN: 0.9995109889095976
Confusion Matrix untuk KNN:
[[108286     19]
 [    34     43]]


In [5]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers of 50 units each, trained for
# at most 300 iterations; the seed fixes the random initialisation
nn_model = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=300,
    random_state=42,
).fit(X_train_clf, y_train_clf)

# Accuracy on the held-out classification split
y_pred_nn = nn_model.predict(X_test_clf)
accuracy_nn = accuracy_score(y_test_clf, y_pred_nn)
print(f"Accuracy untuk Neural Network: {accuracy_nn}")


Accuracy untuk Neural Network: 0.9997785610156668


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Decision tree classifier on the binary TotalPurchase target; the seed makes
# tie-breaking between equally good splits repeatable
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_clf, y_train_clf)

# Accuracy on the held-out classification split
y_pred_tree = tree_model.predict(X_test_clf)
accuracy_tree = accuracy_score(y_test_clf, y_pred_tree)
print(f"Accuracy untuk Decision Tree: {accuracy_tree}")

# Draw the fitted tree.
# feature_names is passed as a plain list rather than the pandas Index returned
# by .columns: some scikit-learn releases validate this argument as a list and
# raise InvalidParameterError when handed an Index.
plt.figure(figsize=(15, 10))
plot_tree(
    tree_model,
    feature_names=list(X_train_clf.columns),
    class_names=["Below 1000", "Above 1000"],
    filled=True,
)
plt.show()


In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Gaussian Naive Bayes classifier fitted on the classification training split
# NOTE(review): the recorded run of this cell reports ~0.04 accuracy; the
# feature matrix is mostly 0/1 country indicators, which may not suit a
# Gaussian likelihood -- confirm whether this model/feature pairing is intended.
nb_model = GaussianNB().fit(X_train_clf, y_train_clf)

# Accuracy on the held-out split
y_pred_nb = nb_model.predict(X_test_clf)
accuracy_nb = accuracy_score(y_test_clf, y_pred_nb)
print(f"Accuracy untuk Naïve Bayesian: {accuracy_nb}")

# Per-class precision / recall / F1 summary
print(
    "Classification Report untuk Naïve Bayesian:",
    classification_report(
        y_test_clf,
        y_pred_nb,
        target_names=["Below 1000", "Above 1000"],
    ),
    sep="\n",
)


Accuracy untuk Naïve Bayesian: 0.040135815910391025
Classification Report untuk Naïve Bayesian:
              precision    recall  f1-score   support

  Below 1000       1.00      0.04      0.08    108305
  Above 1000       0.00      0.99      0.00        77

    accuracy                           0.04    108382
   macro avg       0.50      0.51      0.04    108382
weighted avg       1.00      0.04      0.08    108382

