In [2]:
# Stage 1 banner: title, group members, dataset name, then the heading for
# step 1.1. All lines are emitted with a single print call.
print(
    "\n".join([
        "=" * 80,
        " TAHAP 1: SETUP ENVIRONMENT & LOAD DATASET ".center(80, "="),
        "=" * 80,
        "\nKelompok 8: Rahma Fitria Tunnisa & Rahmawati",
        "Dataset: Higher Education Students Performance Evaluation (UCI)",
        "=" * 80,
        "\n[1.1] Import Libraries...",
        "-" * 80,
    ])
)

# Libraries untuk manipulasi data
import pandas as pd
import numpy as np
import os
import warnings

# Libraries untuk visualisasi
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries untuk machine learning (preprocessing)
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Suppress warnings for cleaner cell output.
# NOTE(review): this is a blanket filter, so deprecation warnings from every
# library are hidden as well.
warnings.filterwarnings('ignore')

# Seed NumPy's global random state for reproducibility
np.random.seed(42)

# pandas display configuration
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 100)

# Plot style configuration.
# 'seaborn-v0_8-whitegrid' only exists in matplotlib >= 3.6; older releases
# ship the same style as 'seaborn-whitegrid'. style.use() raises OSError for
# an unknown style name, so fall back instead of aborting the setup cell.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.dpi': 100,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
})

# Report the versions of the main libraries for the record
print("‚úÖ Libraries berhasil di-import!")
print(f"   ‚Ä¢ Pandas version: {pd.__version__}")
print(f"   ‚Ä¢ NumPy version: {np.__version__}")
print(f"   ‚Ä¢ Matplotlib version: {plt.matplotlib.__version__}")
print(f"   ‚Ä¢ Seaborn version: {sns.__version__}")



Kelompok 8: Rahma Fitria Tunnisa & Rahmawati
Dataset: Higher Education Students Performance Evaluation (UCI)

[1.1] Import Libraries...
--------------------------------------------------------------------------------


ModuleNotFoundError: No module named 'pandas'

In [None]:
# ============================================================================
# 1.2 CREATE THE PROJECT FOLDER STRUCTURE AUTOMATICALLY
# ============================================================================

print("\n[1.2] Membuat Struktur Folder Project...")
print("-"*80)

# Folder layout used by the rest of the project
folders = [
    'data/raw',                 # Original dataset
    'data/processed',           # Data after preprocessing
    'results/figures',          # Visualizations (PNG/JPG)
    'results/tables',           # Result tables (CSV)
    'results/reports',          # Text reports (TXT)
    'models'                    # Trained models (PKL)
]

# Create each folder that is missing, tracking which ones were new
created = []
existing = []

for folder in folders:
    # isdir (not exists): a regular file sitting at this path must not be
    # reported as an existing folder.
    if os.path.isdir(folder):
        existing.append(folder)
        continue
    # exist_ok=True tolerates the directory appearing between the check and
    # the call; it still raises FileExistsError if the path is a regular file.
    os.makedirs(folder, exist_ok=True)
    created.append(folder)

# Show the result
if created:
    print(f"‚úÖ Berhasil membuat {len(created)} folder baru:")
    for folder in created:
        print(f"   üìÅ {folder}/")

if existing:
    print(f"\n‚úÖ {len(existing)} folder sudah ada:")
    for folder in existing:
        print(f"   üìÅ {folder}/")

print(f"\n‚úÖ Total folder: {len(folders)}")


[1.2] Membuat Struktur Folder Project...
--------------------------------------------------------------------------------

‚úÖ 6 folder sudah ada:
   üìÅ data/raw/
   üìÅ data/processed/
   üìÅ results/figures/
   üìÅ results/tables/
   üìÅ results/reports/
   üìÅ models/

‚úÖ Total folder: 6


In [None]:
# ============================================================================
# 1.3 LOAD DATASET
# ============================================================================

print("\n[1.3] Loading Dataset...")
print("-"*80)

# Candidate file locations, tried in order
file_locations = [
    'DATA (1).csv',                    # Location 1: root folder
    'data/raw/DATA (1).csv',          # Location 2: data/raw/ folder
    '../DATA (1).csv',                # Location 3: parent folder
    'student_performance.csv',        # Location 4: if the file was renamed
]

# Results of the search; all stay None when nothing could be loaded
df = None
loaded_from = None
encoding_used = None

print("üîç Mencari file dataset...")

for location in file_locations:
    print(f"   Mencoba: {location}...", end=" ")
    read_error = None  # last non-decoding failure seen for this location

    # Try several encodings.
    # NOTE: 'latin-1' and 'iso-8859-1' name the same codec, and that codec
    # accepts every byte sequence, so 'cp1252' is never reached because of a
    # decoding error. The list is kept as-is to preserve behavior.
    for encoding in ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']:
        try:
            df = pd.read_csv(location, encoding=encoding)
        except FileNotFoundError:
            # A missing file is missing for every encoding: stop retrying.
            break
        except UnicodeDecodeError:
            continue
        except Exception as e:
            # Keep the error so it can be reported instead of being
            # mislabelled as "file not found" below.
            read_error = e
            continue

        loaded_from = location
        encoding_used = encoding
        print(f"‚úÖ BERHASIL (encoding: {encoding})")
        break

    if df is not None:
        break
    if read_error is None:
        print("‚ùå Tidak ditemukan")
    else:
        print(f"GAGAL dibaca ({type(read_error).__name__}: {read_error})")

# Validate that the dataset was actually loaded
if df is None:
    print("\n" + "="*80)
    print("‚ùå DATASET TIDAK DITEMUKAN!")
    print("="*80)
    print("\nüìå SOLUSI:")
    print("1. Pastikan file 'DATA (1).csv' ada di salah satu lokasi:")
    for loc in file_locations:
        print(f"   ‚Ä¢ {loc}")
    print("\n2. Upload file menggunakan menu Jupyter Notebook")
    print("\n3. Atau jalankan kode berikut untuk upload manual:")
    print("   from google.colab import files")
    print("   uploaded = files.upload()")
    print("   df = pd.read_csv('DATA (1).csv')")
    print("="*80)
    raise FileNotFoundError("Dataset tidak ditemukan! Ikuti instruksi di atas.")



[1.3] Loading Dataset...
--------------------------------------------------------------------------------
üîç Mencari file dataset...
   Mencoba: DATA (1).csv... ‚ùå Tidak ditemukan
   Mencoba: data/raw/DATA (1).csv... ‚úÖ BERHASIL (encoding: utf-8)


In [None]:
# ============================================================================
# 1.4 BASIC DATASET INFORMATION
# ============================================================================

print("\n[1.4] Informasi Dasar Dataset")
print("-"*80)

# Where the data came from and how it was decoded
print(f"‚úÖ Dataset berhasil dimuat dari: {loaded_from}")
print(f"‚úÖ Encoding yang digunakan: {encoding_used}")

# Dimensions and memory footprint
n_rows, n_cols = df.shape
memory_kb = df.memory_usage(deep=True).sum() / 1024

print("\nüìä DIMENSI DATASET:")
print(f"   ‚Ä¢ Jumlah baris (sampel): {n_rows}")
print(f"   ‚Ä¢ Jumlah kolom (fitur): {n_cols}")
print(f"   ‚Ä¢ Total data points: {n_rows * n_cols:,}")
print(f"   ‚Ä¢ Memory usage: {memory_kb:.2f} KB")

# One line per column: position, name, dtype and number of distinct values
print(f"\nüìã NAMA KOLOM ({len(df.columns)} kolom):")
for position, column in enumerate(df.columns, start=1):
    series = df[column]
    print(f"   {position:2d}. {column:<35} | Type: {str(series.dtype):<10} | Unique: {series.nunique():>3}")


[1.4] Informasi Dasar Dataset
--------------------------------------------------------------------------------
‚úÖ Dataset berhasil dimuat dari: data/raw/DATA (1).csv
‚úÖ Encoding yang digunakan: utf-8

üìä DIMENSI DATASET:
   ‚Ä¢ Jumlah baris (sampel): 145
   ‚Ä¢ Jumlah kolom (fitur): 33
   ‚Ä¢ Total data points: 4,785
   ‚Ä¢ Memory usage: 44.63 KB

üìã NAMA KOLOM (33 kolom):
    1. STUDENT ID                          | Type: object     | Unique: 145
    2. 1                                   | Type: int64      | Unique:   3
    3. 2                                   | Type: int64      | Unique:   2
    4. 3                                   | Type: int64      | Unique:   3
    5. 4                                   | Type: int64      | Unique:   5
    6. 5                                   | Type: int64      | Unique:   2
    7. 6                                   | Type: int64      | Unique:   2
    8. 7                                   | Type: int64      | Unique:   2
    9. 8 

In [None]:
# ============================================================================
# 1.5 DATA PREVIEW
# ============================================================================

print("\n[1.5] Preview Data")
print("-"*80)

# Each entry pairs a heading with a callable producing the rows to show.
# The frames are built lazily so every heading is printed before its frame
# is computed.
preview_steps = [
    ("\nüîç 5 BARIS PERTAMA:", lambda: df.head()),
    ("\nüîç 5 BARIS TERAKHIR:", lambda: df.tail()),
    ("\nüîç SAMPLE RANDOM (3 baris):", lambda: df.sample(3, random_state=42)),
]

for heading, make_frame in preview_steps:
    print(heading)
    print(make_frame())




[1.5] Preview Data
--------------------------------------------------------------------------------

üîç 5 BARIS PERTAMA:
  STUDENT ID  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  COURSE ID  GRADE
0   STUDENT1  2  2  3  3  1  2  2  1  1   1   1   2   3   1   2   5   3   2   2   1   1   1   1   1   3   2   1   2   1   1          1      1
1   STUDENT2  2  2  3  3  1  2  2  1  1   1   2   3   2   1   2   1   2   2   2   1   1   1   1   1   3   2   3   2   2   3          1      1
2   STUDENT3  2  2  2  3  2  2  2  2  4   2   2   2   2   1   2   1   2   1   2   1   1   1   1   1   2   2   1   1   2   2          1      1
3   STUDENT4  1  1  1  3  1  2  1  2  1   2   1   2   5   1   2   1   3   1   2   1   1   1   1   2   3   2   2   1   3   2          1      1
4   STUDENT5  2  2  1  3  2  2  1  3  1   4   3   3   2   1   2   4   2   1   1   1   1   1   2   1   2   2   2   1   2   2          1      1

üîç 5 BARIS TERAKHIR:


In [None]:
# ============================================================================
# 1.6 DATA TYPES AND MISSING VALUES
# ============================================================================

print("\n[1.6] Analisis Tipe Data dan Missing Values")
print("-"*80)

# How many columns there are of each dtype
print("\nüìà DISTRIBUSI TIPE DATA:")
print(df.dtypes.value_counts())

# Split the column names into numeric and object-typed groups
print("\nüìä BREAKDOWN TIPE DATA:")
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"   ‚Ä¢ Numerik: {len(numeric_cols)} kolom")
print(f"   ‚Ä¢ Kategorik: {len(categorical_cols)} kolom")

# Per-column null counts and the matching percentage of all rows
print("\n‚ùì MISSING VALUES:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

if missing.sum() != 0:
    print(f"   Total missing values: {missing.sum()}")
    print("\n   Kolom dengan missing values:")
    # Keep only the columns that actually contain nulls
    affected = missing > 0
    missing_df = pd.DataFrame({
        'Column': missing[affected].index,
        'Missing Count': missing[affected].values,
        'Percentage': missing_pct[affected].values
    })
    print(missing_df.to_string(index=False))
else:
    print("   ‚úÖ TIDAK ADA MISSING VALUES!")


[1.6] Analisis Tipe Data dan Missing Values
--------------------------------------------------------------------------------

üìà DISTRIBUSI TIPE DATA:
int64     32
object     1
Name: count, dtype: int64

üìä BREAKDOWN TIPE DATA:
   ‚Ä¢ Numerik: 32 kolom
   ‚Ä¢ Kategorik: 1 kolom

‚ùì MISSING VALUES:
   ‚úÖ TIDAK ADA MISSING VALUES!


In [None]:
# ============================================================================
# 1.7 DESCRIPTIVE STATISTICS
# ============================================================================

print("\n[1.7] Statistik Deskriptif")
print("-"*80)

if not numeric_cols:
    # Nothing to summarise when the frame has no numeric columns
    print("\n‚ö†Ô∏è Tidak ada kolom numerik untuk analisis statistik")
else:
    print("\nüìä STATISTIK FITUR NUMERIK:")
    numeric_view = df[numeric_cols]
    # One row per feature: describe() output transposed, plus a null count
    stats = numeric_view.describe().transpose()
    stats['missing'] = numeric_view.isnull().sum()
    print(stats)



[1.7] Statistik Deskriptif
--------------------------------------------------------------------------------

üìä STATISTIK FITUR NUMERIK:
           count      mean       std  min  25%  50%  75%  max  missing
1          145.0  1.620690  0.613154  1.0  1.0  2.0  2.0  3.0        0
2          145.0  1.600000  0.491596  1.0  1.0  2.0  2.0  2.0        0
3          145.0  1.944828  0.537216  1.0  2.0  2.0  2.0  3.0        0
4          145.0  3.572414  0.805750  1.0  3.0  3.0  4.0  5.0        0
5          145.0  1.662069  0.474644  1.0  1.0  2.0  2.0  2.0        0
6          145.0  1.600000  0.491596  1.0  1.0  2.0  2.0  2.0        0
7          145.0  1.579310  0.495381  1.0  1.0  2.0  2.0  2.0        0
8          145.0  1.627586  1.020245  1.0  1.0  1.0  2.0  5.0        0
9          145.0  1.620690  1.061112  1.0  1.0  1.0  2.0  4.0        0
10         145.0  1.731034  0.783999  1.0  1.0  2.0  2.0  4.0        0
11         145.0  2.282759  1.223062  1.0  1.0  2.0  3.0  6.0        0
12      

In [None]:

# ============================================================================
# 1.8 IDENTIFY THE TARGET COLUMN
# ============================================================================

print("\n[1.8] Identifikasi Kolom Target")
print("-"*80)

# Candidate target column names, checked in this priority order
possible_targets = [
    'GRADE', 'Grade', 'grade', 
    'OUTPUT', 'Output', 'output',
    'FINAL_GRADE', 'final_grade',
    'CLASS', 'Class', 'class',
    'LABEL', 'Label', 'label',
    'TARGET', 'Target', 'target'
]

# First candidate present in the DataFrame, or None when none matches
target_col = next((name for name in possible_targets if name in df.columns), None)

if target_col is None:
    # No automatic match: list the available columns so one can be set by hand
    print("‚ö†Ô∏è Kolom target tidak terdeteksi otomatis.")
    print("\nüí° Silakan tentukan kolom target secara manual:")
    print("   target_col = 'NAMA_KOLOM'")
    print("\nüìã Kolom yang tersedia:")
    for i, col in enumerate(df.columns, 1):
        n_unique = df[col].nunique()
        print(f"   {i:2d}. {col:<30} (unique values: {n_unique})")
else:
    print(f"‚úÖ Kolom target terdeteksi: '{target_col}'")

    # Target analysis.
    # .tolist() converts NumPy scalars to plain Python values; without it the
    # class list is printed as [np.int64(0), np.int64(1), ...] on NumPy >= 2.
    target_classes = sorted(df[target_col].unique().tolist())
    print(f"\nüìä ANALISIS TARGET: '{target_col}'")
    print(f"   ‚Ä¢ Jumlah kelas unik: {df[target_col].nunique()}")
    print(f"   ‚Ä¢ Kelas: {target_classes}")

    # Class distribution with a text bar: one block character per 2%
    print("\n   DISTRIBUSI KELAS:")
    dist = df[target_col].value_counts().sort_index()
    for grade, count in dist.items():
        pct = (count / len(df)) * 100
        bar = "‚ñà" * int(pct / 2)
        print(f"   {grade:<10} : {count:>3} ({pct:>5.2f}%) {bar}")


[1.8] Identifikasi Kolom Target
--------------------------------------------------------------------------------
‚úÖ Kolom target terdeteksi: 'GRADE'

üìä ANALISIS TARGET: 'GRADE'
   ‚Ä¢ Jumlah kelas unik: 8
   ‚Ä¢ Kelas: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)]

   DISTRIBUSI KELAS:
   0          :   8 ( 5.52%) ‚ñà‚ñà
   1          :  35 (24.14%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   2          :  24 (16.55%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   3          :  21 (14.48%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   4          :  10 ( 6.90%) ‚ñà‚ñà‚ñà
   5          :  17 (11.72%) ‚ñà‚ñà‚ñà‚ñà‚ñà
   6          :  13 ( 8.97%) ‚ñà‚ñà‚ñà‚ñà
   7          :  17 (11.72%) ‚ñà‚ñà‚ñà‚ñà‚ñà


In [None]:
# ============================================================================
# 1.9 SAVE CHECKPOINT
# ============================================================================

print("\n[1.9] Save Checkpoint")
print("-"*80)

# Save the raw data into the processed folder for documentation.
# The parent directory is created here as well so this cell does not fail
# when the output folders are missing.
checkpoint_path = 'data/processed/00_raw_loaded.csv'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
df.to_csv(checkpoint_path, index=False)
print(f"‚úÖ Dataset disimpan ke: {checkpoint_path}")

# Write a plain-text summary of the loaded dataset
summary_path = 'results/reports/01_dataset_info.txt'
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write(" INFORMASI DATASET - TAHAP 1 ".center(80, "=") + "\n")
    f.write("="*80 + "\n\n")
    f.write(f"File source: {loaded_from}\n")
    f.write(f"Encoding: {encoding_used}\n")
    f.write(f"Jumlah baris: {df.shape[0]}\n")
    f.write(f"Jumlah kolom: {df.shape[1]}\n")
    f.write(f"Total data points: {df.shape[0] * df.shape[1]:,}\n")
    f.write(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB\n\n")
    f.write("Kolom:\n")
    for i, col in enumerate(df.columns, 1):
        f.write(f"  {i:2d}. {col}\n")
    f.write(f"\nTarget column: {target_col}\n")
    if target_col:
        # .tolist() yields plain Python values, so the report shows
        # [0, 1, ...] rather than [np.int64(0), ...] on NumPy >= 2.
        target_classes = sorted(df[target_col].unique().tolist())
        f.write(f"Target classes: {target_classes}\n")

print(f"‚úÖ Summary info disimpan ke: {summary_path}")


[1.9] Save Checkpoint
--------------------------------------------------------------------------------
‚úÖ Dataset disimpan ke: data/processed/00_raw_loaded.csv
‚úÖ Summary info disimpan ke: results/reports/01_dataset_info.txt


In [None]:
# ============================================================================
# RINGKASAN TAHAP 1
# ============================================================================

print("\n" + "="*80)
print(" ‚úÖ TAHAP 1 SELESAI ".center(80, "="))
print("="*80)

print("\nüìå RINGKASAN:")
print(f"   ‚úì Dataset berhasil dimuat: {df.shape[0]} baris √ó {df.shape[1]} kolom")
print(f"   ‚úì Target column: {target_col if target_col else 'Belum teridentifikasi'}")
print(f"   ‚úì Missing values: {'Tidak ada' if missing.sum() == 0 else f'{missing.sum()} values'}")
print(f"   ‚úì Struktur folder: OK")
print(f"   ‚úì Checkpoint saved: OK")

print("\nüéØ LANGKAH SELANJUTNYA:")
print("   ‚Üí Lanjut ke TAHAP 2: Exploratory Data Analysis (EDA)")
print("   ‚Üí Jalankan notebook berikutnya atau cell berikutnya")

print("\n" + "="*80)

# Ekspose variabel untuk digunakan di tahap berikutnya
print("\nüíæ VARIABEL YANG TERSEDIA UNTUK TAHAP SELANJUTNYA:")
print(f"   ‚Ä¢ df          : DataFrame utama ({df.shape})")
print(f"   ‚Ä¢ target_col  : Nama kolom target ('{target_col}')")
print(f"   ‚Ä¢ numeric_cols: List kolom numerik ({len(numeric_cols)} cols)")
print(f"   ‚Ä¢ categorical_cols: List kolom kategorik ({len(categorical_cols)} cols)")

print("\n‚úÖ SIAP UNTUK TAHAP 2!")



üìå RINGKASAN:
   ‚úì Dataset berhasil dimuat: 145 baris √ó 33 kolom
   ‚úì Target column: GRADE
   ‚úì Missing values: Tidak ada
   ‚úì Struktur folder: OK
   ‚úì Checkpoint saved: OK

üéØ LANGKAH SELANJUTNYA:
   ‚Üí Lanjut ke TAHAP 2: Exploratory Data Analysis (EDA)
   ‚Üí Jalankan notebook berikutnya atau cell berikutnya


üíæ VARIABEL YANG TERSEDIA UNTUK TAHAP SELANJUTNYA:
   ‚Ä¢ df          : DataFrame utama ((145, 33))
   ‚Ä¢ target_col  : Nama kolom target ('GRADE')
   ‚Ä¢ numeric_cols: List kolom numerik (32 cols)
   ‚Ä¢ categorical_cols: List kolom kategorik (1 cols)

‚úÖ SIAP UNTUK TAHAP 2!
