## Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Making Dummy Data

In [None]:
# Seed NumPy's global RNG so every run regenerates the same dummy dataset.
np.random.seed(42)

# Subject list; each entry becomes one score column.
subjects = [
    "Pendidikan Agama", "Pendidikan Pancasila", "Bahasa Inggris", "Bahasa Mandarin",
    "Matematika (Umum)", "Biologi", "Fisika", "Kimia", "Geografi", "Sejarah",
    "Sosiologi", "Ekonomi", "Pendidikan Jasmani, Olahraga, dan Kesehatan",
    "Informatika", "Seni Musik", "Bahasa Indonesia"
]

# Number of students to generate.
n_students = 5000

# Current-semester simulation: 20% low, 50% mid and the remainder high performers.
low_count = int(0.2 * n_students)
mid_count = int(0.5 * n_students)
high_count = n_students - low_count - mid_count

# Each group's base average is drawn around its own mean (60 / 75 / 90).
low_avg = np.random.normal(loc=60, scale=5, size=low_count)
mid_avg = np.random.normal(loc=75, scale=5, size=mid_count)
high_avg = np.random.normal(loc=90, scale=5, size=high_count)

base_averages = np.concatenate([low_avg, mid_avg, high_avg])
base_averages = np.clip(base_averages, 0, 100)

# Per-subject scores: every student's base average plus independent noise for
# each subject, drawn in a single vectorized call (one row per student).
subject_noise = np.random.normal(loc=0, scale=5, size=(len(base_averages), len(subjects)))
grades_current = np.clip(base_averages[:, np.newaxis] + subject_noise, 0, 100)

# Round the subject scores to integers BEFORE averaging, so that 'Rata-rata'
# is exactly the mean of the scores stored in the frame (the next-semester
# average is likewise computed from rounded integer scores).
df = pd.DataFrame(grades_current, columns=subjects).round().astype(int)
df['Rata-rata'] = df[subjects].mean(axis=1).round(2)

# Performance masks on the current average: <70 low, [70, 85) mid, >=85 high.
low_mask = df['Rata-rata'] < 70
mid_mask = (df['Rata-rata'] >= 70) & (df['Rata-rata'] < 85)
high_mask = df['Rata-rata'] >= 85

# Starting point for the next-semester scores (subject columns only).
df_next_sem = df[subjects].copy()

def add_noise(data, loc=0, scale=5):
    """Return ``data`` shifted by Gaussian noise and limited to the 0-100 range.

    One noise value is drawn from NumPy's global RNG for every element of
    ``data`` (``data`` must expose ``.shape``).

    Args:
        data: Array-like of scores, e.g. a NumPy array or a DataFrame.
        loc: Mean of the noise, i.e. the average shift applied to the scores.
        scale: Standard deviation of the noise.

    Returns:
        The noisy scores, clipped so that no value is below 0 or above 100.
    """
    noise = np.random.normal(loc=loc, scale=scale, size=data.shape)
    noisy_scores = data + noise
    return np.clip(noisy_scores, 0, 100)

# The noisy scores written below are fractional. Cast the working copy to
# float first so the row-wise assignments never have to upcast integer
# columns (pandas deprecates that silent upcasting in setitem operations).
df_next_sem = df_next_sem.astype(float)

# One entry per current-semester group. Each segment is
# (start fraction, end fraction, noise mean, noise std) and applies to that
# slice of the group's shuffled students:
#   low  : 50% stay put, 40% improve a little, 10% improve a lot
#   mid  : 60% stay put, 30% improve a little, 10% decline
#   high : 80% slip slightly, 20% improve slightly
transition_plan = [
    (low_mask, [(0.0, 0.5, 0, 3), (0.5, 0.9, 5, 4), (0.9, 1.0, 10, 5)]),
    (mid_mask, [(0.0, 0.6, 0, 3), (0.6, 0.9, 5, 4), (0.9, 1.0, -7, 5)]),
    (high_mask, [(0.0, 0.8, -3, 3), (0.8, 1.0, 3, 2)]),
]

for group_mask, segments in transition_plan:
    # Shuffle so the students assigned to each segment are picked at random.
    indices = df[group_mask].index.to_numpy()
    np.random.shuffle(indices)
    n = len(indices)
    for start_frac, end_frac, noise_loc, noise_scale in segments:
        segment_rows = indices[int(start_frac * n):int(end_frac * n)]
        df_next_sem.loc[segment_rows] = add_noise(
            df_next_sem.loc[segment_rows], loc=noise_loc, scale=noise_scale
        )

# Round back to integer scores and mark the columns as next-semester values.
df_next_sem = df_next_sem.round().astype(int)
df_next_sem.columns = [col + " (Next Sem)" for col in df_next_sem.columns]

# Combine both semesters side by side and add the next-semester average.
df_final = pd.concat([df, df_next_sem], axis=1)
df_final['Rata-rata (Next Sem)'] = df_next_sem.mean(axis=1).round(2)

# Current-semester category, taken from the masks on the current average:
# <70 -> Low, [70, 85) -> Mid, >=85 -> High.
df_final['Kategori Performa'] = "Unlabeled"
df_final.loc[low_mask, 'Kategori Performa'] = "Low"
df_final.loc[mid_mask, 'Kategori Performa'] = "Mid"
df_final.loc[high_mask, 'Kategori Performa'] = "High"

# Next-semester category, using the SAME boundaries as the masks above.
# right=False makes every bin closed on the left ([70, 85) etc.), so an
# average of exactly 70 or 85 is labelled the same way in both semesters.
df_final['Kategori Performa (Next Sem)'] = pd.cut(
    df_final['Rata-rata (Next Sem)'],
    bins=[-np.inf, 70, 85, np.inf],
    right=False,
    labels=["Low", "Mid", "High"]
)

# Add an absence count based on the performance categories
def generate_absences(current, future):
    """Draw a random absence count for one student.

    The range depends on the (current, next-semester) category pair; any
    pair not listed in the table falls back to 1-9 absences.

    Args:
        current: Current-semester category ("Low", "Mid" or "High").
        future: Next-semester category ("Low", "Mid" or "High").

    Returns:
        An integer drawn with ``np.random.randint(low, high)``; the upper
        bound is exclusive.
    """
    # (current, future) -> (low, high) arguments for np.random.randint.
    absence_ranges = {
        ("Low", "High"): (0, 4),
        ("Low", "Low"): (6, 12),
        ("Low", "Mid"): (4, 8),
        ("Mid", "Mid"): (2, 6),
        ("Mid", "High"): (0, 5),
        ("Mid", "Low"): (2, 12),
        ("High", "High"): (0, 2),
        ("High", "Low"): (0, 6),
        ("High", "Mid"): (0, 3),
    }
    lower, upper = absence_ranges.get((current, future), (1, 10))
    return np.random.randint(lower, upper)

# Draw one absence count per student, in row order, from the student's
# current and next-semester performance categories.
df_final['Jumlah Ketidakhadiran'] = [
    generate_absences(current_category, next_category)
    for current_category, next_category in zip(
        df_final['Kategori Performa'],
        df_final['Kategori Performa (Next Sem)'],
    )
]




In [None]:
# # Tambahkan jumlah ketidakhadiran berdasarkan kategori performa
# def generate_absences(current, future):
#     if current == "Low" and future == "High":
#         return np.random.randint(0, 4)
#     elif current == "Low" and future == "Low":
#         return np.random.randint(6, 12)
#     elif current == "Low" and future == "Mid":
#         return np.random.randint(4, 8)
#     elif current == "Mid" and future == "Mid":
#         return np.random.randint(2, 6)
#     elif current == "Mid" and future == "High":
#         return np.random.randint(0, 5)
#     elif current == "Mid" and future == "Low":
#         return np.random.randint(0, 12)
#     elif current == "High" and future =="High":
#         return np.random.randint(0, 2)
#     elif current == "High" and future =="Low":
#         return np.random.randint(0, 6)
#     elif current == "High" and future =="Mid":
#         return np.random.randint(0, 3)
#     else:
#         return np.random.randint(1, 10)

# df_final['Jumlah Ketidakhadiran'] = df_final.apply(
#     lambda row: generate_absences(row['Kategori Performa'], row['Kategori Performa (Next Sem)']),
#     axis=1
# )

In [None]:
# Quick sanity check of the combined dataset: print the column summary,
# then compute descriptive statistics (shown as the cell's output).
df_final.info()
df_final.describe()

In [None]:
# Preview the first rows of the current-semester frame.
df.head()

In [None]:
# Change in average score from the current to the next semester.
df_final['performance_diff'] = df_final['Rata-rata (Next Sem)'] - df_final['Rata-rata']

# Thresholds, in points (adjust as needed).
slight_threshold = 3  # kept so the name stays defined; not used by the rules below
stable_threshold = 2  # |diff| up to this value counts as "stable"
significant_threshold = 10  # a change at least this large is "significant"

perf_diff = df_final['performance_diff']

# The conditions are mutually exclusive, so the result does not depend on
# np.select's first-match rule. In particular the stable window applies
# symmetrically: a small drop is "stable" just like a small gain, instead
# of being captured by the "slightly decreased" rule.
conditions = [
    # Decreased performance categories
    (perf_diff <= -significant_threshold),
    (perf_diff < -stable_threshold) & (perf_diff > -significant_threshold),

    # Stable performance (small change window)
    (perf_diff.abs() <= stable_threshold),

    # Increased performance categories
    (perf_diff > stable_threshold) & (perf_diff <= significant_threshold),
    (perf_diff > significant_threshold)
]

choices = [
    'significantly decreased performance',
    'slightly decreased performance',
    'stable performance',
    'slightly increased performance',
    'significantly increased performance'
]

# A string default keeps the result dtype consistent with the string choices;
# np.select's implicit default of 0 has no common dtype with them. Rows that
# match no condition (e.g. a missing average) are labelled 'unknown'.
df_final['performance_trend'] = np.select(conditions, choices, default='unknown')

# Optional: the exact difference under a second column name.
df_final['performance_change'] = df_final['performance_diff']

In [None]:


# Plot styling
plt.style.use('default')
sns.set_palette("husl")

# The two columns every table and chart below is built from.
category_col = df_final['Kategori Performa']
trend_col = df_final['performance_trend']

# 2x2 grid: stacked bars, grouped bars, heatmap, pie chart
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax1, ax2), (ax3, ax4) = axes
fig.suptitle('Student Performance Analysis: Categories vs Trends', fontsize=16, fontweight='bold')

# 1. Exact counts (with row/column totals), printed as text
crosstab = pd.crosstab(category_col, trend_col, margins=True)
print("=== Student Counts by Category and Performance Trend ===")
print(crosstab)
print("\n")

# 2 & 3. The same counts without totals, as a stacked and a grouped bar chart
crosstab_no_margins = pd.crosstab(category_col, trend_col)
bar_charts = (
    (ax1, True, 'Stacked Bar: Students by Category & Trend'),
    (ax2, False, 'Grouped Bar: Students by Category & Trend'),
)
for bar_ax, is_stacked, bar_title in bar_charts:
    crosstab_no_margins.plot(kind='bar', stacked=is_stacked, ax=bar_ax, rot=45)
    bar_ax.set_title(bar_title)
    bar_ax.set_xlabel('Performance Category')
    bar_ax.set_ylabel('Number of Students')
    # Anchor the legend outside the axes so it does not cover the bars.
    bar_ax.legend(title='Performance Trend', bbox_to_anchor=(1.05, 1), loc='upper left')

# 4. Heatmap of the counts
sns.heatmap(crosstab_no_margins, annot=True, fmt='d', cmap='YlOrRd', ax=ax3)
ax3.set_title('Heatmap: Student Distribution')
ax3.set_xlabel('Performance Trend')
ax3.set_ylabel('Performance Category')

# 5. Pie chart of the overall trend distribution
trend_counts = trend_col.value_counts()
colors = plt.cm.Set3(range(len(trend_counts)))
wedges, texts, autotexts = ax4.pie(
    trend_counts.values,
    labels=trend_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax4.set_title('Overall Performance Trend Distribution')

plt.tight_layout()
plt.show()

# Per-category breakdown: count and share of each trend
print("=== Detailed Breakdown ===")
for category in ['Low', 'Mid', 'High']:
    print(f"\n{category} Performers:")
    category_data = trend_col[category_col == category].value_counts()
    total = category_data.sum()
    for trend, count in category_data.items():
        percentage = (count/total) * 100
        print(f"  {trend}: {count} students ({percentage:.1f}%)")

# Overall group sizes
print(f"\n=== Summary ===")
print(f"Total Students: {len(df_final)}")
print(f"Low Performers: {(category_col == 'Low').sum()}")
print(f"Mid Performers: {(category_col == 'Mid').sum()}")
print(f"High Performers: {(category_col == 'High').sum()}")

In [None]:
# Mean absence count per student for every (category, trend) combination;
# combinations with no students are filled with 0.
pivot_options = {
    'values': 'Jumlah Ketidakhadiran',
    'index': 'Kategori Performa',
    'columns': 'performance_trend',
    'aggfunc': 'mean',
    'fill_value': 0,
}
pivot_avg = df_final.pivot_table(**pivot_options)

print("Average Absence Days per Student:")
print(pivot_avg.round(1))

# Draw the same table as a heatmap and title the axes it was drawn on.
heatmap_ax = sns.heatmap(pivot_avg, annot=True, fmt='.1f', cmap='Oranges')
heatmap_ax.set_title('Average Absence Days per Student by Category & Trend')
plt.show()

In [None]:
# Export the combined dataset to CSV, leaving out the row index.
df_final.to_csv('daftar_nilai_revised.csv', index=False)

## Reference Code

In [None]:
# # Plot grouped bars
# cross_tab.plot(kind='bar', figsize=(12, 6), colormap='viridis')
# plt.title('Performance Categories by Trend Status')
# plt.xlabel('Performance Level')
# plt.ylabel('Count')
# plt.xticks(rotation=45)
# plt.legend(title='Performance Final')
# plt.show()

# cross_tab = pd.crosstab(
#     index=df_final['Kategori Performa'],
#     columns=df_final['performance_trend'],
#     values=df_final['Jumlah Ketidakhadiran'],
#     aggfunc='sum'
# )

# # Plot heatmap
# plt.figure(figsize=(10, 6))
# sns.heatmap(cross_tab, annot=True, fmt='g', cmap='YlGnBu', linewidths=.5)
# plt.title('Total Absences by Performance and Trend Categories')
# plt.xlabel('Performance Trend')
# plt.ylabel('Performance Category')
# plt.show()

# import seaborn as sns


# # Create cross-tabulation
# cross_tab = pd.crosstab(df_final['Kategori Performa'], df_final['performance_trend'])

# # Plot heatmap
# plt.figure(figsize=(10, 6))
# sns.heatmap(cross_tab, annot=True, fmt='d', cmap='YlGnBu', linewidths=.5)
# plt.title('Relationship Between Performance and Performance Trends')
# plt.xlabel('Performance Trend')
# plt.ylabel('Performance')
# plt.show()
# # Create cross-tab with values as sum of absences

# import matplotlib.pyplot as plt

# # Set up the figure
# plt.figure(figsize=(12, 6))

# # First subplot - performance distribution
# plt.subplot(1, 2, 1)
# df_final['Kategori Performa'].value_counts().plot(kind='bar', color=['#1f77b4', '#ff7f0e', '#2ca02c'])
# plt.title('Performance Distribution')
# plt.xlabel('Performance Category')
# plt.ylabel('Count')
# plt.xticks(rotation=45)

# # Second subplot - performance_final distribution
# plt.subplot(1, 2, 2)
# df_final['performance_trend'].value_counts().plot(kind='bar', color=['#d62728', '#9467bd', '#8c564b'])
# plt.title('Performance Trend Distribution')
# plt.xlabel('Performance Trend Category')
# plt.ylabel('Count')
# plt.xticks(rotation=45)

# plt.tight_layout()
# plt.show()

# df_final.info()
# df_final['performance_trend'].value_counts()

# # Pivot table for cleaner layout
# pivot_table = df_final.pivot_table(
#     values='Jumlah Ketidakhadiran', 
#     index='Kategori Performa', 
#     columns='performance_trend', 
#     aggfunc='count', 
#     fill_value=0
# )
# print(pivot_table)

# # Crosstab for frequency analysis
# crosstab = pd.crosstab(
#     df_final['Kategori Performa'], 
#     df_final['performance_trend'], 
#     margins=True
# )
# print(crosstab)

# # Stacked bar chart
# df_final.groupby(['Kategori Performa', 'performance_trend']).size().unstack().plot(kind='bar', stacked=True)
# plt.title('Performance Trends by Category')
# plt.show()

# # Heatmap
# pivot_data = df_final.pivot_table(values='Jumlah Ketidakhadiran', index='Kategori Performa', columns='performance_trend', aggfunc='count')
# sns.heatmap(pivot_data, annot=True, fmt='g')
# plt.show()

# df_final.info()
# df_final['performance'].value_counts()

# conditions = [
#     df_final['Rata-rata'] > df_final['Rata-rata (Next Sem)'],
#     df_final['Rata-rata'] < df_final['Rata-rata (Next Sem)'],
#     df_final['Rata-rata'] == df_final['Rata-rata (Next Sem)']
# ]

# choices = [
#     'decreased performance',
#     'increased performance',
#     'stable performance'
# ]

# df_final['performance_trend'] = np.select(conditions, choices)

# df_final['performance'] = df_final['Rata-rata'].apply(
#     lambda x: 'low performer' if x < 70 else (
#         'average performer' if 70 <= x < 85 else 'high performer'
#     )
# )

# df_final[['Kategori Performa', 'Kategori Performa (Next Sem)', 'Jumlah Ketidakhadiran']].value_counts().head(10)

## Tambah Kasus yang bervariasi
_Tambah siswa dengan nilai sangat rendah, dll._

## Tambah Kolom Absensi

In [None]:
# # Re-import necessary modules after code execution environment reset
# import pandas as pd
# import numpy as np

# np.random.seed(42)

# # daftar subjek
# subjects = [
#     "Pendidikan Agama", "Pendidikan Pancasila", "Bahasa Inggris", "Bahasa Mandarin",
#     "Matematika (Umum)", "Biologi", "Fisika", "Kimia", "Geografi", "Sejarah",
#     "Sosiologi", "Ekonomi", "Pendidikan Jasmani, Olahraga, dan Kesehatan",
#     "Informatika", "Seni Musik", "Bahasa Indonesia"
# ]

# # students yang bakal di generate
# n_students = 1000

# # Simulasi semester sekarang
# low_count = int(0.2 * n_students)
# mid_count = int(0.5 * n_students)
# high_count = n_students - low_count - mid_count

# low_avg = np.random.normal(loc=60, scale=5, size=low_count)
# mid_avg = np.random.normal(loc=75, scale=5, size=mid_count)
# high_avg = np.random.normal(loc=90, scale=5, size=high_count)

# base_averages = np.concatenate([low_avg, mid_avg, high_avg])
# base_averages = np.clip(base_averages, 0, 100)

# # Simulasi skor subjek per murid
# grades_current = np.array([
#     base + np.random.normal(loc=0, scale=5, size=len(subjects))
#     for base in base_averages
# ])
# grades_current = np.clip(grades_current, 0, 100)

# # dataframe
# df = pd.DataFrame(grades_current, columns=subjects)
# df['Rata-rata'] = df.mean(axis=1)

# # Mask
# low_mask = df['Rata-rata'] < 70
# mid_mask = (df['Rata-rata'] >= 70) & (df['Rata-rata'] < 85)
# high_mask = df['Rata-rata'] >= 85

# df_next_sem = df[subjects].copy()

# def add_noise(data, loc=0, scale=5):
#     return np.clip(data + np.random.normal(loc=loc, scale=scale, size=data.shape), 0, 100)

# # Low performers
# indices = df[low_mask].index.to_numpy()
# np.random.shuffle(indices)
# n = len(indices)
# df_next_sem.loc[indices[:int(0.7 * n)]] = add_noise(df_next_sem.loc[indices[:int(0.7 * n)]], loc=5, scale=4)
# df_next_sem.loc[indices[int(0.7 * n):int(0.9 * n)]] = add_noise(df_next_sem.loc[indices[int(0.7 * n):int(0.9 * n)]], loc=0, scale=3)
# df_next_sem.loc[indices[int(0.9 * n):]] = add_noise(df_next_sem.loc[indices[int(0.9 * n):]], loc=15, scale=5)

# # Mid performers
# indices = df[mid_mask].index.to_numpy()
# np.random.shuffle(indices)
# n = len(indices)
# df_next_sem.loc[indices[:int(0.6 * n)]] = add_noise(df_next_sem.loc[indices[:int(0.6 * n)]], loc=0, scale=3)
# df_next_sem.loc[indices[int(0.6 * n):int(0.9 * n)]] = add_noise(df_next_sem.loc[indices[int(0.6 * n):int(0.9 * n)]], loc=5, scale=4)
# df_next_sem.loc[indices[int(0.9 * n):]] = add_noise(df_next_sem.loc[indices[int(0.9 * n):]], loc=-7, scale=5)

# # High performers
# indices = df[high_mask].index.to_numpy()
# np.random.shuffle(indices)
# n = len(indices)
# df_next_sem.loc[indices[:int(0.8 * n)]] = add_noise(df_next_sem.loc[indices[:int(0.8 * n)]], loc=-3, scale=3)
# df_next_sem.loc[indices[int(0.8 * n):]] = add_noise(df_next_sem.loc[indices[int(0.8 * n):]], loc=0, scale=2)

# # pembulatan dan pergantian nama df
# df_next_sem = df_next_sem.round().astype(int)
# df_next_sem.columns = [col + " (Next Sem)" for col in df_next_sem.columns]

# # kombinasi
# df_final = pd.concat([df, df_next_sem], axis=1)
# df_final['Rata-rata (Next Sem)'] = df_next_sem.mean(axis=1).round(2)

# # Kategori berdasarkan performa dan perubahan
# df_final['Kategori Performa'] = "Unlabeled"
# df_final.loc[low_mask, 'Kategori Performa'] = "Low"
# df_final.loc[mid_mask, 'Kategori Performa'] = "Mid"
# df_final.loc[high_mask, 'Kategori Performa'] = "High"

# # Kategori perubahan
# df_final['Kategori Performa (Next)'] = pd.cut(
#     df_final['Rata-rata (Next Sem)'],
#     bins=[-1, 70, 85, 100],
#     labels=["Low", "Mid", "High"]
# )

# # Tambahkan jumlah ketidakhadiran berdasarkan kategori performa
# def generate_absences(current, future):
#     if current == "Low" and future == "High":
#         return np.random.randint(0, 4)
#     elif current == "Low" and future == "Low":
#         return np.random.randint(7, 16)
#     elif current == "Mid" and future == "Mid":
#         return np.random.randint(2, 9)
#     elif current == "Mid" and future == "High":
#         return np.random.randint(0, 7)
#     elif current == "High":
#         return np.random.randint(0, 5)
#     else:
#         return np.random.randint(1, 10)

# df_final['Jumlah Ketidakhadiran'] = df_final.apply(
#     lambda row: generate_absences(row['Kategori Performa'], row['Kategori Performa (Next)']),
#     axis=1
# )

# df_final[['Kategori Performa', 'Kategori Performa (Next)', 'Jumlah Ketidakhadiran']].value_counts().head(10)


## Hapus Kolom Tidak Digunakan

In [None]:
# Planning notes for a more realistic generator.
# Subject groups whose scores are meant to be correlated with each other;
# NOTE(review): no code visible in this notebook uses these lists yet.
stem_subjects = ["Matematika (Umum)", "Biologi", "Fisika", "Kimia"]
language_subjects = ["Bahasa Indonesia", "Bahasa Inggris", "Bahasa Mandarin"]
# TODO: apply a different correlation pattern to each group.

# TODO: use more conservative improvement ranges in the next-semester simulation:
# Low performers: +2 to +8 instead of +15
# High performers: -1 to +2 instead of -3

## Splitting Data

# 🧠 Feature Engineering

## Hitung Nilai Rata-rata per Siswa

## Normalisasi Nilai

## Tambah Fitur Tren Peningkatan/Penurunan Nilai

## Tambah Fitur Statistik

# 🏗️ Model Development (Compliance Focus)

## Bangun Model TensorFlow (tanpa API luar)

## PCA (jika diperlukan)

## Cross Validation

## Simpan Model

# 🧪 Evaluation

## Evaluasi dengan MAE, RMSE, R²

## Visualisasi Prediksi vs Nilai Asli

# 🚀 Inference & Integration

## Kode Inference Sederhana

## Integrasi ke UI atau API