# 학생 성과 예측

[1] 모듈 로드

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

[2] 데이터 준비

In [2]:
# Load the student-performance CSV (path is relative to the working directory)
# and print the column / dtype / non-null summary.
dataDF=pd.read_csv('student_performance_prediction.csv')
dataDF.info()

FileNotFoundError: [Errno 2] No such file or directory: 'student_performance_prediction.csv'

In [None]:
# Preview the first rows of the raw data.
dataDF.head()  

In [None]:
# Summary statistics of the raw data, before any cleaning.
dataDF.describe()

In [None]:
# Check missing values: number of NaN entries per column.
dataDF.isna().sum()

[3] 데이터 전처리

In [None]:
# Study Hours per Week: replace NaN with 0, then keep only rows whose value
# is non-negative (>= 0).
dataDF = dataDF.fillna({'Study Hours per Week': 0})
dataDF = dataDF.loc[dataDF['Study Hours per Week'].ge(0)]
dataDF

In [None]:
# Attendance Rate: keep only rows with 0 < rate <= 100.
# NaN fails both comparisons, so rows with a missing rate are dropped as well.
# NOTE(review): a rate of exactly 0 is also removed - confirm that is intended.
mask = dataDF['Attendance Rate'].gt(0) & dataDF['Attendance Rate'].le(100)
dataDF = dataDF.loc[mask]
dataDF

In [None]:
# Previous Grades: drop rows above 100, fill NaN with the median.
# Participation in Extracurricular Activities: NaN -> 'No'.
# Parent Education Level: NaN -> 'Under_Middle' (middle school or below).

# A bare `<= 100` comparison evaluates to False for NaN, which would silently
# drop exactly the rows that are supposed to be imputed below. Keep NaN rows
# explicitly so that fillna() actually has something to fill.
mask = dataDF['Previous Grades'].isna() | (dataDF['Previous Grades'] <= 100)
dataDF = dataDF[mask].copy()

values = {
    # Median of the remaining valid grades, computed from the data instead of
    # a hard-coded constant that goes stale when the dataset changes.
    'Previous Grades': dataDF['Previous Grades'].median(),
    'Participation in Extracurricular Activities': 'No',
    'Parent Education Level': 'Under_Middle',
}
dataDF.fillna(value=values, inplace=True)

dataDF

In [None]:
# Target column 'Passed': rows without a label cannot be used, so drop them.
dataDF = dataDF.loc[dataDF['Passed'].notna()]
dataDF

In [None]:
# Re-check the per-column NaN counts after cleaning.
dataDF.isna().sum()

In [None]:
# Summary statistics after cleaning.
dataDF.describe()

In [None]:
# Frequency of each 'Parent Education Level' category.
dataDF['Parent Education Level'].value_counts()

In [None]:
# Frequency of each 'Participation in Extracurricular Activities' category.
dataDF['Participation in Extracurricular Activities'].value_counts()

In [None]:
# Remove the 'Student ID' column before encoding and modelling.
dataDF = dataDF.drop(columns=['Student ID'])
dataDF

In [15]:
# Create a LabelEncoder instance.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
# Encode the data.
# Only non-numeric (categorical) columns are label-encoded. Running
# LabelEncoder over the numeric columns would replace the real measurements
# with rank codes, which a saved model could not reproduce for new raw inputs.
# One encoder is kept per column so each mapping stays available through
# encoders[col].classes_ / inverse_transform().
encoders = {}
for c in dataDF.select_dtypes(exclude='number').columns:
    encoders[c] = LabelEncoder()
    dataDF[c] = encoders[c].fit_transform(dataDF[c])

dataDF.head()

In [None]:
# Print the distinct values of every column, one column per line.
for col_name, col_values in dataDF.items():
    print(f'{col_name} : {col_values.unique()}')


In [None]:
# Pairwise correlation between all columns of the encoded frame.
dataDF.corr()

In [19]:
def draw_bar(df):
    """Draw one value-count bar chart per column of ``df``.

    The charts are placed on a grid three subplots wide, each titled with its
    column name, and the finished figure is displayed with ``plt.show()``.

    Args:
        df: DataFrame-like object; each entry of ``df.columns`` is counted
            with ``value_counts()`` and plotted.
    """
    n_cols = 3
    # Ceiling division: the number of grid rows needed to fit every column.
    n_rows = -(-len(df.columns) // n_cols)

    # Pastel palette handed to every chart as its bar colors.
    palette = [
        "#AED6F1",  # light sky blue
        "#A9DFBF",  # light green
        "#F9E79F",  # light yellow
        "#F5B7B1",  # light pink
        "#D2B4DE",  # light purple
        "#FAD7A0",  # light orange
        "#D5DBDB",  # light gray
        "#A3E4D7",  # light mint
        "#D7BDE2",  # light lavender
        "#F5CBA7",  # light apricot
    ]

    plt.figure(figsize=(15, n_rows * 4))
    for position, name in enumerate(df.columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        counts = df[name].value_counts()
        counts.plot(kind='bar', color=palette)
        plt.title(name)
        plt.xticks(rotation=360)
    plt.tight_layout()
    plt.show()

In [None]:
# Plot the value distribution of every column in the encoded frame.
draw_bar(dataDF)

## DecisionTree

- target, feature 분리

In [21]:
# Features: every column except the last one.
featureDF = dataDF.iloc[:, :-1]
# Target: the last column (presumably 'Passed' - verify the column order).
# Selecting it with a scalar position yields a 1-D Series, matching the
# `targetSR` name; the previous `columns[-1:]` slice produced a one-column
# DataFrame instead.
targetSR = dataDF.iloc[:, -1]

In [None]:
# Preview the feature columns.
featureDF.head()

In [None]:
# Preview the target values.
targetSR.head()

- 학습과 테스트 데이터 세트로 분리

In [24]:
# Split features and target into train / test parts; random_state fixes the
# shuffle so the split is reproducible. No test_size is given, so the
# library default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    random_state=777)

- DecisionTree Classifier 생성

In [25]:
# Decision tree with default hyper-parameters; random_state makes it reproducible.
dt_clf=DecisionTreeClassifier(random_state=777)

- 학습 

In [None]:
# Fit the baseline tree on the training split.
dt_clf.fit(X_train,y_train)

In [27]:
# export_graphviz() writes the fitted tree in Graphviz DOT format to the file
# named by out_file (Student_tree.dot).
export_graphviz(dt_clf, out_file='Student_tree.dot',
                class_names=['0', '1'],
                # Pass a plain list of names rather than a pandas Index; a list
                # is the form accepted by every scikit-learn release.
                feature_names=list(featureDF.columns),
                impurity=True, filled=True)

In [None]:
import graphviz

# Read the exported DOT file back and render it inline.
# export_graphviz() writes the file as UTF-8, so read it with the same
# encoding instead of relying on the platform default.
with open('Student_tree.dot', encoding='utf-8') as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

In [None]:
# Score of the baseline tree on the held-out test split.
dt_clf.score(X_test,y_test)

In [30]:
# Hyper-parameter grid for the first search: tree depth and minimum samples per split.
parameters={'max_depth':[10,15,20,25,30],'min_samples_split':[2,3,4,5]}

In [31]:
# Grid search over `parameters` with 5-fold cross-validation; refit=True
# retrains the best candidate so it is available as best_estimator_.
from sklearn.model_selection import GridSearchCV
grid_dtree=GridSearchCV(dt_clf,param_grid=parameters,cv=5,refit=True)

In [None]:
# Run the grid search on the training split.
grid_dtree.fit(X_train,y_train)

In [None]:
# Tabulate the cross-validation results and show, per candidate, its
# parameters, mean score, rank and the scores of the first three folds.
dataDF2 = pd.DataFrame(grid_dtree.cv_results_)
dataDF2.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

In [None]:
# Best model found by the grid search.
best_model = grid_dtree.best_estimator_
print(f'best_model.max_depth: {best_model.max_depth}')
# The grid tunes min_samples_split (min_samples_leaf is never searched), so
# report the parameter that was actually selected.
print(f'best_model.min_samples_split: {best_model.min_samples_split}')

In [None]:

# Horizontal bar chart of the best model's feature importances, one bar per
# feature, labelled with the feature names seen during fit.
feature_positions = np.arange(best_model.n_features_in_)
plt.barh(feature_positions, best_model.feature_importances_, align='center')
plt.yticks(feature_positions, best_model.feature_names_in_)
plt.xlabel('[Feature Importance]')
plt.ylabel('Feature')
plt.show()

In [36]:
# Take the best decision tree found by GridSearchCV.
best_dtree = grid_dtree.best_estimator_

# Export the tree in Graphviz DOT format for visualization.
export_graphviz(best_dtree,
                out_file='Student_tree2.dot',
                class_names=['0', '1'],
                # Pass a plain list of names rather than a pandas Index; a list
                # is the form accepted by every scikit-learn release.
                feature_names=list(featureDF.columns),
                impurity=True,
                filled=True)

In [None]:
# Read the generated DOT file and render it.
# export_graphviz() writes the file as UTF-8, so read it with the same
# encoding instead of relying on the platform default.
with open('Student_tree2.dot', encoding='utf-8') as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)

In [None]:
# Second search: shallower trees (max_depth 5..10) and a wider
# min_samples_split range (2..9), evaluated with 20-fold cross-validation.
parameters2 = {
    'max_depth': list(range(5, 11)),
    'min_samples_split': list(range(2, 10)),
}
grid_dtree2 = GridSearchCV(estimator=dt_clf, param_grid=parameters2, cv=20, refit=True)
grid_dtree2.fit(X_train, y_train)

# Per-candidate results; only the first three of the fold scores are shown.
dataDF3 = pd.DataFrame(grid_dtree2.cv_results_)
dataDF3.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]



In [None]:
# Best model found by the second grid search.
best_model = grid_dtree2.best_estimator_
print(f'best_model.max_depth: {best_model.max_depth}')
# The grid tunes min_samples_split (min_samples_leaf is never searched), so
# report the parameter that was actually selected.
print(f'best_model.min_samples_split: {best_model.min_samples_split}')


# Horizontal bar chart of the best model's feature importances.
plt.barh(range(best_model.n_features_in_), best_model.feature_importances_, align='center')
plt.yticks(np.arange(best_model.n_features_in_), best_model.feature_names_in_)
plt.xlabel('[Feature Importance]')
plt.ylabel('Feature')
plt.show()


In [None]:
# Take the best decision tree found by the most recent grid search.
best_dtree3 = grid_dtree2.best_estimator_

# Export the tree in Graphviz DOT format for visualization.
# Note: this reuses (and overwrites) Student_tree2.dot.
export_graphviz(best_dtree3,
                out_file='Student_tree2.dot',
                class_names=['0', '1'],
                # Pass a plain list of names rather than a pandas Index; a list
                # is the form accepted by every scikit-learn release.
                feature_names=list(featureDF.columns),
                impurity=True,
                filled=True)


# Read the generated DOT file and render it.
# export_graphviz() writes the file as UTF-8, so read it with the same
# encoding instead of relying on the platform default.
with open('Student_tree2.dot', encoding='utf-8') as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict on the held-out test split and build the confusion matrix.
pred = best_dtree3.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Passing the predictions first transposes the
# matrix and swaps false positives with false negatives.
confusion = confusion_matrix(y_test, pred)
confusion

In [None]:
# Same shallow grid as before, with the tree additionally capped at 10 leaf
# nodes; evaluated with 20-fold cross-validation.
parameters2 = {
    'max_depth': list(range(5, 11)),
    'max_leaf_nodes': [10],
    'min_samples_split': list(range(2, 10)),
}
grid_dtree2 = GridSearchCV(estimator=dt_clf, param_grid=parameters2, cv=20, refit=True)
grid_dtree2.fit(X_train, y_train)

# Per-candidate results; only the first three of the fold scores are shown.
dataDF3 = pd.DataFrame(grid_dtree2.cv_results_)
dataDF3.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

In [None]:
# Take the best decision tree found by the most recent grid search.
best_dtree3 = grid_dtree2.best_estimator_

# Export the tree in Graphviz DOT format for visualization.
# Note: this reuses (and overwrites) Student_tree2.dot.
export_graphviz(best_dtree3,
                out_file='Student_tree2.dot',
                class_names=['0', '1'],
                # Pass a plain list of names rather than a pandas Index; a list
                # is the form accepted by every scikit-learn release.
                feature_names=list(featureDF.columns),
                impurity=True,
                filled=True)


# Read the generated DOT file and render it.
# export_graphviz() writes the file as UTF-8, so read it with the same
# encoding instead of relying on the platform default.
with open('Student_tree2.dot', encoding='utf-8') as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)

In [None]:
# Predict on the held-out test split and build the confusion matrix.
pred = best_dtree3.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Passing the predictions first transposes the
# matrix and swaps false positives with false negatives.
confusion = confusion_matrix(y_test, pred)
confusion

In [None]:
# Deeper trees (max_depth 20..45 in steps of 5), still capped at 10 leaf
# nodes; evaluated with 20-fold cross-validation.
parameters2 = {
    'max_depth': list(range(20, 50, 5)),
    'max_leaf_nodes': [10],
    'min_samples_split': list(range(2, 10)),
}
grid_dtree2 = GridSearchCV(estimator=dt_clf, param_grid=parameters2, cv=20, refit=True)
grid_dtree2.fit(X_train, y_train)

# Per-candidate results; only the first three of the fold scores are shown.
dataDF3 = pd.DataFrame(grid_dtree2.cv_results_)
dataDF3.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

In [None]:
# Deeper trees (max_depth 20..45 in steps of 5) without the leaf-node cap;
# evaluated with 20-fold cross-validation.
parameters2 = {
    'max_depth': list(range(20, 50, 5)),
    'min_samples_split': list(range(2, 10)),
}
grid_dtree2 = GridSearchCV(estimator=dt_clf, param_grid=parameters2, cv=20, refit=True)
grid_dtree2.fit(X_train, y_train)

# Per-candidate results; only the first three of the fold scores are shown.
dataDF3 = pd.DataFrame(grid_dtree2.cv_results_)
dataDF3.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

In [None]:
import joblib

# Take the best estimator from the most recently fitted search (grid_dtree2).
best_model = grid_dtree2.best_estimator_

# Save the model to disk as DT_model.joblib.
joblib.dump(best_model, 'DT_model.joblib')