**Import The Library**

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle


**Load The Dataset**

In [67]:
df = pd.read_csv('student.csv')
df

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college
0,Academic,A,Male,Less Interested,Urban,56,6950000,83.0,84.09,False,True
1,Academic,A,Male,Less Interested,Urban,57,4410000,76.8,86.91,False,True
2,Academic,B,Female,Very Interested,Urban,50,6500000,80.6,87.43,False,True
3,Vocational,B,Male,Very Interested,Rural,49,6600000,78.2,82.12,True,True
4,Academic,A,Female,Very Interested,Urban,57,5250000,75.1,86.79,False,False
...,...,...,...,...,...,...,...,...,...,...,...
995,Vocational,A,Female,Very Interested,Rural,49,7420000,63.6,85.99,True,True
996,Academic,B,Female,Less Interested,Rural,51,7480000,84.3,89.72,True,True
997,Vocational,A,Male,Less Interested,Urban,49,5550000,75.2,79.56,False,True
998,Academic,B,Male,Uncertain,Rural,53,5840000,105.8,87.18,True,True


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type_school            1000 non-null   object 
 1   school_accreditation   1000 non-null   object 
 2   gender                 1000 non-null   object 
 3   interest               1000 non-null   object 
 4   residence              1000 non-null   object 
 5   parent_age             1000 non-null   int64  
 6   parent_salary          1000 non-null   int64  
 7   house_area             1000 non-null   float64
 8   average_grades         1000 non-null   float64
 9   parent_was_in_college  1000 non-null   bool   
 10  will_go_to_college     1000 non-null   bool   
dtypes: bool(2), float64(2), int64(2), object(5)
memory usage: 72.4+ KB


In [69]:
df.count()

type_school              1000
school_accreditation     1000
gender                   1000
interest                 1000
residence                1000
parent_age               1000
parent_salary            1000
house_area               1000
average_grades           1000
parent_was_in_college    1000
will_go_to_college       1000
dtype: int64

**Preprocessing The Data**

In [70]:
missing = pd.DataFrame({
    'Data Kosong': df.isnull().sum(),
    'Data Duplikat': df.duplicated().sum(),
    'Data NaNN': df.isna().sum(),
    'Type Data': df.dtypes})
missing

Unnamed: 0,Data Kosong,Data Duplikat,Data NaNN,Type Data
type_school,0,0,0,object
school_accreditation,0,0,0,object
gender,0,0,0,object
interest,0,0,0,object
residence,0,0,0,object
parent_age,0,0,0,int64
parent_salary,0,0,0,int64
house_area,0,0,0,float64
average_grades,0,0,0,float64
parent_was_in_college,0,0,0,bool


In [71]:
# Encoding The Categorical Value
df['type_school'] = LabelEncoder().fit_transform(df['type_school'])
df['school_accreditation'] = LabelEncoder().fit_transform(df['school_accreditation'])
df['gender'] = LabelEncoder().fit_transform(df['gender'])
df['interest'] = LabelEncoder().fit_transform(df['interest'])
df['residence'] = LabelEncoder().fit_transform(df['residence'])
df['parent_was_in_college'] = LabelEncoder().fit_transform(df['parent_was_in_college'])
df['will_go_to_college'] = LabelEncoder().fit_transform(df['will_go_to_college'])
df.head(10)

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college
0,0,0,1,1,1,56,6950000,83.0,84.09,0,1
1,0,0,1,1,1,57,4410000,76.8,86.91,0,1
2,0,1,0,4,1,50,6500000,80.6,87.43,0,1
3,1,1,1,4,0,49,6600000,78.2,82.12,1,1
4,0,0,0,4,1,57,5250000,75.1,86.79,0,0
5,1,1,0,1,0,48,3770000,65.3,86.79,1,0
6,0,0,1,4,0,52,6680000,85.5,90.39,1,1
7,0,1,1,4,0,53,5890000,83.3,84.65,1,0
8,0,1,0,3,0,52,6730000,80.3,88.5,1,1
9,0,1,0,4,0,47,3880000,68.0,85.43,1,0


In [72]:
df.describe()

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.391,0.519,0.515,2.48,0.539,52.208,5381570.0,74.5153,86.0972,0.52,0.5
std,0.488219,0.499889,0.500025,1.399127,0.498726,3.500427,1397546.0,15.293346,3.378738,0.49985,0.50025
min,0.0,0.0,0.0,0.0,0.0,40.0,1000000.0,20.0,75.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,50.0,4360000.0,64.6,83.7375,0.0,0.0
50%,0.0,1.0,1.0,3.0,1.0,52.0,5440000.0,75.5,85.575,1.0,0.5
75%,1.0,1.0,1.0,4.0,1.0,54.0,6382500.0,84.825,88.2625,1.0,1.0
max,1.0,1.0,1.0,4.0,1.0,65.0,10000000.0,120.0,98.0,1.0,1.0


In [73]:
df.corr()

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college
type_school,1.0,-0.020216,0.027206,0.062017,-0.245634,-0.108553,0.13498,-0.039654,-0.074534,0.146354,-0.034838
school_accreditation,-0.020216,1.0,-0.049198,0.004122,-0.263958,-0.165869,-0.02074,0.115532,0.159689,0.525279,-0.030022
gender,0.027206,-0.049198,1.0,-0.013164,0.009694,0.090292,-0.017202,-0.008192,0.027878,-0.059274,-0.034015
interest,0.062017,0.004122,-0.013164,1.0,-0.243472,-0.16777,0.170154,0.28515,0.15069,0.140842,0.247421
residence,-0.245634,-0.263958,0.009694,-0.243472,1.0,0.527456,-0.297498,-0.241503,-0.218412,-0.619502,0.006018
parent_age,-0.108553,-0.165869,0.090292,-0.16777,0.527456,1.0,-0.286418,0.10706,-0.006077,-0.438322,0.042873
parent_salary,0.13498,-0.02074,-0.017202,0.170154,-0.297498,-0.286418,1.0,0.138357,0.306712,0.106487,0.475693
house_area,-0.039654,0.115532,-0.008192,0.28515,-0.241503,0.10706,0.138357,1.0,0.409565,0.145186,0.467267
average_grades,-0.074534,0.159689,0.027878,0.15069,-0.218412,-0.006077,0.306712,0.409565,1.0,0.236713,0.505133
parent_was_in_college,0.146354,0.525279,-0.059274,0.140842,-0.619502,-0.438322,0.106487,0.145186,0.236713,1.0,0.048038


In [74]:
df['type_school'] = df['type_school'].astype('category')
df['school_accreditation'] = df['school_accreditation'].astype('category')
df['gender'] = df['gender'].astype('category')
df['interest'] = df['interest'].astype('category')
df['residence'] = df['residence'].astype('category')
df['parent_was_in_college'] = df['parent_was_in_college'].astype('category')
df['will_go_to_college'] = df['will_go_to_college'].astype('category')

In [75]:
# Feature Selection
X = df.drop(columns='will_go_to_college')
y = df.will_go_to_college

**Standardized The Data**

In [76]:
columns = ['parent_salary','house_area','parent_age','average_grades']
scaler = MinMaxScaler()
X[columns] = scaler.fit_transform(X[columns])

**Splitting Training and Testing Data**

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((900, 10), (100, 10), (900,), (100,))

**Train the Model**

In [78]:
model = RandomForestClassifier()

In [79]:
model.fit(X_train, y_train)

**Evaluate The Model**

In [80]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [81]:
print('Akurasi data training adalah :', training_data_accuracy)

Akurasi data training adalah : 1.0


In [82]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)

In [83]:
print('Akurasi data testing :', test_data_accuracy)

Akurasi data testing : 0.85


In [84]:
print(classification_report(X_test_prediction, y_test))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84        47
           1       0.87      0.85      0.86        53

    accuracy                           0.85       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.85      0.85      0.85       100



In [85]:
cm = confusion_matrix(X_test_prediction, y_test)
cm

array([[40,  7],
       [ 8, 45]], dtype=int64)

**Build the Predictive Model**

**Save The Model**

In [88]:
pickle.dump(model, open("klasifikasi.pkl","wb"))

In [89]:
pickle.dump(scaler, open("scaler.pkl","wb"))