In [1]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report, confusion_matrix

In [2]:
df= pd.read_csv(r"C:\Users\HandePC\Desktop\data.csv", sep=";")

In [3]:
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [4]:
df.rename(columns = {'Nacionality':'Nationality', 'Age at enrollment':'Age'}, inplace = True)

In [5]:
df['Target'] = df['Target'].map({
    'Dropout':0,
    'Enrolled':1,
    'Graduate':2
})

In [6]:
Xcategorical_columns = df[["Marital status", "Application mode", "Course", "Daytime/evening attendance\t",
        "Previous qualification", "Previous qualification (grade)", "Nationality", "Mother's qualification",
        "Father's qualification", "Mother's occupation", "Father's occupation", "Admission grade", "Displaced",
        "Educational special needs", "Debtor", "Tuition fees up to date", "Gender", "Scholarship holder",
         "International"]]

Xnum= df[["Application order","Age", "Curricular units 1st sem (credited)","Curricular units 1st sem (enrolled)", "Curricular units 1st sem (evaluations)",
        "Curricular units 1st sem (approved)", "Curricular units 1st sem (grade)",
        "Curricular units 1st sem (without evaluations)", "Curricular units 2nd sem (credited)",
        "Curricular units 2nd sem (enrolled)", "Curricular units 2nd sem (evaluations)",
        "Curricular units 2nd sem (approved)", "Curricular units 2nd sem (grade)",
        "Curricular units 2nd sem (without evaluations)", "Unemployment rate", "Inflation rate", "GDP"]]
y = df['Target'] 


In [7]:
correlation_matrix = Xnum.corr()

# korelasyon katsayısını belirledik
high_correlation = (correlation_matrix > 0.6) & (correlation_matrix < 1.0)

# Koralasyona göre numeric verileri drop etme
columns_to_drop = [column for column in high_correlation.columns if any(high_correlation[column])]
Xnum_dropped = Xnum.drop(columns=columns_to_drop)

In [8]:
def calculate_cramer_v_matrix(dataframe, categorical_columns):
    num_cat_columns = len(categorical_columns)
    cramer_v_matrix = np.zeros((num_cat_columns, num_cat_columns))

    for i in range(num_cat_columns):
        for j in range(i + 1, num_cat_columns):
            cat_variable1 = dataframe[categorical_columns[i]]
            cat_variable2 = dataframe[categorical_columns[j]]

            contingency_table = pd.crosstab(cat_variable1, cat_variable2)

            if contingency_table.shape[0] == 0 or contingency_table.shape[1] == 0:
                continue

            chi2, _, _, _ = chi2_contingency(contingency_table)

            n = sum(contingency_table.sum())
            min_val = min(len(contingency_table) - 1, len(contingency_table.columns) - 1)
            cramer_v = np.sqrt(chi2 / (n * min_val))

            cramer_v_matrix[i, j] = cramer_v
            cramer_v_matrix[j, i] = cramer_v

    return cramer_v_matrix

# Dataframe üzerinde Cramer's V katsayıları
cramer_v_matrix = calculate_cramer_v_matrix(df, Xcategorical_columns.columns)

cramer_v_df = pd.DataFrame(cramer_v_matrix, index=Xcategorical_columns.columns, columns=Xcategorical_columns.columns)

def drop_high_correlation(matrix, threshold, categorical_columns):
    upper_triangle = np.triu(matrix, k=1)

    # Eşik değerden yüksek korelasyona sahip hücreleri bul
    high_correlation_cells = np.where(upper_triangle > threshold)

    # Korelasyon değeri eşik değerinden yüksek olan değişken çiftlerini bul
    high_correlation_pairs = [(matrix.columns[i], matrix.columns[j]) for i, j in zip(*high_correlation_cells)]

    for variable1, variable2 in high_correlation_pairs:
        if variable1 not in matrix.columns or variable2 not in matrix.columns:
            continue

        # Drop işlemi
        if matrix[variable1].sum() >= matrix[variable2].sum():
            matrix = matrix.drop([variable1], axis=1)
            print(f"Dropped variable: {variable1}")
            categorical_columns.remove(variable1)

    return categorical_columns  # Güncellenmiş liste

# drop_high_correlation fonksiyonunu çağırırken Xcategorical_columns.columns kullanılabilir
categorical_columns = drop_high_correlation(cramer_v_df, threshold=0.3, categorical_columns=list(Xcategorical_columns.columns))

# Kalan kategorik değişkenleri yazdır
remaining_categorical_columns = categorical_columns
print("Remaining Categorical Columns:", remaining_categorical_columns)

Dropped variable: Application mode
Dropped variable: Course
Dropped variable: Previous qualification (grade)
Dropped variable: Mother's qualification
Dropped variable: Mother's occupation
Dropped variable: Admission grade
Remaining Categorical Columns: ['Marital status', 'Daytime/evening attendance\t', 'Previous qualification', 'Nationality', "Father's qualification", "Father's occupation", 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International']


In [9]:
df["Target"] = df["Target"].apply(lambda x: x.split()[0] if isinstance(x, str) else x)

# Sütun adlarını birleştirme
selected_columns = remaining_categorical_columns + list(map(str, Xnum_dropped.columns)) + ["Target"]
df_updated = df[selected_columns].copy()

In [10]:
df_updated.drop(df[df['Target'] == 1].index, inplace = True)
df_updated

Unnamed: 0,Marital status,Daytime/evening attendance\t,Previous qualification,Nationality,Father's qualification,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,...,Scholarship holder,International,Application order,Age,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,1,1,1,12,9,1,0,0,1,...,0,0,5,20,0,0,10.8,1.4,1.74,0
1,1,1,1,1,3,3,1,0,0,0,...,0,0,1,19,0,0,13.9,-0.3,0.79,2
2,1,1,1,1,37,9,1,0,0,0,...,0,0,5,19,0,0,10.8,1.4,1.74,0
3,1,1,1,1,37,3,1,0,0,1,...,0,0,2,20,0,0,9.4,-0.8,-3.12,2
4,2,0,1,1,38,9,0,0,0,1,...,0,0,1,45,0,0,13.9,-0.3,0.79,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,1,1,1,4,0,0,0,1,...,0,0,6,19,0,0,15.5,2.8,-4.06,2
4420,1,1,1,105,1,9,1,0,1,0,...,0,1,2,18,0,0,11.1,0.6,2.02,0
4421,1,1,1,1,37,9,1,0,0,1,...,1,0,1,30,0,0,13.9,-0.3,0.79,0
4422,1,1,1,1,37,4,1,0,0,1,...,1,0,1,20,0,0,9.4,-0.8,-3.12,2


In [11]:
df_updated['Dropout'] = df['Target'].apply(lambda x: 1 if x==0 else 0)
df_updated

Unnamed: 0,Marital status,Daytime/evening attendance\t,Previous qualification,Nationality,Father's qualification,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,...,International,Application order,Age,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Dropout
0,1,1,1,1,12,9,1,0,0,1,...,0,5,20,0,0,10.8,1.4,1.74,0,1
1,1,1,1,1,3,3,1,0,0,0,...,0,1,19,0,0,13.9,-0.3,0.79,2,0
2,1,1,1,1,37,9,1,0,0,0,...,0,5,19,0,0,10.8,1.4,1.74,0,1
3,1,1,1,1,37,3,1,0,0,1,...,0,2,20,0,0,9.4,-0.8,-3.12,2,0
4,2,0,1,1,38,9,0,0,0,1,...,0,1,45,0,0,13.9,-0.3,0.79,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,1,1,1,4,0,0,0,1,...,0,6,19,0,0,15.5,2.8,-4.06,2,0
4420,1,1,1,105,1,9,1,0,1,0,...,1,2,18,0,0,11.1,0.6,2.02,0,1
4421,1,1,1,1,37,9,1,0,0,1,...,0,1,30,0,0,13.9,-0.3,0.79,0,1
4422,1,1,1,1,37,4,1,0,0,1,...,0,1,20,0,0,9.4,-0.8,-3.12,2,0


In [12]:
x = df_updated.iloc[:, :19].values
#x = df[["Tuition fees up to date","Curricular units 1st sem (approved)","Curricular units 1st sem (grade)","Curricular units 2nd sem (approved)","Curricular units 2nd sem (grade)"]].values
print(x)
x = StandardScaler().fit_transform(x)
x

[[ 1.   1.   1.  ...  0.  10.8  1.4]
 [ 1.   1.   1.  ...  0.  13.9 -0.3]
 [ 1.   1.   1.  ...  0.  10.8  1.4]
 ...
 [ 1.   1.   1.  ...  0.  13.9 -0.3]
 [ 1.   1.   1.  ...  0.   9.4 -0.8]
 [ 1.   1.   1.  ...  0.  12.7  3.7]]


array([[-0.30068558,  0.35585028, -0.35242129, ..., -0.19014841,
        -0.31131218,  0.12161459],
       [-0.30068558,  0.35585028, -0.35242129, ..., -0.19014841,
         0.85091858, -1.10607007],
       [-0.30068558,  0.35585028, -0.35242129, ..., -0.19014841,
        -0.31131218,  0.12161459],
       ...,
       [-0.30068558,  0.35585028, -0.35242129, ..., -0.19014841,
         0.85091858, -1.10607007],
       [-0.30068558,  0.35585028, -0.35242129, ..., -0.19014841,
        -0.83619059, -1.46715379],
       [-0.30068558,  0.35585028, -0.35242129, ..., -0.19014841,
         0.4010228 ,  1.78259971]])

In [13]:
y = df_updated['Dropout'].values
y

array([1, 0, 1, ..., 1, 0, 0], dtype=int64)

In [14]:
print("X shape after fix:", x.shape)
print("y shape after fix:", y.shape)

X shape after fix: (3630, 19)
y shape after fix: (3630,)


In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123)


In [55]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, classification_report
from time import time

def perform(model, x_train, y_train, x_test, y_test):
    # Training
    start_time_train = time()
    model.fit(x_train, y_train)
    end_time_train = time()

    training_time = end_time_train - start_time_train
    print("Training Time: {:.4f} seconds".format(training_time))

    # Testing
    start_time_test = time()
    y_pred = model.predict(x_test)
    end_time_test = time()

    testing_time = end_time_test - start_time_test
    print("Testing Time: {:.4f} seconds".format(testing_time))

    # Evaluation Metrics
    print("Precision : ", precision_score(y_test, y_pred, average='micro'))
    print("Recall : ", recall_score(y_test, y_pred, average='micro'))
    print("Accuracy : ", accuracy_score(y_test, y_pred))
    print("F1 Score : ", f1_score(y_test, y_pred, average='micro'))
    cm = confusion_matrix(y_test, y_pred)
    print("\n", cm)
    print("\n")
    print("**" * 27 + "\n" + " " * 16 + "Classification Report\n" + "**" * 27)
    print(classification_report(y_test, y_pred))
    print("**" * 27 + "\n")


In [60]:
model_nb = GaussianNB()
model_nb.fit(x_train, y_train)

In [61]:
y_pred_nb = model_nb.predict(x_test)

In [62]:
perform(model_nb, x_train, y_train, x_test, y_test)

Training Time: 0.0030 seconds
Testing Time: 0.0020 seconds
Precision :  0.7672176308539945
Recall :  0.7672176308539945
Accuracy :  0.7672176308539945
F1 Score :  0.7672176308539945

 [[407  56]
 [113 150]]


******************************************************
                Classification Report
******************************************************
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       463
           1       0.73      0.57      0.64       263

    accuracy                           0.77       726
   macro avg       0.76      0.72      0.73       726
weighted avg       0.76      0.77      0.76       726

******************************************************



In [65]:
model_rf = RandomForestClassifier(n_estimators=100,criterion='entropy')
model_rf.fit(x_train, y_train)


In [66]:
y_pred_rf = model_rf.predict(x_test)

In [67]:
perform(model_rf, x_train, y_train, x_test, y_test)

Training Time: 0.4907 seconds
Testing Time: 0.0200 seconds
Precision :  0.7575757575757576
Recall :  0.7575757575757576
Accuracy :  0.7575757575757576
F1 Score :  0.7575757575757576

 [[379  84]
 [ 92 171]]


******************************************************
                Classification Report
******************************************************
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       463
           1       0.67      0.65      0.66       263

    accuracy                           0.76       726
   macro avg       0.74      0.73      0.74       726
weighted avg       0.76      0.76      0.76       726

******************************************************



In [68]:
model_svc = SVC(C=0.1,kernel='linear')
model_svc.fit(x_train, y_train)

In [69]:
y_pred_svc = model_svc.predict(x_test)

In [70]:
perform(model_svc ,x_train, y_train, x_test, y_test)

Training Time: 0.1938 seconds
Testing Time: 0.0240 seconds
Precision :  0.7920110192837465
Recall :  0.7920110192837465
Accuracy :  0.7920110192837465
F1 Score :  0.7920110192837465

 [[426  37]
 [114 149]]


******************************************************
                Classification Report
******************************************************
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       463
           1       0.80      0.57      0.66       263

    accuracy                           0.79       726
   macro avg       0.79      0.74      0.76       726
weighted avg       0.79      0.79      0.78       726

******************************************************



In [44]:
from sklearn.ensemble import GradientBoostingClassifier

model_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)

model_gb.fit(x_train, y_train)


In [45]:
y_pred_gb = model_gb.predict(x_test)

In [71]:
perform(model_gb,x_train, y_train, x_test, y_test)

Training Time: 0.2654 seconds
Testing Time: 0.0020 seconds
Precision :  0.7796143250688705
Recall :  0.7796143250688705
Accuracy :  0.7796143250688705
F1 Score :  0.7796143250688705

 [[397  66]
 [ 94 169]]


******************************************************
                Classification Report
******************************************************
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       463
           1       0.72      0.64      0.68       263

    accuracy                           0.78       726
   macro avg       0.76      0.75      0.76       726
weighted avg       0.78      0.78      0.78       726

******************************************************



In [72]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier(max_depth=None, random_state=123)

model_dt.fit(x_train, y_train)


In [73]:
y_pred_dt = model_dt.predict(x_test)

In [74]:
perform(model_dt,x_train, y_train, x_test, y_test)

Training Time: 0.0195 seconds
Testing Time: 0.0010 seconds
Precision :  0.6556473829201102
Recall :  0.6556473829201102
Accuracy :  0.6556473829201102
F1 Score :  0.6556473829201102

 [[317 146]
 [104 159]]


******************************************************
                Classification Report
******************************************************
              precision    recall  f1-score   support

           0       0.75      0.68      0.72       463
           1       0.52      0.60      0.56       263

    accuracy                           0.66       726
   macro avg       0.64      0.64      0.64       726
weighted avg       0.67      0.66      0.66       726

******************************************************

