# 1. IMPORT LIBRARIES AND DATAFRAME

Import libraries

In [None]:
import pandas as pd
import numpy as np

Import dataframe

In [None]:
# Load the raw dataset from a semicolon-delimited CSV file.
# NOTE(review): the absolute path looks like a mounted Google Drive folder
# (Colab) — confirm the drive is mounted before running this cell.
data = pd.read_csv('/content/drive/MyDrive/Data/data/data.csv', sep=';')

# 2. PREPARING DATA FOR LOGISTIC REGRESSION

In [None]:
# Work on a copy so the raw `data` frame stays unmodified; section 4 copies
# it again for the three-class setup.
df = data.copy()

In [None]:
def encode_target(value: str) -> int:
    """Map a raw target label to a binary class code.

    Returns 0 when the label is "Enrolled" or "Dropout" and 1 for every
    other value.
    """
    return 0 if value in ("Enrolled", "Dropout") else 1

In [None]:
# Replace each string label in the Target column with its binary code.
# NOTE: not idempotent — running this cell a second time feeds the integer
# codes back into encode_target, which returns 1 for all of them.
df["Target"] = df["Target"].map(encode_target)

In [None]:
# Show how many rows fall into each binary class after encoding.
print(df["Target"].value_counts())

Target
0    2215
1    2209
Name: count, dtype: int64


In [None]:
# Columns that receive z-score scaling in this cell.
continous_columns = [
    "Previous qualification (grade)",
    "Admission grade",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]
# Standardize each listed column independently: subtract its own mean and
# divide by its own standard deviation.
df[continous_columns] = df[continous_columns].apply(
    lambda series: (series - series.mean()) / series.std()
)

In [None]:
# Z-score every column of the frame using its column-wise mean and standard
# deviation.
# NOTE(review): the statistics are computed over all rows before the
# train/test split, so test rows influence the scaling — consider fitting the
# scaler on the training split only. The Target column is scaled here too.
normalized_df = df.sub(df.mean(), axis="columns").div(df.std(), axis="columns")

In [None]:
# Put the unscaled class codes back: the whole-frame standardization also
# rescaled the Target column, which must stay as integer labels.
normalized_df['Target'] = df['Target']

In [None]:
# Feature matrix: every column except the last one (the last column is the
# Target label selected separately below).
X = normalized_df.iloc[:, :-1]

In [None]:
# Convert the feature frame to a plain NumPy array.
X = X.to_numpy()
# Notebook display of the array dimensions (rows, features).
X.shape

(4424, 36)

In [None]:
# Label vector: the last column of the frame.
y = normalized_df.iloc[:, -1]
# Notebook display of the class distribution as a sanity check.
y.value_counts()

Target
0    2215
1    2209
Name: count, dtype: int64

In [None]:
# Convert the labels to a NumPy column vector of shape (n_samples, 1); the
# model-fitting cell flattens it again with ravel().
y = y.to_numpy().reshape(-1, 1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Notebook display of the shapes of the four split arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3539, 36), (885, 36), (3539, 1), (885, 1))

# 3. BUILDING LOGISTIC REGRESSION MODEL

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

# Fit a binary logistic-regression classifier with scikit-learn's default
# settings. fit() returns the estimator itself, so construction and training
# are chained; ravel() flattens the (n, 1) label column into a 1-D array.
model = LogisticRegression().fit(X_train, y_train.ravel())

# Class probabilities feed the log loss; hard predictions feed the accuracy.
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)

test_loss = log_loss(y_test, y_pred_proba)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of test: {test_accuracy}")
print(f"Loss of test: {test_loss}")

Accuracy of test: 0.8282485875706215
Loss of test: 0.36924179891051245


In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the held-out test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.78      0.83       467
           1       0.78      0.88      0.83       418

    accuracy                           0.83       885
   macro avg       0.83      0.83      0.83       885
weighted avg       0.83      0.83      0.83       885



# 4. PREPARING DATA FOR SOFTMAX REGRESSION

In [None]:
# Start again from a fresh copy of the raw data so the three-class
# preparation does not see the binary encoding applied to `df`.
df1 = data.copy()

In [None]:
def encode_target(value: str) -> int:
    """Map a raw target label to one of three class codes.

    "Enrolled" maps to 0, "Dropout" maps to 1, and every other value maps
    to 2.
    """
    # The position of a label in this tuple is its class code.
    for code, label in enumerate(("Enrolled", "Dropout")):
        if value == label:
            return code
    return 2

In [None]:
# Replace each string label in the Target column with its three-class code.
# NOTE: not idempotent — running this cell a second time feeds the integer
# codes back into encode_target, which returns 2 for all of them.
df1["Target"] = df1["Target"].map(encode_target)

In [None]:
# Show how many rows fall into each of the three classes after encoding.
print(df1["Target"].value_counts())

Target
2    2209
1    1421
0     794
Name: count, dtype: int64


In [None]:
# Columns that receive z-score scaling in this cell.
continous_columns = [
    "Previous qualification (grade)",
    "Admission grade",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]
# Standardize each listed column independently: subtract its own mean and
# divide by its own standard deviation.
df1[continous_columns] = df1[continous_columns].apply(
    lambda series: (series - series.mean()) / series.std()
)

In [None]:
# Z-score every column of the frame using its column-wise mean and standard
# deviation.
# NOTE(review): the statistics are computed over all rows before the
# train/test split, so test rows influence the scaling — consider fitting the
# scaler on the training split only. The Target column is scaled here too.
normalized_df1 = df1.sub(df1.mean(), axis="columns").div(df1.std(), axis="columns")

In [None]:
# Put the unscaled class codes back: the whole-frame standardization also
# rescaled the Target column, which must stay as integer labels.
normalized_df1['Target'] = df1['Target']

In [None]:
# Feature matrix: every column except the last one (the last column is the
# Target label selected separately below).
X = normalized_df1.iloc[:, :-1]

In [None]:
# Convert the feature frame to a plain NumPy array.
X = X.to_numpy()
# Notebook display of the array dimensions (rows, features).
X.shape

(4424, 36)

In [None]:
# Label vector: the last column of the frame.
y = normalized_df1.iloc[:, -1]
# Notebook display of the three-class distribution as a sanity check.
y.value_counts()

Target
2    2209
1    1421
0     794
Name: count, dtype: int64

In [None]:
# Convert the labels to a NumPy column vector of shape (n_samples, 1); the
# model-fitting cell flattens it again with ravel().
y = y.to_numpy().reshape(-1, 1)

In [None]:
# Same 80/20 split and seed as the binary experiment, now applied to the
# three-class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Notebook display of the shapes of the four split arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3539, 36), (885, 36), (3539, 1), (885, 1))

# 5. BUILDING SOFTMAX REGRESSION MODEL

In [None]:
# Softmax (multinomial) logistic regression over the three target classes.
#
# The earlier configuration, multi_class='ovr' with solver='liblinear',
# trained one independent binary classifier per class (one-vs-rest) instead
# of a single softmax model, and the `multi_class` argument is deprecated in
# recent scikit-learn releases. With the lbfgs solver and the default
# multi_class setting, scikit-learn fits a multinomial model when there are
# more than two classes. max_iter is raised above the default of 100 to give
# lbfgs room to converge.
#
# NOTE: the metrics recorded below this cell were produced by the earlier
# one-vs-rest model; re-run the cell to refresh them.
model = LogisticRegression(solver='lbfgs', max_iter=1000)

# ravel() flattens the (n, 1) label column into a 1-D array.
model.fit(X_train, y_train.ravel())

# Hard predictions feed the accuracy; class probabilities feed the log loss.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_loss = log_loss(y_test, y_pred_proba)
print(f"Accuracy of test: {test_accuracy}")
print(f"Loss of test: {test_loss}")

Accuracy of test: 0.7615819209039548
Loss of test: 0.6154934253703702


In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the held-out test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.22      0.30       151
           1       0.83      0.78      0.81       316
           2       0.76      0.94      0.84       418

    accuracy                           0.76       885
   macro avg       0.69      0.65      0.65       885
weighted avg       0.74      0.76      0.74       885

