# Turkiye Student Evaluation

Dataset is from https://archive.ics.uci.edu/dataset/262/turkiye+student+evaluation

n = 5820

28 features

In [7]:
import pandas as pd

In [8]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/turkiye-student-evaluation_generic.csv"
df = pd.read_csv(url)

In [9]:
df.shape

(5820, 33)

In [10]:
print(df.head())

   instr  class  nb.repeat  attendance  difficulty  Q1  Q2  Q3  Q4  Q5  ...  \
0      1      2          1           0           4   3   3   3   3   3  ...   
1      1      2          1           1           3   3   3   3   3   3  ...   
2      1      2          1           2           4   5   5   5   5   5  ...   
3      1      2          1           1           3   3   3   3   3   3  ...   
4      1      2          1           0           1   1   1   1   1   1  ...   

   Q19  Q20  Q21  Q22  Q23  Q24  Q25  Q26  Q27  Q28  
0    3    3    3    3    3    3    3    3    3    3  
1    3    3    3    3    3    3    3    3    3    3  
2    5    5    5    5    5    5    5    5    5    5  
3    3    3    3    3    3    3    3    3    3    3  
4    1    1    1    1    1    1    1    1    1    1  

[5 rows x 33 columns]


In [11]:
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [12]:
X = df.drop("difficulty", axis=1)
y = df["difficulty"]

In [13]:
X = X.drop("instr", axis=1)
X = X.drop("class", axis=1)
X = X.drop("nb.repeat", axis=1)
X = X.drop("attendance", axis=1)

In [14]:
X.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q19,Q20,Q21,Q22,Q23,Q24,Q25,Q26,Q27,Q28
0,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
1,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
3,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [15]:
y.value_counts()

3    1774
1    1620
4    1225
5     652
2     549
Name: difficulty, dtype: int64

In [16]:
y = y.replace({2 : 0, 1 : 0, 3: 1, 4 : 1, 5 : 1})
y.value_counts()

1    3651
0    2169
Name: difficulty, dtype: int64

## Logistic Regression

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create and train the logistic regression model
#model = LogisticRegression()
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Make predictions on the test set
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Calculate accuracy
accuracy = accuracy_score(y_train, y_train_pred)
print("Test Accuracy:", accuracy)
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6466208476517755
Test Accuracy: 0.6559278350515464


# Divide and Conquer

Divide the training data into 11 batches, train a logistic model on each of the batch, and then combine the 11 prediction results. Consider the following two ensemble methods:
- majority voting
- average (or sum) of the logit output and then make decision based on its sign

In [18]:
def iterate_process(X, y):
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    
    # Define the number of batches
    num_batches = 11
    
    # Randomly shuffle the data indices
    #indices = np.random.permutation(len(X))
    
    #change 7/12
    indices = np.random.permutation(len(X_train))
    
    # Calculate the batch size
    batch_size = len(X_train) // num_batches
    
    # Make predictions on the test set using majority voting
    preds_voting = np.zeros(len(y_test))
    # Make predictions on the test set using average of logit
    preds_logit = np.zeros(len(y_test))
    #Make predictions on the test set using average of probs
    preds_prob = np.zeros(len(y_test))

    
    # Split the training data into batches, fit a logistic regression model on each batch
    for i in range(num_batches):
        # Calculate the starting and ending indices for the current batch
        start_index = i * batch_size
        end_index = (i + 1) * batch_size
        
        # Create a logistic regression model
        model = LogisticRegression(max_iter=500)
        
        # Select the current batch for training
        X_batch = X_train.iloc[indices[start_index:end_index]]
        y_batch = y_train.iloc[indices[start_index:end_index]]
        
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)
        
        # Fit the model on the current batch
        model.fit(X_batch_scaled, y_batch)
               
        y_pred = model.predict(X_test_scaled)
        # Accumulate the predictions using majority voting
        preds_voting += (y_pred == 1)
    
        # Accumulate the predictions using majority voting
        y_pred = model.decision_function(X_test_scaled)
        preds_logit += y_pred
        
        #Accumulate the probs
        y_pred = model.predict_proba(X_test_scaled)
        preds_prob += y_pred[:,1]
   
    
    accuracy = np.zeros(4)
    
    # Majority voting (selecting the most frequent prediction for each sample)
    final_predictions = np.where(preds_voting > num_batches / 2, 1, 0)
    accuracy[0] = accuracy_score(y_test, final_predictions)
    # Average of logit
    final_predictions = np.where(preds_logit > 0, 1, 0)
    accuracy[1] = accuracy_score(y_test, final_predictions)
    #Average of probs
    final_predictions = np.where(preds_prob / num_batches > 0.5, 1, 0)
    accuracy[2] = accuracy_score(y_test, final_predictions)
    
    # Train a model on all 11 batches of training data
    model = LogisticRegression(max_iter=500)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy[3] = accuracy_score(y_test, y_pred)
    
    return accuracy

Try the divide and conquer approaches 10 times, reporting the following error matrix. 
- 10-by-4
- col_1: majority voting
- col_2: average the logit
- col_3: average probabilities
- col_4: using the model trained on all the training data

In [19]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 4))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i] = iterate_process(X, y)
    seed += 2

# Print the accuracies
print("Accuracies:", accuracies)

Accuracies: [[0.65120275 0.65936426 0.66065292 0.65635739]
 [0.63530928 0.64261168 0.64390034 0.64046392]
 [0.64948454 0.65120275 0.64905498 0.6516323 ]
 [0.64905498 0.65506873 0.6580756  0.65292096]
 [0.64948454 0.65034364 0.64905498 0.645189  ]
 [0.64733677 0.64089347 0.64175258 0.64690722]
 [0.64862543 0.6507732  0.65120275 0.64905498]
 [0.64304124 0.64604811 0.64561856 0.64261168]
 [0.63487973 0.63530928 0.63402062 0.63745704]
 [0.63402062 0.63487973 0.63487973 0.63831615]
 [0.6516323  0.6507732  0.64991409 0.64733677]
 [0.66237113 0.66022337 0.66108247 0.66022337]
 [0.63702749 0.64089347 0.64003436 0.64347079]
 [0.63702749 0.63745704 0.63831615 0.63616838]
 [0.64003436 0.64647766 0.64733677 0.64304124]
 [0.64733677 0.64690722 0.64862543 0.64905498]
 [0.62757732 0.62886598 0.62886598 0.62542955]
 [0.64475945 0.64132302 0.64046392 0.64304124]
 [0.63616838 0.64089347 0.64046392 0.63616838]
 [0.63659794 0.64046392 0.64089347 0.6378866 ]
 [0.64819588 0.64390034 0.64561856 0.64604811]
 

In [20]:
np.mean(accuracies, axis = 0)

array([0.64300687, 0.64402062, 0.64416667, 0.6432732 ])

In [21]:
np.std(accuracies, axis = 0)

array([0.0087261 , 0.00829162, 0.00840252, 0.00864426])