In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

# Load the data with ';' delimiter
file_path = '../data/data.csv'
data = pd.read_csv(file_path, delimiter=';')

# Define functions for transformations and normalization

def convert_to_float(column):
    if column.dtype == 'object':
        return column.str.replace(',', '.').astype(float)
    else:
        return column  # If already numeric, return as is

def min_max_scale(column):
    scaler = MinMaxScaler()
    return scaler.fit_transform(column.values.reshape(-1, 1))

def z_score_standardize(column):
    scaler = StandardScaler()
    return scaler.fit_transform(column.values.reshape(-1, 1))

def log_transform(column):
    return np.log1p(column)

def percentage_to_ratio(column):
    if column.dtype == 'object':
        return column.str.replace(',', '.').str.replace('%', '').astype(float) / 100
    else:
        return column / 100

# Convert columns with commas to floats if necessary
data['Internal Expenditures on R&D'] = convert_to_float(data['Internal Expenditures on R&D'])
data['External Expenditures on R&D'] = convert_to_float(data['External Expenditures on R&D'])
data['Current Revenues'] = convert_to_float(data['Current Revenues'])
data['Current Expenditures'] = convert_to_float(data['Current Expenditures'])

# Log transform first, then apply Min-Max Scaling for large-magnitude columns
data['Internal Expenditures on R&D'] = min_max_scale(log_transform(data[['Internal Expenditures on R&D']]))
data['External Expenditures on R&D'] = min_max_scale(log_transform(data[['External Expenditures on R&D']]))
data['Current Revenues'] = min_max_scale(log_transform(data[['Current Revenues']]))
data['Current Expenditures'] = min_max_scale(log_transform(data[['Current Expenditures']]))

# Apply Min-Max Scaling for student population as well
data['Number of Students'] = min_max_scale(data[['Number of Students']])

# Convert Innovation Activity Level and Student Seat Availability to ratios
data['Innovation Activity Level, %'] = percentage_to_ratio(data['Innovation Activity Level, %'])
data['Student Seat Availability'] = percentage_to_ratio(data['Student Seat Availability'])

# Z-score standardize Innovation Activity Level and Student Seat Availability
data['Innovation Activity Level, %'] = z_score_standardize(data[['Innovation Activity Level, %']])
data['Student Seat Availability'] = z_score_standardize(data[['Student Seat Availability']])

data

Unnamed: 0,Region/City,Number of Students,Internal Expenditures on R&D,External Expenditures on R&D,"Innovation Activity Level, %",Current Revenues,Current Expenditures,Institutions,Teachers,Student Seat Availability
0,Abay,0.014625,0.672628,0.0,-0.418653,0.259665,0.256543,36,12800,0.190291
1,Akmola,0.009492,0.592692,0.454711,-1.001069,0.312471,0.309378,32,16500,-0.704391
2,Aktobe,0.032167,0.584686,0.685379,1.078988,0.341473,0.344694,39,20200,1.13908
3,Almaty,0.008789,0.5531,0.298114,-1.195207,0.40584,0.404252,39,31900,-0.126616
4,Atyrau,0.00914,0.466983,0.420712,-0.80693,0.291643,0.281555,25,12900,-0.779753
5,West Kazakhstan,0.035683,0.539299,0.517277,-1.361612,0.301367,0.289669,33,15900,0.963235
6,Zhambyl,0.027773,0.661913,0.143129,-1.167473,0.39683,0.397005,41,28300,0.961303
7,Zhetisu,0.021515,0.34216,0.503845,0.330168,0.270707,0.271437,22,15900,1.241495
8,Karagandy,0.060819,0.700827,0.53768,1.522734,0.373525,0.370043,47,17600,-0.331446
9,Kostanay,0.020742,0.543276,0.541361,-0.113578,0.299202,0.298698,34,14300,-2.279264


In [None]:
import pandas as pd

# load data from outer folder called data, retrieve data.csv
data = pd.read_csv('data.csv', delimiter=';')
data

Unnamed: 0,Region/City,Number of Students,Internal Expenditures on R&D,External Expenditures on R&D,"Innovation Activity Level, %",Current Revenues,Current Expenditures,Institutions,Teachers,Student Seat Availability
0,Abay,14920,39966548,100597.0,"8,1%",137877983,135552521,36,12800,"89,56%"
1,Akmola,12000,17360504,13670769.0,"6,0%",177064880,174108146,32,16500,"84,93%"
2,Aktobe,24900,15969484,51362995.0,"13,5%",203141605,205817860,39,20200,"94,47%"
3,Almaty,11600,1148681,556583.0,"5,3%",275562056,272914067,39,31900,"87,92%"
4,Atyrau,11800,4678045,11247709.0,"6,7%",160428756,152605460,25,12900,"84,54%"
5,West Kazakhstan,26900,9946607,195757246.0,"4,7%",167992206,158586075,33,15900,"93,56%"
6,Zhambyl,22400,35740087,228710.0,"5,4%",264048641,263703068,41,28300,"93,55%"
7,Zhetisu,18840,1272258,1812348.3,"10,8%",145282228,145462766,22,15900,"95,00%"
8,Karagandy,41200,53634991,22007162.0,"15,1%",236449621,232081637,47,17600,"86,86%"
9,Kostanay,18400,10367943,22477035.0,"9,2%",166278021,165516942,34,14300,"76,78%"


### 1. **Educational Resources**: Teachers, Institutions, and Student Seat Availability

#### Argumentation:
- **Teacher Quality and Availability**: Numerous studies indicate that teacher quality and the student-to-teacher ratio are primary drivers of educational outcomes. Smaller class sizes allow for more individualized instruction, which positively impacts student performance and retention (OECD, 2020). Well-qualified teachers contribute to higher student achievement, as teacher knowledge and experience are crucial for effective learning.
  
- **Institutional Availability**: The presence of sufficient educational institutions ensures that education is accessible to the population. Regions with more schools and colleges per capita can better serve diverse student needs, support specialized education (e.g., STEM or vocational training), and prevent overcrowding, which often detracts from learning quality (UNESCO, 2019).
  
- **Student Seat Availability**: Adequate seating capacity in classrooms is essential for accommodating all students and avoiding overcrowded learning environments, which can negatively impact engagement, safety, and comfort. Sufficient facilities and physical resources indicate a region's readiness to handle current and projected student populations (World Bank, 2018).

---

### 2. **Investments and Innovation**: Innovation Activity Level, Internal and External Expenditures on R&D

#### Argumentation:
- **Innovation Activity in Education**: Regions with active innovation in education, such as the development of digital tools, new teaching methodologies, or enhanced curricula, often demonstrate improved educational outcomes. Innovation helps modernize education, making it more relevant to current labor market demands and accessible through online and blended learning solutions (OECD, 2019).

- **Internal Expenditures on R&D**: Investment in research and development, especially for educational technology, teacher training programs, and curriculum improvements, is a proven driver of educational quality. Higher R&D spending often correlates with more effective education systems, as it allows for continuous improvement based on data and evidence (European Commission, 2020).

- **External Expenditures on R&D**: External funding sources, such as grants, donations, or international aid, enable regions to access resources they may lack internally. These funds can enhance infrastructure, provide scholarships, or improve teacher training programs, particularly in regions with constrained budgets. External funding also helps maintain a level of resilience, allowing educational systems to adapt to global best practices and innovations (UNICEF, 2020).

---

### 3. **Financial Stability**: Current Revenues and Current Expenditures

#### Argumentation:
- **Current Revenues**: Stable revenue streams allow educational institutions to maintain and expand their facilities, hire qualified staff, and invest in resources and technologies. Research shows that financially stable education systems tend to deliver better quality education, as they can provide stable learning environments and cover the costs of ongoing improvements (UNESCO Institute for Statistics, 2018).

- **Current Expenditures**: Consistent and well-allocated spending ensures that facilities are maintained, educational materials are updated, and teachers are compensated adequately. Balanced expenditures also allow regions to handle maintenance costs, unforeseen challenges, or expansions as needed. This stability is essential in ensuring that quality education is consistently accessible across different socioeconomic demographics (World Bank, 2020).

---

### 4. **Environmental Factors**: Number of Students and Population Growth

#### Argumentation:
- **Number of Students and Population Growth**: Regions with rapidly growing student populations must expand their educational capacities to maintain quality. A larger student base can create opportunities for diverse educational offerings but also pressures institutions to keep up with demand. Tracking student population growth helps regions proactively address potential gaps in resources, infrastructure, and teacher availability, which could otherwise strain educational quality (OECD, 2019).
  
- **Demand-Driven Education**: A larger student population often drives demand for higher education standards and resources. Regions with a young, growing population have a greater need for progressive educational policies to create a skilled future workforce. This demand can encourage investment in new institutions, better facilities, and programs aligned with the region’s economic development goals (OECD, 2020).

---

### Summary of Contributing Factors

Based on existing research, **educational potential is heightened in regions that exhibit**:
- Adequate teacher-to-student ratios and sufficient educational institutions to serve the population.
- High levels of R&D spending and an active commitment to educational innovation.
- Financial stability through strong revenue and expenditure management, supporting consistent educational quality.
- A growing or adequately managed student population that aligns with the region’s economic and social goals.

These factors provide an objective foundation for identifying and ranking regions based on their educational potential, grounded in well-documented research and global educational practices.

In [2]:
# Educational Resources Matrix
educational_resources_matrix = np.array([
    [1, 3, 5],
    [1/3, 1, 3],
    [1/5, 1/3, 1]
])

# Investments and Innovation Matrix
investments_innovation_matrix = np.array([
    [1, 2, 2],
    [1/2, 1, 1],
    [1/2, 1, 1]
])

# Financial Stability Matrix
financial_stability_matrix = np.array([
    [1, 2],
    [1/2, 1]
])

# Environmental Factor - single criterion, so the weight is trivially 1

# Function to calculate priority weights using the Eigenvector method
def calculate_priority_weights(matrix):
    eigenvalues, eigenvectors = np.linalg.eig(matrix)
    principal_eigenvector = np.real(eigenvectors[:, np.argmax(eigenvalues)])
    priority_weights = principal_eigenvector / principal_eigenvector.sum()
    return priority_weights

# Calculate weights for each criterion group
weights_educational_resources = calculate_priority_weights(educational_resources_matrix)
weights_investments_innovation = calculate_priority_weights(investments_innovation_matrix)
weights_financial_stability = calculate_priority_weights(financial_stability_matrix)

# Assign calculated weights to criteria
criteria_weights = {
    'Teachers': weights_educational_resources[0],
    'Institutions': weights_educational_resources[1],
    'Student Seat Availability': weights_educational_resources[2],
    'Innovation Activity Level, %': weights_investments_innovation[0],
    'Internal Expenditures on R&D': weights_investments_innovation[1],
    'External Expenditures on R&D': weights_investments_innovation[2],
    'Current Revenues': weights_financial_stability[0],
    'Current Expenditures': weights_financial_stability[1],
    'Number of Students': 1.0  # Environmental factor with a weight of 1
}

# Print the assigned weights for verification
print("Calculated Weights for Each Criterion:")
for criterion, weight in criteria_weights.items():
    print(f"{criterion}: {weight}")

# Calculate the score for each region based on the criteria weights
data['Score'] = data.apply(lambda row: sum(row[criterion] * weight for criterion, weight in criteria_weights.items()), axis=1)

# Display the ranked regions with scores
print("\nRanked Regions by Educational Potential:")
print(data[['Region/City', 'Score']])

Calculated Weights for Each Criterion:
Teachers: 0.6369855717447568
Institutions: 0.2582849943744952
Student Seat Availability: 0.10472943388074796
Innovation Activity Level, %: 0.5
Internal Expenditures on R&D: 0.24999999999999994
External Expenditures on R&D: 0.25
Current Revenues: 0.6666666666666667
Current Expenditures: 0.3333333333333333
Number of Students: 1.0

Ranked Regions by Educational Potential:
                   Region/City          Score
0                         Abay    8162.965587
1                       Akmola   10518.535532
2                       Aktobe   12878.532684
3                       Almaty   20329.928892
4                       Atyrau    8223.605217
5              West Kazakhstan   10136.611363
6                      Zhambyl   18037.424227
7                      Zhetisu   10134.551932
8                    Karagandy   11224.554923
9                     Kostanay    9117.970806
10                   Kyzylorda   14595.696060
11                   Mangystau    987

In [None]:
# remove The Republic of Kazakhstan from the data, recalculate the scores and display the ranked regions
data = data[data['Region/City'] != 'The Republic of Kazakhstan']

# Normalize the teachers column with Min-Max Scaling
data['Teachers'] = min_max_scale(data[['Teachers']])
data['Institutions'] = min_max_scale(data[['Institutions']])

data['Score'] = data.apply(lambda row: sum(row[criterion] * weight for criterion, weight in criteria_weights.items()), axis=1)

# Saving
data.to_csv('data_normalized.csv', index=False)
data

Unnamed: 0,Region/City,Number of Students,Internal Expenditures on R&D,External Expenditures on R&D,"Innovation Activity Level, %",Current Revenues,Current Expenditures,Institutions,Teachers,Student Seat Availability,Score
0,Abay,0.014625,0.672628,0.0,-0.418653,0.259665,0.256543,0.323944,0.146774,0.190291,0.429172
1,Akmola,0.009492,0.592692,0.454711,-1.001069,0.312471,0.309378,0.267606,0.206452,-0.704391,0.209103
2,Aktobe,0.032167,0.584686,0.685379,1.078988,0.341473,0.344694,0.366197,0.266129,1.13908,1.615123
3,Almaty,0.008789,0.5531,0.298114,-1.195207,0.40584,0.404252,0.366197,0.454839,-0.126616,0.400348
4,Atyrau,0.00914,0.466983,0.420712,-0.80693,0.291643,0.281555,0.169014,0.148387,-0.779753,0.17239
5,West Kazakhstan,0.035683,0.539299,0.517277,-1.361612,0.301367,0.289669,0.28169,0.196774,0.963235,0.215467
6,Zhambyl,0.027773,0.661913,0.143129,-1.167473,0.39683,0.397005,0.394366,0.396774,0.961303,0.49746
7,Zhetisu,0.021515,0.34216,0.503845,0.330168,0.270707,0.271437,0.126761,0.196774,1.241495,0.957155
8,Karagandy,0.060819,0.700827,0.53768,1.522734,0.373525,0.370043,0.478873,0.224194,-0.331446,1.735959
9,Kostanay,0.020742,0.543276,0.541361,-0.113578,0.299202,0.298698,0.295775,0.170968,-2.279264,0.480738
