# Graniczna Analiza Danych
### Kacper Perz 145261, Maciej Wieczorek 148141

In [None]:
import pandas as pd
import numpy as np
import pulp

In [None]:
# Load the two semicolon-separated tables. The first column of each file is
# the one pandas labels 'Unnamed: 0'; it is renamed to 'city' and used as the
# join key below.
inputs = pd.read_csv('inputs.csv', sep=';').rename(columns={'Unnamed: 0': 'city'})
outputs = pd.read_csv('outputs.csv', sep=';').rename(columns={'Unnamed: 0': 'city'})

# One row per city: the columns of `inputs` followed by the columns of `outputs`.
airports_df = inputs.merge(outputs, on='city')

# Split by column-name prefix: names starting with 'i' vs. names starting with 'o'.
airports_input_df = airports_df.filter(regex='^i')
airports_output_df = airports_df.filter(regex='^o')

print(airports_df)
print(airports_input_df)
print(airports_output_df)

#### Notacja
Dane (wartości stałe):
- $K$ – liczba jednostek, $(k = 1, 2, \dots, K)$
- $M$ – liczba nakładów (wejść) w analizowanym problemie $(m =1, 2, \dots ,M)$
- $N$ – liczba efektów (wyjść) w analizowanym problemie $(n =1, 2, \dots ,N)$
- $x_{mk}$ – wartość m-tego wejścia k-tej jednostki
- $y_{nk}$ – wartość n-tego wyjścia k-tej jednostki

Zmienne:
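- $\nu_m$ – waga m-tego nakładu,
- $\mu_n$ – waga n-tego efektu,
- $\theta$ – efektywność analizowanej jednostki (minimalizowana w modelu zorientowanym na nakłady),
- $\lambda_k$ – współczynnik udziału k-tej jednostki w hipotetycznej jednostce porównawczej.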

## Efektywność

In [None]:
def get_lp_problem(M, N, K, x, y, x_o, y_o):
    """Build the LP that minimises ``theta`` for one evaluated unit.

    The problem has one non-negative variable ``theta`` and one non-negative
    variable ``l_k`` per unit ``k``.

    Args:
        M: number of inputs.
        N: number of outputs.
        K: number of units.
        x: input values, indexed as ``x[k][m]``.
        y: output values, indexed as ``y[k][n]``.
        x_o: inputs of the evaluated unit, indexed as ``x_o[m]``.
        y_o: outputs of the evaluated unit, indexed as ``y_o[n]``.

    Returns:
        The unsolved ``pulp.LpProblem``.
    """
    problem = pulp.LpProblem("Minimize_theta", pulp.LpMinimize)

    theta = pulp.LpVariable("theta", lowBound=0)
    unit_lambdas = pulp.LpVariable.dicts("l", range(K), lowBound=0)

    # Objective function
    problem += theta

    units = range(K)

    # The lambda-weighted combination may use at most theta times each input
    # of the evaluated unit ...
    for m in range(M):
        combined_input = pulp.lpSum(unit_lambdas[k] * x[k][m] for k in units)
        problem += combined_input <= theta * x_o[m], f"Input_constraint_{m}"

    # ... while producing at least each of its outputs.
    for n in range(N):
        combined_output = pulp.lpSum(unit_lambdas[k] * y[k][n] for k in units)
        problem += combined_output >= y_o[n], f"Output_constraint_{n}"

    return problem

In [None]:

def get_effectiveness(input_df, output_df, df, i):
    """Solve the efficiency LP for the unit in row ``i``.

    Args:
        input_df: DataFrame holding only the input columns, one row per unit.
        output_df: DataFrame holding only the output columns, one row per unit.
        df: DataFrame whose length gives the number of units.
        i: positional index of the evaluated unit.

    Returns:
        A tuple ``(theta, lambda_values)`` where ``theta`` is the optimal
        objective value and ``lambda_values`` is a list with exactly
        ``len(df)`` entries, entry ``k`` being the solved value of ``l_k``.
    """
    unit_count = len(df)
    lp = get_lp_problem(
        len(input_df.columns),
        len(output_df.columns),
        unit_count,
        input_df.to_numpy(),
        output_df.to_numpy(),
        input_df.iloc[i].to_numpy(),
        output_df.iloc[i].to_numpy(),
    )

    lp.solve()

    # Collect the lambdas by their numeric suffix.  The name is matched
    # strictly as "l_<digits>" so that no other variable whose name merely
    # contains the letter "l" can be picked up by accident.
    solved_lambdas = {}
    for variable in lp.variables():
        prefix, _, suffix = variable.name.partition('_')
        if prefix == 'l' and suffix.isdigit():
            solved_lambdas[int(suffix)] = variable.value()

    # Always return one entry per unit, in unit order.  A lambda that is not
    # reported by lp.variables() is treated as 0.0 so positions stay aligned.
    lambda_values = [solved_lambdas.get(k, 0.0) for k in range(unit_count)]

    return lp.objective.value(), lambda_values

In [None]:
# Efficiency score and lambda coefficients for every unit, one row per city.
unit_count = len(airports_df)

effectiveness_df = airports_df[['city']].copy()
# Initialise with floats: the solver results written below are floats, and
# writing a float into an integer column is a deprecated dtype-incompatible
# assignment in recent pandas versions.
effectiveness_df['effectiveness'] = 0.0
for k in range(unit_count):
    effectiveness_df[f'l{k}'] = 0.0

# NOTE: `.loc[i, ...]` is label-based; this relies on airports_df having the
# default 0..K-1 index, which is what the merge that creates it produces.
for i in range(unit_count):
    effectiveness, lambda_values = get_effectiveness(airports_input_df, airports_output_df, airports_df, i)
    effectiveness_df.loc[i, 'effectiveness'] = effectiveness
    for k, lambda_value in enumerate(lambda_values):
        effectiveness_df.loc[i, f'l{k}'] = lambda_value

effectiveness_df.round(3)

In [None]:
# Units with an efficiency score below 1 are inefficient.  The scores come
# from a numerical LP solver, so a small tolerance is used: without it a unit
# whose score is e.g. 0.9999999 would be classified as inefficient.
EFFICIENCY_TOLERANCE = 1e-6
ineffective_df = effectiveness_df[effectiveness_df['effectiveness'] < 1 - EFFICIENCY_TOLERANCE].copy()
ineffective_df.round(3)

## Hipotetyczna jednostka porównawcza

In [None]:
# Hypothetical comparison unit (HCU) of every inefficient unit: the
# lambda-weighted combination of all units' inputs and of all units' outputs.
#
# The lambda columns are taken as one float matrix (rows = inefficient units,
# columns = l0..l{K-1}) so both products are plain numeric matrix products,
# and the module-level `inputs` / `outputs` DataFrames are left untouched.
lambda_matrix = ineffective_df.filter(regex='^l').to_numpy(dtype=np.float64)
hcu_inputs = lambda_matrix @ airports_input_df.to_numpy()
hcu_outputs = lambda_matrix @ airports_output_df.to_numpy()

hcu_df = ineffective_df[['city']].copy()
for m in range(hcu_inputs.shape[1]):
    hcu_df[f'hcu_x{m}'] = hcu_inputs[:, m]
for n in range(hcu_outputs.shape[1]):
    hcu_df[f'hcu_y{n}'] = hcu_outputs[:, n]

hcu_df.round(3)

In [None]:
# Display hcu_df at full precision (no rounding applied).
hcu_df

In [None]:
# Original data of the cities listed in hcu_df, rounded for display.
airports_df.loc[airports_df['city'].isin(hcu_df['city'])].round(3)

In [None]:
# Input corrections of the units listed in hcu_df: actual inputs minus the
# inputs of their hypothetical comparison unit.
#
# Columns are selected by name (input columns / 'hcu_x*' columns) rather than
# by position, so the result does not depend on how many output columns the
# tables contain.  Rows of both operands follow the order of airports_df.
needs_correction = airports_df['city'].isin(hcu_df['city'])
actual_inputs = airports_input_df[needs_correction].to_numpy(dtype=np.float64)
target_inputs = hcu_df.filter(regex='^hcu_x').to_numpy(dtype=np.float64)
input_corrections_df = pd.DataFrame(actual_inputs - target_inputs)
input_corrections_df.round(3)

## Superefektywność

In [None]:
def get_se_lp_problem(M, N, K, x, y, x_o, y_o, i):
    """Build the super-efficiency LP for unit ``i``.

    Same model as the plain efficiency LP (minimise ``theta``), except that
    the evaluated unit ``i`` is left out of the reference set, i.e. it takes
    part in neither the input nor the output combination.

    Args:
        M: number of inputs.
        N: number of outputs.
        K: number of units.
        x: input values, indexed as ``x[k][m]``.
        y: output values, indexed as ``y[k][n]``.
        x_o: inputs of the evaluated unit, indexed as ``x_o[m]``.
        y_o: outputs of the evaluated unit, indexed as ``y_o[n]``.
        i: index of the evaluated unit, excluded from the reference set.

    Returns:
        The unsolved ``pulp.LpProblem``.
    """
    lp = pulp.LpProblem("Minimize_theta", pulp.LpMinimize)

    # Reference set: every unit except the evaluated one.  Using the same set
    # in both constraint families keeps the model consistent; summing the
    # inputs over all units while excluding unit i only from the outputs gives
    # the right optimum only as long as every input value is non-negative.
    peers = [k for k in range(K) if k != i]

    theta = pulp.LpVariable("theta", lowBound=0)
    lambda_vars = pulp.LpVariable.dicts("l", peers, lowBound=0)

    # Objective function
    lp += theta

    # Add constraints
    for m in range(M):
        lp += pulp.lpSum(lambda_vars[k] * x[k][m] for k in peers) <= theta * x_o[m], f"Input_constraint_{m}"

    for n in range(N):
        lp += pulp.lpSum(lambda_vars[k] * y[k][n] for k in peers) >= y_o[n], f"Output_constraint_{n}"

    return lp

In [None]:

def get_super_effectiveness(input_df, output_df, df, i):
    """Solve the super-efficiency LP for the unit in row ``i``.

    Args:
        input_df: DataFrame holding only the input columns, one row per unit.
        output_df: DataFrame holding only the output columns, one row per unit.
        df: DataFrame whose length gives the number of units.
        i: positional index of the evaluated unit.

    Returns:
        The objective value of the solved problem.
    """
    input_count = len(input_df.columns)
    output_count = len(output_df.columns)
    unit_count = len(df)

    all_inputs = input_df.to_numpy()
    all_outputs = output_df.to_numpy()
    evaluated_inputs = input_df.iloc[i].to_numpy()
    evaluated_outputs = output_df.iloc[i].to_numpy()

    problem = get_se_lp_problem(
        input_count, output_count, unit_count,
        all_inputs, all_outputs,
        evaluated_inputs, evaluated_outputs,
        i,
    )
    problem.solve()

    return problem.objective.value()

In [None]:
# Super-efficiency score of every unit, one row per city.
#
# The column is built in a single assignment from the list of solver results,
# so it gets a float dtype directly instead of starting as an integer column
# that is then overwritten cell by cell with floats.
super_effectiveness_df = airports_df[['city']].copy()
super_effectiveness_df['super_effectiveness'] = [
    get_super_effectiveness(airports_input_df, airports_output_df, airports_df, i)
    for i in range(len(airports_df))
]

super_effectiveness_df.round(3)

In [None]:
# Print one LaTeX table row per city: "<city> & <score> \\".
for city, score in zip(super_effectiveness_df['city'], super_effectiveness_df['super_effectiveness']):
    print(f'{city} & {score} \\\\')

In [None]:
# Print the ranking as a LaTeX preference chain, highest score first.
# The separator is placed only between cities, so the chain does not end
# with a dangling "\succ".
ranking = super_effectiveness_df.sort_values('super_effectiveness', ascending=False)
print('\\succ'.join(f' {city} ' for city in ranking['city']))

## Efektywność krzyżowa

In [None]:
# Problem dimensions: units are rows, inputs/outputs are columns.
num_dmus, num_inputs = airports_input_df.shape
num_outputs = airports_output_df.shape[1]
print(num_dmus, num_inputs, num_outputs)

def solve_dea(dmu_idx):
    """Find the input/output weights that maximise the score of one unit.

    Reads the module-level ``airports_input_df`` and ``airports_output_df``.
    The LP maximises the weighted output of unit ``dmu_idx`` subject to:
    for every unit, weighted output <= weighted input; and the weighted input
    of unit ``dmu_idx`` equals 1.

    Args:
        dmu_idx: positional index of the evaluated unit.

    Returns:
        A tuple ``(input_weights, output_weights)`` of lists with one solved
        weight per input column and per output column respectively.
    """
    inputs = airports_input_df.to_numpy()
    outputs = airports_output_df.to_numpy()
    num_dmus, num_inputs = inputs.shape
    num_outputs = outputs.shape[1]

    prob = pulp.LpProblem(f"DEA_DMU_{dmu_idx}", pulp.LpMaximize)

    # One weight per input / output COLUMN.  Sizing these lists by the number
    # of rows would create unused variables (solved value None) and would
    # raise IndexError whenever there are fewer units than columns.
    input_weights = [pulp.LpVariable(f"u{m}", lowBound=0) for m in range(num_inputs)]
    output_weights = [pulp.LpVariable(f"v{n}", lowBound=0) for n in range(num_outputs)]

    # Objective: weighted output of the evaluated unit.
    prob += pulp.lpSum([output_weights[n] * outputs[dmu_idx, n] for n in range(num_outputs)])

    # Add Constraints
    for k in range(num_dmus):
        prob += (pulp.lpSum([output_weights[n] * outputs[k, n] for n in range(num_outputs)]) <=
                 pulp.lpSum([input_weights[m] * inputs[k, m] for m in range(num_inputs)]))

    # Normalisation: the weighted input of the evaluated unit is fixed to 1.
    prob += pulp.lpSum([input_weights[m] * inputs[dmu_idx, m] for m in range(num_inputs)]) == 1

    # Solve the problem
    prob.solve()

    return [pulp.value(var) for var in input_weights], [pulp.value(var) for var in output_weights]

# The data matrices do not change between units, so they are built once.
# Dedicated names are used so the module-level `inputs` / `outputs`
# DataFrames are not overwritten.
input_matrix = airports_input_df.to_numpy()
output_matrix = airports_output_df.to_numpy()
unit_count = input_matrix.shape[0]

efficiencies = np.zeros(unit_count)
cross_efficiencies = np.zeros((unit_count, unit_count))

for i in range(unit_count):
    u, v = solve_dea(i)
    # Only the leading entries of the returned lists correspond to the
    # input / output columns; anything beyond them is ignored.
    input_weights = np.asarray(u[:input_matrix.shape[1]], dtype=np.float64)
    output_weights = np.asarray(v[:output_matrix.shape[1]], dtype=np.float64)

    # Row i: every unit scored with the weights that are optimal for unit i
    # (weighted output sum divided by weighted input sum).
    cross_efficiencies[i] = (output_matrix @ output_weights) / (input_matrix @ input_weights)
    # The unit's own efficiency is the diagonal entry.
    efficiencies[i] = cross_efficiencies[i, i]


# Output results
print("Efficiencies:", efficiencies)
print("Cross-Efficiencies:\n", cross_efficiencies.round(3))

# Average cross-efficiency of each unit: the mean of its column.
for mean_score in cross_efficiencies.mean(axis=0):
    print(mean_score.round(3))

## Rozkład efektywności

In [None]:
# Load the weight samples (semicolon-separated file).
weights_df = pd.read_csv('samples_homework.csv', sep=';')

In [None]:
# Table with one row per city: a sample counter for each efficiency interval
# plus the 'EE' column.
buckets_df = airports_df[['city']].copy()
for bucket_label in ('0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1.0'):
    buckets_df[bucket_label] = 0
# 'EE' is later filled with a mean, i.e. a float, so it starts as a float
# column; the interval counters stay integers.
buckets_df['EE'] = 0.0

In [None]:
# Display the merged input/output table.
airports_df

In [None]:
# Efficiency of every unit under every sampled weight vector:
#   sample_e[s, k] = (weighted output sum of unit k) / (weighted input sum of unit k)
# using the weights of sample s.
#
# The weight columns are selected by the data column names, so each weight is
# multiplied with the input/output of the same name.  The whole table is then
# obtained from two matrix products instead of one pandas dot product per
# (sample, unit) pair.
sampled_input_weights = weights_df[airports_input_df.columns].to_numpy(dtype=np.float64)
sampled_output_weights = weights_df[airports_output_df.columns].to_numpy(dtype=np.float64)

weighted_inputs = sampled_input_weights @ airports_input_df.to_numpy(dtype=np.float64).T
weighted_outputs = sampled_output_weights @ airports_output_df.to_numpy(dtype=np.float64).T

sample_e = weighted_outputs / weighted_inputs
# Normalize every sample (row) so that its best unit scores exactly 1.
sample_e = sample_e / sample_e.max(axis=1, keepdims=True)

In [None]:
# sample_e has one row per sample and one column per unit, so reducing along
# axis 0 yields one value per unit, in the row order of buckets_df.
# Each column is assigned as a whole, which avoids per-cell writes (and
# writing float means into an integer column).
buckets_df['0-0.2'] = (sample_e < 0.2).sum(axis=0)
buckets_df['0.2-0.4'] = ((sample_e >= 0.2) & (sample_e < 0.4)).sum(axis=0)
buckets_df['0.4-0.6'] = ((sample_e >= 0.4) & (sample_e < 0.6)).sum(axis=0)
buckets_df['0.6-0.8'] = ((sample_e >= 0.6) & (sample_e < 0.8)).sum(axis=0)
# The last interval is closed on the right: it includes scores equal to 1.
buckets_df['0.8-1.0'] = (sample_e >= 0.8).sum(axis=0)
# Mean normalized efficiency over all samples.
buckets_df['EE'] = sample_e.mean(axis=0)
buckets_df

In [None]:
# Print one LaTeX table row per city: the city, the five interval counters and
# EE rounded to three decimals, separated by " & " and terminated by " \\".
bucket_labels = ['0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1.0']
for _, row in buckets_df.iterrows():
    cells = [row['city']]
    cells.extend(row[label] for label in bucket_labels)
    cells.append(round(row['EE'], 3))
    print(' & '.join(str(cell) for cell in cells) + ' \\\\')

In [None]:
# Print the ranking by EE as a LaTeX preference chain, highest value first.
# The separator is placed only between cities, so the chain does not end
# with a dangling "\succ".
ee_ranking = buckets_df.sort_values('EE', ascending=False)
print('\\succ'.join(f' {city} ' for city in ee_ranking['city']))