In [1]:
import random
import numpy as np
import pandas as pd
import pickle
import sklearn



In [2]:
print("scikit-learn version:", sklearn.__version__) #버젼을 맞춰줘야합니다 1.2.2
print("numpy version:", np.__version__) #버젼을 맞춰줘야합니다 1.24.3

scikit-learn version: 1.2.2
numpy version: 1.24.3


In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
"""
chromosome_length:
염색체의 길이를 나타낸다.
population_size:
유전 알고리즘에서 사용하는 인구 크기를 나타낸다. 이 크기만큼의 염색체들이 각 세대마다 존재하게 된다.

mutation_rate:
변이 확률을 나타낸다. 이 확률에 따라 각 염색체에 대해 변이가 발생할지를 결정한다.

crossover_rate:
교차 확률을 나타낸다. 이 확률에 따라 선택된 부모 염색체들 사이에서 교차 연산이 일어날지를 결정한다. 

max_generations:
유전 알고리즘이 실행되는 최대 세대 수를 나타낸다. 이 값을 초과하면 알고리즘이 종료된다.

df
최적화 대상이 되는 공연들의 feature들을 포함한 데이터프레임(좌석 점유율은 포함하지 않는다).
인덱스를 초기화 한 후 입력해야 한다.

option_date
조정가능한 날짜들의 데이터프레임.

model
좌석 점유율을 예측할 수 있는 모델.

entropy
적합도 식에 entropy 제약식을 추가할 것인지 결정한다.
"""

class GeneticAlgorithm:
    def __init__(self, population_size, mutation_rate, crossover_rate, max_generations, df, option_date, model, entropy = False):
        self.chromosome_length = option_date.shape[0]
        self.population_size = population_size
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.max_generations = max_generations
        self.population = self.initialize_population()
        self.df = df
        self.option_date = option_date
        self.model = model
        self.entropy = entropy
        assert self.df.shape[0] <= option_date.shape[0]
        
    def _entropy(self,pred):
        base = len(pred)
        pk = pred/np.sum(pred)
        return -np.sum(pk * np.log(pk)) / np.log(base)
    
    def initialize_population(self):
        #chromosome의 숫자는 df의 인덱스의 숫자이다.
        return [self.create_chromosome() for _ in range(self.population_size)]

    def create_chromosome(self):
        return random.sample(range(self.chromosome_length), self.chromosome_length)

    def fitness_function(self, chromosome):
        #염색체에서 option date의 행의 개수 이상의 숫자는 더미 공연을 의미한다.
        chromosome = np.array(chromosome)
        sub_df = self.df.drop(self.option_date.columns, axis = 1)
        adj_df = pd.concat([self.option_date[chromosome<self.df.shape[0]].reset_index(drop=True),
                            sub_df.loc[chromosome[chromosome<self.df.shape[0]],:].reset_index(drop=True)], axis = 1)
        adj_df = adj_df[self.df.columns]
        pred = self.model.predict(adj_df)
        modified_pred = np.where(pred > 1, 1, pred)
        if self.entropy:
            return self._entropy(modified_pred)*np.sum(modified_pred)
        else:
            return np.sum(modified_pred)

    def select_parents(self):
        # Roulette wheel selection
        fitness_values = [self.fitness_function(chromosome) for chromosome in self.population]
        total_fitness = sum(fitness_values)
        pick = random.uniform(0, total_fitness)
        current = 0
        for chromosome, fitness in zip(self.population, fitness_values):
            current += fitness
            if current > pick:
                return chromosome

    def partially_matched_crossover(self, parent1, parent2):
        size = len(parent1)
        p1, p2 = [-1] * size, [-1] * size

        # 랜덤한 범위 내에서 부모의 일부 구간 선택
        cxpoint1 = random.randint(0, size - 1)
        cxpoint2 = random.randint(0, size - 1)
        if cxpoint2 < cxpoint1:
            cxpoint1, cxpoint2 = cxpoint2, cxpoint1

        # 선택된 구간을 자식에 복사
        for i in range(cxpoint1, cxpoint2 + 1):
            p1[i] = parent2[i]
            p2[i] = parent1[i]

        # 아직 복사하지 않은 부분을 찾아서 자식에 추가
        for i in range(size):
            if parent1[i] not in p1:
                for j in range(size):
                    if p1[j] == -1:
                        p1[j] = parent1[i]
                        break
            if parent2[i] not in p2:
                for j in range(size):
                    if p2[j] == -1:
                        p2[j] = parent2[i]
                        break

        return p1, p2

    def mutate(self, chromosome):
        # Swap mutation
        if random.random() < self.mutation_rate:
            index1, index2 = random.sample(range(self.chromosome_length), 2)
            chromosome[index1], chromosome[index2] = chromosome[index2], chromosome[index1]
        return chromosome

    def run(self):
        for generation in range(self.max_generations):
            new_population = []

            while len(new_population) < self.population_size:
                parent1 = self.select_parents()
                parent2 = self.select_parents()

                if random.random() < self.crossover_rate:
                    child1, child2 = self.partially_matched_crossover(parent1, parent2)
                else:
                    child1, child2 = parent1[:], parent2[:]

                child1 = self.mutate(child1)
                child2 = self.mutate(child2)
                
                new_population.extend([child1, child2])

            self.population = new_population[:self.population_size]

            # 현재 세대의 가장 우수한 염색체의 적합도를 출력.
            best_fitness = max([self.fitness_function(chromo) for chromo in self.population])
            print(f"Generation {generation}: Best Fitness = {best_fitness}")

        return max(self.population, key=self.fitness_function)
    

In [5]:
# load the model from disk
model = pickle.load(open('../model/gbm_fin_model_sac.sav', 'rb'))
model
print(model)

GradientBoostingRegressor(learning_rate=0.14, loss='huber', n_estimators=300,
                          random_state=40)


In [59]:
df = pd.read_csv(r'../preprocessed_data/예술의 전당/모델학습및테스트데이터_최종.csv', index_col = 0)
df.reset_index(drop = True, inplace = True)
display(df)

Unnamed: 0,소요시간,관람연령,아동공연 여부,축제 여부,내한공연 여부,공연시작년도,공연시작월,공연시작일,공연시작시분,공휴일여부,...,한국작곡가협회ACL코리아,한국페스티발앙상블,한국피아노듀오협회,한국피아노학회,한양대학교,현대문화기획,현대앙상블소리,현대음악앙상블소리,화음,좌석점유율
0,90,4,0,0,0,2022,6,18,1200,0,...,0,0,0,0,0,0,0,0,0,0.954225
1,100,4,0,0,0,2019,8,30,1200,0,...,0,0,0,0,0,0,0,0,0,0.963277
2,100,4,0,0,0,2021,7,28,1170,0,...,0,0,0,0,0,0,0,0,0,0.978142
3,100,4,0,0,0,2022,7,24,840,0,...,0,0,0,0,0,0,0,0,0,0.949153
4,110,4,0,0,0,2019,7,21,1200,0,...,0,0,0,0,0,0,0,0,0,0.983051
5,100,4,0,0,0,2021,5,26,1170,0,...,0,0,0,0,0,0,0,0,0,0.983784
6,100,4,0,0,0,2022,1,28,1170,0,...,0,0,0,0,0,0,0,0,0,0.929412
7,90,4,0,0,0,2020,11,6,1170,0,...,1,0,0,0,0,1,0,0,0,0.516556
8,90,4,0,0,0,2022,5,15,840,0,...,0,0,0,0,0,0,0,0,0,0.988701
9,90,4,0,0,0,2021,2,3,1170,0,...,0,0,0,0,0,0,0,0,0,0.944


In [61]:
sub_df = df[np.logical_and(df['공연시작년도'] == 2022, df['공연시작월'].between(9,11))]
sub_df_date = sub_df[['공연시작년도','공연시작월','공연시작일' ,'공연시작시분','dayofweek_fri', 'dayofweek_sat', 'dayofweek_sun',
       'dayofweek_thu', 'dayofweek_tue', 'dayofweek_wed','공휴일여부']]
sub_df.drop(['좌석점유율'], axis = 1, inplace = True)
sub_df.reset_index(drop = True, inplace = True)

display(sub_df_date)
display(sub_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df.drop(['좌석점유율'], axis = 1, inplace = True)


Unnamed: 0,공연시작년도,공연시작월,공연시작일,공연시작시분,dayofweek_fri,dayofweek_sat,dayofweek_sun,dayofweek_thu,dayofweek_tue,dayofweek_wed,공휴일여부
15,2022,11,2,1170,0.0,0.0,0.0,0.0,0.0,1.0,0
23,2022,10,8,840,0.0,1.0,0.0,0.0,0.0,0.0,0
39,2022,9,28,1170,0.0,0.0,0.0,0.0,0.0,1.0,0
40,2022,9,4,840,0.0,0.0,1.0,0.0,0.0,0.0,0
43,2022,10,23,1200,0.0,0.0,1.0,0.0,0.0,0.0,0
46,2022,10,5,1170,0.0,0.0,0.0,0.0,0.0,1.0,0
50,2022,11,10,1170,0.0,0.0,0.0,1.0,0.0,0.0,0
57,2022,10,25,1170,0.0,0.0,0.0,0.0,1.0,0.0,0
75,2022,11,19,1200,0.0,1.0,0.0,0.0,0.0,0.0,0
76,2022,11,16,1170,0.0,0.0,0.0,0.0,0.0,1.0,0


Unnamed: 0,소요시간,관람연령,아동공연 여부,축제 여부,내한공연 여부,공연시작년도,공연시작월,공연시작일,공연시작시분,공휴일여부,...,한국작곡가협회,한국작곡가협회ACL코리아,한국페스티발앙상블,한국피아노듀오협회,한국피아노학회,한양대학교,현대문화기획,현대앙상블소리,현대음악앙상블소리,화음
0,90,4,0,1,0,2022,11,2,1170,0,...,0,0,0,0,0,0,1,0,0,0
1,80,4,0,0,0,2022,10,8,840,0,...,0,0,0,0,0,0,0,0,0,0
2,90,4,0,0,0,2022,9,28,1170,0,...,0,0,0,0,0,0,0,0,0,0
3,95,4,0,0,0,2022,9,4,840,0,...,0,0,0,0,0,0,0,0,0,0
4,115,4,0,1,0,2022,10,23,1200,0,...,0,0,0,0,0,0,0,0,0,0
5,90,4,0,0,0,2022,10,5,1170,0,...,0,0,0,0,0,0,0,0,0,0
6,90,4,0,0,0,2022,11,10,1170,0,...,0,0,0,0,0,0,0,0,0,0
7,80,4,0,0,0,2022,10,25,1170,0,...,0,0,0,0,0,0,0,0,0,0
8,90,4,0,0,0,2022,11,19,1200,0,...,0,0,0,0,0,0,0,0,0,0
9,90,4,0,0,0,2022,11,16,1170,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
input_arr = [
    [2022,9,9,1170,1,0,0,0,0,0,0],
    [2022,9,10,840,0,1,0,0,0,0,0],
    [2022,9,10,1200,0,1,0,0,0,0,0],
    [2022,9,11,840,0,0,1,0,0,0,0],
    [2022,9,11,1200,0,0,1,0,0,0,0],
    [2022,9,13,1170,0,0,0,0,0,1,0],
    [2022,9,18,840,0,0,1,0,0,0,0],
    [2022,9,24,840,0,1,0,0,0,0,0],
    [2022,9,25,1200,0,0,1,0,0,0,0],
    [2022,10,29,840,0,1,0,0,0,0,0],
    [2022,11,5,840,0,1,0,0,0,0,0],
    [2022,11,5,1200,0,1,0,0,0,0,0],
    [2022,11,8,1170,0,0,0,0,0,1,0],
    [2022,11,26,840,0,1,0,0,0,0,0]
]
add_df = pd.DataFrame(input_arr,columns = sub_df_date.columns)
sub_df_date = pd.concat([sub_df_date, add_df], axis = 0)
sub_df_date.sort_values(by = ['공연시작월','공연시작일'], inplace = True)
sub_df_date.reset_index(drop = True, inplace = True)
display(sub_df_date)

Unnamed: 0,공연시작년도,공연시작월,공연시작일,공연시작시분,dayofweek_fri,dayofweek_sat,dayofweek_sun,dayofweek_thu,dayofweek_tue,dayofweek_wed,공휴일여부
0,2022,9,1,1170,0.0,0.0,0.0,1.0,0.0,0.0,0
1,2022,9,2,1170,1.0,0.0,0.0,0.0,0.0,0.0,0
2,2022,9,3,840,0.0,1.0,0.0,0.0,0.0,0.0,0
3,2022,9,3,1200,0.0,1.0,0.0,0.0,0.0,0.0,0
4,2022,9,4,840,0.0,0.0,1.0,0.0,0.0,0.0,0
5,2022,9,4,1200,0.0,0.0,1.0,0.0,0.0,0.0,0
6,2022,9,6,1170,0.0,0.0,0.0,0.0,1.0,0.0,0
7,2022,9,7,1170,0.0,0.0,0.0,0.0,0.0,1.0,0
8,2022,9,8,1170,0.0,0.0,0.0,1.0,0.0,0.0,0
9,2022,9,9,1170,1.0,0.0,0.0,0.0,0.0,0.0,0


In [10]:
genetic = GeneticAlgorithm(population_size = 100, mutation_rate = 0.01, crossover_rate = 0.8, max_generations = 50, df = sub_df, 
                          option_date = sub_df_date, model = model)
genetic.run()

Generation 0: Best Fitness = 78.35981044316941
Generation 1: Best Fitness = 78.40822316263834
Generation 2: Best Fitness = 78.3383110301805
Generation 3: Best Fitness = 78.35132859933938
Generation 4: Best Fitness = 78.44908143512042
Generation 5: Best Fitness = 78.36362309695006
Generation 6: Best Fitness = 78.37494436271415
Generation 7: Best Fitness = 78.44667406979173
Generation 8: Best Fitness = 78.54610434670361
Generation 9: Best Fitness = 78.54610434670361
Generation 10: Best Fitness = 78.54610434670361
Generation 11: Best Fitness = 78.48316354374394
Generation 12: Best Fitness = 78.5010826464329
Generation 13: Best Fitness = 78.32806650239337
Generation 14: Best Fitness = 78.41750444152395
Generation 15: Best Fitness = 78.41750444152395
Generation 16: Best Fitness = 78.44624615307694
Generation 17: Best Fitness = 78.44624615307694
Generation 18: Best Fitness = 78.43447121668257
Generation 19: Best Fitness = 78.54627330078816
Generation 20: Best Fitness = 78.33780866598262
Gene

[3,
 88,
 96,
 27,
 70,
 91,
 52,
 47,
 94,
 84,
 20,
 45,
 65,
 40,
 99,
 5,
 15,
 1,
 101,
 75,
 44,
 30,
 26,
 8,
 74,
 10,
 58,
 68,
 69,
 67,
 36,
 56,
 59,
 87,
 0,
 82,
 29,
 83,
 23,
 17,
 55,
 33,
 31,
 57,
 49,
 77,
 90,
 34,
 93,
 32,
 53,
 85,
 38,
 21,
 48,
 80,
 78,
 42,
 86,
 14,
 43,
 62,
 7,
 60,
 22,
 39,
 89,
 102,
 63,
 19,
 12,
 103,
 51,
 76,
 37,
 35,
 73,
 50,
 28,
 97,
 98,
 9,
 13,
 41,
 18,
 25,
 66,
 92,
 81,
 11,
 95,
 4,
 2,
 71,
 16,
 24,
 6,
 79,
 54,
 72,
 46,
 64,
 61,
 100]

In [9]:
df[np.logical_and(df['공연시작년도'] == 2022, df['공연시작월'].between(9,11))]['좌석점유율'].sum()

76.09311789280137

In [65]:
best = np.array([3, 88, 96, 27, 70, 91, 52, 47, 94, 84, 20, 45, 65, 40, 99, 5, 15, 1, 101, 75, 44, 30, 26, 8, 74, 10, 58, 68, 69, 67, 36, 56, 59, 87, 0, 82, 29, 83, 23, 17, 55, 33, 31, 57, 49, 77, 90, 34, 93, 32, 53, 85, 38, 21, 48, 80, 78, 42, 86, 14, 43, 62, 7, 60, 22, 39, 89, 102, 63, 19, 12, 103, 51, 76, 37, 35, 73, 50, 28, 97, 98, 9, 13, 41, 18, 25, 66, 92, 81, 11, 95, 4, 2, 71, 16, 24, 6, 79, 54, 72, 46, 64, 61, 100])
sdf = sub_df.drop(sub_df_date.columns, axis = 1)
adj_df = pd.concat([sub_df_date[best<sub_df.shape[0]].reset_index(drop=True),
                    sdf.loc[best[best<sub_df.shape[0]],:].reset_index(drop=True)], axis = 1)
adj_df = adj_df[sub_df.columns]
pred = model.predict(adj_df)
modified_pred = np.where(pred > 1, 1, pred)

In [66]:
print('변경된 좌석점유율 :')
print(modified_pred)
print('변경된 좌석점유율 합 :')
print(modified_pred.sum())

변경된 좌석점유율 :
[0.97413213 0.91547699 0.76038829 0.95204225 0.9478346  0.94091682
 0.73122611 0.93673915 0.94012317 0.94810331 0.96539892 0.62133179
 0.92736313 0.6040026  0.94612848 0.95563183 0.96027237 0.54424014
 0.95349079 0.95542804 0.92964914 0.7715124  0.95180952 0.68558763
 0.94265569 0.72404682 0.51780415 0.93621272 0.93798921 0.49997305
 0.96488493 0.94132489 0.56340093 0.9339859  0.90836078 0.92623319
 0.95194074 0.92004358 0.93952351 0.61746266 0.92897689 0.93794924
 0.92399871 0.93049804 0.92971783 0.93989578 0.9470146  0.86527088
 0.93835894 0.92775381 0.49156142 0.94396715 0.96040298 0.94671408
 0.93793804 0.93355147 0.93816945 0.93729319 0.92948726 0.96914018
 0.94085819 0.96703369 0.9399765  0.91637723 0.91852503 0.91251125
 0.94695053 0.87642534 0.95192443 0.92798642 0.56840472 0.44971769
 0.93771598 0.94561304 0.94443036 0.93355147 0.9181031  0.92571166
 0.64380154 0.95840288 0.9470146  0.9158125  0.93946324 0.9497178
 0.93076881 0.95792131 0.93913389 0.94427943 0.4304