In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
from sklearn.preprocessing import TargetEncoder

In [None]:
# Location of the raw movie table, relative to the notebook.
DATASET_PATH = './movie_dataset.csv'

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(DATASET_PATH, index_col=0)

In [None]:
# A budget of 0 is turned into NaN so the dropna below removes those rows too
# (presumably 0 is a placeholder for "unknown" -- confirm against the data source).
# `np.nan` is the canonical spelling and exists on every NumPy version, which is
# not guaranteed for the upper-case alias.
df['budget'] = df['budget'].replace({0: np.nan})

# Rows missing any of these fields are discarded.
required_columns = [
    'budget', 'mpaa', 'director', 'genres', 'domestic_distributor',
    'production_companies', 'production_countries', 'spoken_languages',
]
df = df.dropna(subset=required_columns)

# Keep only released films; 'status' is constant after the filter, so drop it.
df = df[df['status'] == 'Released'].drop(columns=['status'])

In [None]:
# Columns whose cells are text holding Python literals; each is parsed in place.
cols = ['genres', 'production_countries', 'production_companies', 'actors', 'spoken_languages']

# Maps column name -> array of distinct labels found in that column.
mlb_classes = {}
for col in cols:
    # Parse column-wise: only this one column is read, so there is no need to
    # materialise every full row the way DataFrame.apply(axis=1) does.
    df[col] = df[col].apply(literal_eval)

    # Only the label vocabulary is needed here, so fit() suffices; fit_transform()
    # would also build an indicator matrix that is immediately thrown away.
    mlb = MultiLabelBinarizer()
    mlb.fit(df[col])
    mlb_classes[col] = mlb.classes_
    print(col, len(mlb.classes_))

In [None]:
# Multi-hot encode the genre lists into boolean columns named `genre_<label>`
# (label lower-cased, spaces replaced by underscores).
mlb = MultiLabelBinarizer()
dum = mlb.fit_transform(df['genres'])
classes = ['genre_' + label.lower().replace(' ', '_') for label in mlb.classes_]
genre_flags = pd.DataFrame(data=dum.astype(bool), index=df.index, columns=classes)
df = df.join(genre_flags)

In [None]:
# Gather the labels of the fitted binarizer into a set and report them.
genres = {*mlb.classes_}
genre_count = len(genres)
print('Количество жанров:', genre_count)
print('Жанры: ', genres)

In [None]:
# The list-valued 'genres' column is no longer needed once the genre_* flags exist.
df = df.drop(columns='genres')

In [None]:
# Spread each film's actor list over three positional columns, then drop the list.
actor_columns = ['actor_1', 'actor_2', 'actor_3']
actor_frame = pd.DataFrame(df['actors'].tolist(), index=df.index, columns=actor_columns)
df = df.join(actor_frame).drop(columns='actors')

Выделяем год, месяц и день выхода фильма

In [None]:
from dateutil.parser import parse

# Parse every release date exactly once; year, month and day are then read from
# the same parsed value instead of re-parsing the string three times per row.
release_dates = df['release_date'].map(parse)

# Append the three date parts as new columns and drop the raw date string.
df = df.assign(
    release_year=release_dates.map(lambda moment: moment.year),
    release_month=release_dates.map(lambda moment: moment.month),
    release_day=release_dates.map(lambda moment: moment.day),
).drop(columns=['release_date'])

In [None]:
# Keep only films for which all three actor slots are filled.
df = df.dropna(subset=['actor_1', 'actor_2', 'actor_3'])

In [None]:
# Reduce each list-valued column to its first entry.
# An empty list raises IndexError here.
for list_col in ('production_companies', 'production_countries', 'spoken_languages'):
    df[list_col] = df[list_col].map(lambda entries: entries[0])

In [None]:
def target_encoding_col(col):
    """Target-encode one column of the notebook-global ``df`` against ``df['revenue']``.

    Parameters
    ----------
    col : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    The result of ``TargetEncoder.fit_transform`` for that single feature,
    using ``target_type='continuous'`` and ``smooth="auto"``.
    """
    # The encoder is given a single-column 2-D array of the feature values.
    feature = df[col].values.reshape(-1, 1)
    target = df['revenue'].values

    encoder = TargetEncoder(target_type='continuous', smooth="auto")
    return encoder.fit_transform(feature, target)


# Overwrite each categorical column with its target-encoded values.
for encoded_col in (
    'production_companies',
    'production_countries',
    'spoken_languages',
    'actor_1',
    'actor_2',
    'actor_3',
):
    df[encoded_col] = target_encoding_col(encoded_col)


In [None]:
# Columns removed from the table before modelling.
unused_columns = [
    'id',
    'imdb_id',
    'tagline',
    'overview',
    'vote_average',
    'vote_count',
    'domestic_opening',
]
df = df.drop(columns=unused_columns)

In [None]:
from keras import layers
from keras.models import Model

def build_model(input_shape):
    """Build and compile a fully connected regression network.

    The network stacks five ReLU ``Dense`` layers (32, 64, 128, 64, 32 units)
    followed by a single-unit ``Dense`` output with no activation.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, forwarded to ``layers.Input(shape=...)``.

    Returns
    -------
    Model
        The model, compiled with MSE loss, the RMSprop optimizer and MAE as
        an additional metric.
    """
    # Named `inputs` rather than `input` so the builtin is not shadowed.
    inputs = layers.Input(shape=input_shape)

    hidden = inputs
    for units in (32, 64, 128, 64, 32):
        hidden = layers.Dense(units, activation='relu')(hidden)
    output = layers.Dense(1)(hidden)

    # To define a model, just specify its input and output layers
    model = Model(inputs=inputs, outputs=output)

    # 'mse' is the loss and 'rmsprop' the optimizer; passing them the other way
    # round makes compile() fail because neither identifier can be resolved.
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])

    return model

In [None]:


# batch_size = 32 # in each iteration, we consider 32 training examples at once
# num_epochs = 200 # we iterate 200 times over the entire training set
# kernel_size = 3 # we will use 3x3 kernels throughout
# pool_size = 2 # we will use 2x2 pooling throughout
# conv_depth_1 = 32 # we will initially have 32 kernels per conv. layer...
# conv_depth_2 = 64 # ...switching to 64 after the first pooling layer
# drop_prob_1 = 0.25 # dropout after pooling with probability 0.25
# drop_prob_2 = 0.5 # dropout in the dense layer with probability 0.5
# hidden_size = 512 # the dense layer will have 512 neurons


# model = build_model()

# # Настраиваем сохранение лучшей модели
# checkpoint_path = "./best-model.keras"
# checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)
# # Настраиваем callback для ранней остановки модели
# early_stopping = EarlyStopping(monitor="val_loss", min_delta=0.01, patience=4, verbose=1)

# model.fit(X_train, Y_train, # Train the model using the training set...

#           batch_size=batch_size, epochs=num_epochs,

#           verbose=1, validation_split=0.1) # ...holding out 10% of the data for validation

# model.evaluate(X_test, Y_test, verbose=1) # Evaluate the trained model on the test set!


In [None]:
# def dict_list_to_df(s, col):
#     rows = []
#     for index, row in s.items():
#         for item in row:
#             rows.append(item)
#     df = pd.DataFrame(rows)
#     return df


# df_genres = df['genres'].apply(lambda i: literal_eval(i))
# df_genres = dict_list_to_df(df_genres, 'genres')
# df_unique_genres = df_genres.drop_duplicates()
# df_unique_genres.set_index('id', inplace=True)

In [None]:
# def sortArray(column):
#     def f(row):
#         l = row[column]
#         l.sort()
#         return l

#     return f

# df['production_companies'] = df.apply(sortArray('production_companies'), axis=1)
# df['production_countries'] = df.apply(sortArray('production_countries'), axis=1)
# df['spoken_languages'] = df.apply(sortArray('spoken_languages'), axis=1)
