In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import shap
from tqdm.notebook import tnrange
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, RandomizedSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score as f1

### [Target Assignment]

In [None]:
# Video names and QP values that identify the per-video feature / class CSV files.
videos = ['Lips', 'SunBath', 'Netflix_Dancers', 'BuildingHall', 'ToddlerFountain',
          'Touchdown_pass','Jockey_1080p', 'Beauty_1080p', 'RushFieldCuts', 'Netflix_TunnelFlag',
          'Vidyo4', 'Dark', 'NetflixDinnerScene', 'KristenAndSara', 'Netflix_DrivingPOV']

videos.reverse()
qps = ['22','27','32','37']

# (CU_width, CU_height) pairs of the rows kept in this dataset.
cu_shapes = [(32, 16), (16, 32), (64, 8), (8, 64)]

# Iterate with `tnrange`, the progress-bar helper this notebook imports; a bare
# `tqdm` name is not imported and raises NameError on a fresh kernel.
for v in tnrange(len(videos)):
    video = videos[v]
    for qp in qps:
        df_features = pd.read_csv('../../VTM-9.0/features/dataset_' + video + '_' + qp + '_features.csv')
        df_features = df_features.drop(columns = ['cost','qtdepth', 'mtdepth']).drop_duplicates()

        shape_mask = pd.Series(False, index = df_features.index)
        for width, height in cu_shapes:
            shape_mask |= (df_features['CU_width'] == width) & (df_features['CU_height'] == height)
        # Copy so the labels are written to an independent frame rather than a slice.
        df_features = df_features[shape_mask].copy()

        # Every row starts at 0, so the column exists even when the classes file is empty.
        # Kept as float so the CSV is written with the same '0.0' / '1.0' values as before.
        df_features['target'] = 0.0

        df_target = pd.read_csv('../datasets/classes/' + video + '_' + qp + '_classes.csv')

        for _, row in df_target.iterrows():
            same_frame = ((df_features['POC'] == row['POC']) &
                          (df_features['paramQP'] == row['paramQP']))

            # Feature rows whose box encloses the class row's box are labelled 1 ...
            encloses = (same_frame &
                        (df_features['topLeft_x'] <= row['topLeft_x']) &
                        (df_features['topLeft_y'] <= row['topLeft_y']) &
                        (df_features['bottomRight_x'] >= row['bottomRight_x']) &
                        (df_features['bottomRight_y'] >= row['bottomRight_y']))
            df_features.loc[encloses, 'target'] = 1

            # ... except the row with exactly the same box, which is reset to 0.
            # The order of these two assignments matters: the exact match is a
            # subset of the enclosing rows.
            identical = (same_frame &
                         (df_features['topLeft_x'] == row['topLeft_x']) &
                         (df_features['topLeft_y'] == row['topLeft_y']) &
                         (df_features['bottomRight_x'] == row['bottomRight_x']) &
                         (df_features['bottomRight_y'] == row['bottomRight_y']))
            df_features.loc[identical, 'target'] = 0

        df_features.to_csv('../datasets/s2/' + video + '_' + qp + '_s2.csv', index = False)

# Gather every per-video file into one CSV; concatenating once avoids re-copying
# the growing frame on each iteration.
dfAll = pd.concat([pd.read_csv('../datasets/s2/' + video + '_' + qp + '_s2.csv')
                   for video in videos for qp in qps])
dfAll.to_csv('../datasets/s2/all_s2.csv', index = False)

### [Variance Drop and RandomUnderSampling]

In [None]:
def constantColumns(X, y):
    """Drop low-variance and duplicated feature columns from X.

    Columns whose variance does not exceed 0.01 are removed with
    ``VarianceThreshold``; among the remaining columns, any column whose
    values are identical to an earlier column is removed as well.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (``X.columns`` and ``X.index`` are used).
    y : object
        Target; returned unchanged.

    Returns
    -------
    (pandas.DataFrame, object)
        The reduced feature frame, holding the filter's output values, and
        ``y``. The frame keeps the original column names and the row index
        of ``X``, so it can be index-aligned with ``y`` afterwards (e.g.
        ``pd.concat([X, y], axis=1)``).
    """
    constant_filter = VarianceThreshold(threshold = 0.01)
    xFilter = np.asarray(constant_filter.fit_transform(X))
    columnsFilter = X.columns[constant_filter.get_support()]

    # Transpose so each feature becomes a row; `duplicated` then flags every
    # feature that repeats an earlier one (the first occurrence is kept).
    featuresToKeep = ~pd.DataFrame(xFilter.T).duplicated().to_numpy()

    X = pd.DataFrame(xFilter[:, featuresToKeep],
                     columns = columnsFilter[featuresToKeep],
                     index = X.index)

    return X, y

In [None]:
# Build the balanced "horz" analysis file for each listed dataset.
datasets = ['s3', 's4', 's5', 's6'] #'s0', 's1', 's2',

for d in tnrange(len(datasets)):
    name = datasets[d]
    folder = '../datasets/' + name + '/'

    df = pd.read_csv(folder + 'all_' + name + '.csv')
    # Keep only rows whose splitType is 2 or 4, then discard the non-feature columns.
    df = df[df['splitType'].isin([2, 4])].drop(columns = ['splitType', 'videoname'])
    y = df['target']
    X = df.drop(columns = 'target')

    # Release the full frame before resampling.
    del df
    sampler = RandomUnderSampler(sampling_strategy = {0: 450000, 1: 450000})
    X, y = sampler.fit_resample(X, y)

    # Remove low-variance / duplicated feature columns, then store features and target together.
    X, y = constantColumns(X, y)
    df = pd.concat([X, y], axis = 1)
    df.to_csv(folder + 'horz_' + name + '_analisys.csv', index = False)

### [Feature Importance]

#### [Mutual Info]

In [None]:
datasets = ['s0', 's1', 's2', 's3', 's4', 's5', 's6']

# One Series of mutual-information scores per dataset, indexed by feature name.
mi_per_dataset = []
for i in tnrange(len(datasets)):
    name = datasets[i]

    data = pd.read_csv('../datasets/' + name + '/vert_' + name + '.csv')
    y = data['target']
    X = data.drop(columns = 'target')

    mi_per_dataset.append(pd.Series(mutual_info_classif(X, y), index = X.columns))

# Rows are features, columns are datasets.
mi = pd.concat(mi_per_dataset, axis = 1, sort = False)
mi.columns = datasets
mi.to_csv('../feature_importance_mutual_vertical.csv')

In [None]:
# Reload the vertical MI table; the unnamed first CSV column becomes the (unnamed) index.
mi = pd.read_csv('../feature_importance_mutual_vertical.csv').set_index('Unnamed: 0')
mi.index.name = None

# Grouped bar chart: one group per index entry, one bar per column.
ax = mi.plot.bar(figsize = (30, 15))
ax.set_ylabel('Mutual Information (MI)')
ax.set_title('Dataset - Vertical')
ax.grid()

In [None]:
# Same table, drawn as one subplot per column of `mi`.
axes = mi.plot.bar(subplots = True, figsize = (15, 15))
axes[1].legend(loc = 'upper right');

In [None]:
# Load the horizontal MI table; the unnamed first CSV column becomes the (unnamed) index.
# NOTE(review): no visible cell writes this file - confirm where it is produced.
mi = pd.read_csv('../feature_importance_mutual_horizontal.csv').set_index('Unnamed: 0')
mi.index.name = None

# Grouped bar chart: one group per index entry, one bar per column.
ax = mi.plot.bar(figsize = (30, 15))
ax.set_ylabel('Mutual Information (MI)')
ax.set_title('Dataset - Horizontal')
ax.grid()

In [None]:
# Same table, drawn as one subplot per column of `mi`.
axes = mi.plot.bar(subplots = True, figsize = (15, 15))
axes[1].legend(loc = 'upper right');

#### [SHAP]

In [None]:
df = pd.read_csv('../datasets/s0/vert_s0.csv')
y = df['target']
X = df.drop(columns = 'target')

# Hold out 20% of the rows; the forest is fitted on the remaining 80%.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2)
model = RandomForestClassifier()
model.fit(X_train, Y_train);

In [None]:
# Explain the fitted forest on the training rows and show the bar summary.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train, plot_type = "bar")

In [None]:
# SHAP values must be computed for the same rows that are plotted. The existing
# `shap_values` were computed for X_train, whose row count differs from X_test,
# so X_test is explained here before plotting.
shap_values_test = shap.TreeExplainer(model).shap_values(X_test)
shap.summary_plot(shap_values_test, X_test)

### [Dataset Size Analysis]

In [None]:
scores = []
tam = [i for i in range(45000,450001,45000)]
datasets = ['s4','s5','s6']#, 's4', 's5', 's6']

allscores = pd.DataFrame()
for d in tnrange(len(datasets)):

    df = pd.read_csv('../datasets/' + datasets[d] + '/vert_' + datasets[d] + '_analisys.csv')

    for t in tam:
        X = df.drop(columns = 'target')
        y = df['target']
        under = RandomUnderSampler(sampling_strategy={0:t, 1:t})
        X, y = under.fit_resample(X, y)

        X, XT, y, yT = train_test_split(X, y, test_size = 0.25)
        forest = RandomForestClassifier().fit(X, y)
        scores.append(forest.score(XT, yT))
    allscores = pd.concat([allscores, pd.DataFrame(scores)], sort = False, axis = 1)

In [None]:
sizes = [i for i in range(90000,900001,90000)]
plt.figure(figsize=(10, 7))
plt.plot(sizes, np.array(scoresS3)*100, linewidth=2, marker='o', color = 'tab:red')
plt.plot(sizes, np.array(scoresS4)*100, linewidth=2, marker='o', color = 'tab:orange')
plt.plot(sizes, np.array(scoresS5)*100, linewidth=2, marker='o', color = 'tab:olive')
plt.plot(sizes, np.array(scoresS6)*100, linewidth=2, marker='o', color = 'tab:green')
plt.title('MTT Vertical')
plt.ylabel('Acurácia (%)')
plt.xlabel('Quantidade de Instâncias')
plt.legend(['S3', 'S4', 'S5', 'S6'])
plt.grid();