# Prueba de Entrenamiento

## Paquetes

In [1]:
import numpy as np
import re
import pandas as pd

import os
import sys
import inspect

parentdir = r'F:\Clase\Universidad\04 - Cuarto\TFG\DQM-DC NMF'
sys.path.append(parentdir)


from utils.df_utils import *
from utils.plot_utils import *

pd.set_option('colheader_justify', 'center')

import functools

import matplotlib.pyplot as plt
# %matplotlib qt
plt.rcParams['figure.dpi'] = 300
plt.rcParams["figure.figsize"] = (3,2)
plt.rcParams.update({'font.size': 6})

import datetime
import logging
timestamp = datetime.datetime.utcnow().strftime('%Y%m%d')
filename=f'{parentdir}/tmp/read_data_{timestamp}.log'
formatter = logging.Formatter('[%(asctime)s] %(name)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s')

file_handler = logging.FileHandler(filename=filename, mode='a+')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setLevel(logging.INFO)

# The handlers have to be at a root level since they are the final output
logging.basicConfig(
    level=logging.DEBUG, 
    format='[{%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
    handlers=[
        file_handler,
        stream_handler
    ]
)

logging.info(f'\n\n\nRun at {datetime.datetime.utcnow().strftime("%H-%M-%S")}')

[{199696191.py:49} INFO - 


Run at 10-55-06


## Código

### Obtención de los datos

In [2]:
names = np.array([f.split('.')[0] for f in os.listdir(f'{parentdir}/data/csv') if os.path.isfile(os.path.join(f'{parentdir}/data/csv', f))])

cols =['chi2','eta','phi','pt']
inds = ['A','B','C','D']

names_df  =pd.DataFrame(names.reshape(4,4).T, index=inds,columns=cols)
df_pprint(names_df)


Unnamed: 0,chi2,eta,phi,pt
A,GlbMuon_Glb_chi2OverDf_MuonCert_Labeled_UL2018A_Reduced,GlbMuon_Glb_eta_MuonCert_Labeled_UL2018A_Reduced,GlbMuon_Glb_phi_MuonCert_Labeled_UL2018A_Reduced,GlbMuon_Glb_pt_MuonCert_Labeled_UL2018A_Reduced
B,GlbMuon_Glb_chi2OverDf_MuonCert_Labeled_UL2018B_Reduced,GlbMuon_Glb_eta_MuonCert_Labeled_UL2018B_Reduced,GlbMuon_Glb_phi_MuonCert_Labeled_UL2018B_Reduced,GlbMuon_Glb_pt_MuonCert_Labeled_UL2018B_Reduced
C,GlbMuon_Glb_chi2OverDf_MuonCert_Labeled_UL2018C_Reduced,GlbMuon_Glb_eta_MuonCert_Labeled_UL2018C_Reduced,GlbMuon_Glb_phi_MuonCert_Labeled_UL2018C_Reduced,GlbMuon_Glb_pt_MuonCert_Labeled_UL2018C_Reduced


In [3]:
observable = 'eta'
file = pd.read_csv(f'{parentdir}/data/csv/'+names_df.at['A',observable]+'.csv')

file['histo'] = file['histo'].apply(str2arr)

df_pprint(file)

Unnamed: 0,fromrun,fromlumi,labels,hname,histo,entries,Xbins,Xmin,Xmax
0,315489,707,1,GlbMuon_Glb_eta,[ 0. 0. 0. 0. 2. 0. 3. 2. 3. 10. 33. 35. 44. 36. 47. 43. 35. 45.  47. 38. 38. 39. 34. 40. 35. 39. 37. 38. 47. 36. 41. 26. 38. 33. 34. 32.  26. 29. 31. 34. 44. 39. 42. 38. 29. 23. 29. 29. 31. 31. 41. 29. 29. 27.  28. 30. 45. 40. 38. 32. 37. 35. 30. 33. 35. 37. 32. 36. 39. 28. 32. 33.  42. 58. 38. 47. 28. 57. 34. 40. 47. 48. 56. 47. 47. 54. 37. 34. 49. 44.  12. 10. 3. 1. 0. 2. 0. 0. 0. 0.],3046.0,100,-3.0,3.0
1,316060,547,1,GlbMuon_Glb_eta,[ 0. 0. 0. 1. 1. 2. 3. 7. 6. 18. 62. 66. 56. 55. 64. 55. 59. 46.  65. 45. 64. 50. 66. 53. 47. 66. 59. 60. 51. 52. 46. 49. 44. 59. 49. 56.  53. 48. 65. 52. 62. 54. 48. 48. 61. 42. 43. 57. 56. 51. 59. 46. 55. 48.  38. 39. 53. 59. 45. 41. 40. 52. 55. 44. 47. 48. 54. 49. 54. 63. 40. 60.  63. 62. 53. 62. 59. 57. 60. 46. 48. 74. 67. 78. 76. 56. 56. 70. 48. 40.  18. 9. 6. 4. 0. 0. 0. 0. 0. 0.],4423.0,100,-3.0,3.0
2,316060,548,1,GlbMuon_Glb_eta,[ 0. 0. 0. 1. 1. 2. 4. 2. 6. 22. 44. 63. 66. 48. 69. 67. 42. 50.  39. 59. 57. 52. 66. 54. 60. 50. 54. 61. 61. 64. 55. 51. 56. 55. 51. 34.  61. 52. 55. 46. 64. 52. 48. 60. 55. 47. 40. 54. 80. 60. 53. 73. 52. 49.  50. 54. 62. 62. 55. 48. 67. 58. 65. 52. 59. 55. 47. 52. 47. 54. 57. 54.  52. 54. 64. 57. 65. 68. 52. 67. 50. 76. 73. 73. 63. 67. 55. 75. 62. 50.  22. 14. 6. 6. 4. 2. 0. 0. 0. 0.],4642.0,100,-3.0,3.0


### Entrenamiento

Importamos los paquetes necesarios para entrenar la red

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

Tratamos los datos y nos aseguramos de que se hayan separado bien

In [5]:
file = file[file.entries != 0]

all = np.stack(file['histo'].to_numpy())/np.c_[file['entries'].to_numpy()]
# nonzero_id = np.where(entries!=0)[0]

# all = all[nonzero_id]
# entries = entries[nonzero_id]




# print(all[:3])
goodness = np.stack(file['labels'].values)
# print(goodness)

good = all[goodness==True]
bad = all[goodness==False]

ratio_gb = len(good)/len(bad); print(ratio_gb) #aprox 80 buenas por cada mala
print(f'{len(bad)}+{len(good)} == {len(bad)+len(good)} == {len(all)}')

191.32958801498128
267+51085 == 51352 == 51352


Damos forma a los datos

In [6]:
span = np.linspace(file['Xmin'].values[0],file['Xmax'].values[0],num=file['Xbins'].values[0])
V_matrix = pd.DataFrame(all)

df_pprint(V_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.000657,0.0,0.000985,0.000657,0.000985,0.003283,0.010834,0.01149,0.014445,0.011819,0.01543,0.014117,0.01149,0.014773,0.01543,0.012475,0.012475,0.012804,0.011162,0.013132,0.01149,0.012804,0.012147,0.012475,0.01543,0.011819,0.01346,0.008536,0.012475,0.010834,0.011162,0.010506,0.008536,0.009521,0.010177,0.011162,0.014445,0.012804,0.013789,0.012475,0.009521,0.007551,0.009521,0.009521,0.010177,0.010177,0.01346,0.009521,0.009521,0.008864,0.009192,0.009849,0.014773,0.013132,0.012475,0.010506,0.012147,0.01149,0.009849,0.010834,0.01149,0.012147,0.010506,0.011819,0.012804,0.009192,0.010506,0.010834,0.013789,0.019041,0.012475,0.01543,0.009192,0.018713,0.011162,0.013132,0.01543,0.015758,0.018385,0.01543,0.01543,0.017728,0.012147,0.011162,0.016087,0.014445,0.00394,0.003283,0.000985,0.000328,0.0,0.000657,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000226,0.000226,0.000452,0.000678,0.001583,0.001357,0.00407,0.014018,0.014922,0.012661,0.012435,0.01447,0.012435,0.013339,0.0104,0.014696,0.010174,0.01447,0.011305,0.014922,0.011983,0.010626,0.014922,0.013339,0.013565,0.011531,0.011757,0.0104,0.011078,0.009948,0.013339,0.011078,0.012661,0.011983,0.010852,0.014696,0.011757,0.014018,0.012209,0.010852,0.010852,0.013792,0.009496,0.009722,0.012887,0.012661,0.011531,0.013339,0.0104,0.012435,0.010852,0.008591,0.008818,0.011983,0.013339,0.010174,0.00927,0.009044,0.011757,0.012435,0.009948,0.010626,0.010852,0.012209,0.011078,0.012209,0.014244,0.009044,0.013565,0.014244,0.014018,0.011983,0.014018,0.013339,0.012887,0.013565,0.0104,0.010852,0.016731,0.015148,0.017635,0.017183,0.012661,0.012661,0.015826,0.010852,0.009044,0.00407,0.002035,0.001357,0.000904,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000215,0.000215,0.000431,0.000862,0.000431,0.001293,0.004739,0.009479,0.013572,0.014218,0.01034,0.014864,0.014433,0.009048,0.010771,0.008402,0.01271,0.012279,0.011202,0.014218,0.011633,0.012925,0.010771,0.011633,0.013141,0.013141,0.013787,0.011848,0.010987,0.012064,0.011848,0.010987,0.007324,0.013141,0.011202,0.011848,0.00991,0.013787,0.011202,0.01034,0.012925,0.011848,0.010125,0.008617,0.011633,0.017234,0.012925,0.011417,0.015726,0.011202,0.010556,0.010771,0.011633,0.013356,0.013356,0.011848,0.01034,0.014433,0.012495,0.014003,0.011202,0.01271,0.011848,0.010125,0.011202,0.010125,0.011633,0.012279,0.011633,0.011202,0.011633,0.013787,0.012279,0.014003,0.014649,0.011202,0.014433,0.010771,0.016372,0.015726,0.015726,0.013572,0.014433,0.011848,0.016157,0.013356,0.010771,0.004739,0.003016,0.001293,0.001293,0.000862,0.000431,0.0,0.0,0.0,0.0


Elegimos el número de vectores

In [7]:
N = 6

Entrenamos el modelo

In [8]:
nmf_model = NMF(N,max_iter=10000)
# Learn an NMF model for given Document Term Matrix 'V' 
# Extract the document-topic matrix 'W'
W = nmf_model.fit_transform(V_matrix)
# Extract top words from the topic-term matrix 'H'



Representamos las componentes y las guardamos.

In [12]:
components = nmf_model.components_
# print(components.dtype)
# print(span.dtype)

titles = [f'Component{str(i).zfill(2)}' for i in range(N)]

for i in range(components.shape[0]):
    histo = components[i]
    fig, ax = plt.subplots(1,1)
    fig.suptitle(titles[i])
    # _ = ax.bar(span,histo,align='edge')
    _ = ax.step(span,histo,where='mid',linewidth=0.3)
    ax.set_xlim(0,10)
    try:
        plt.savefig(f'../graphs/test_nmf_{observable}/N={N}/{titles[i].lower()}.jpeg')
    except FileNotFoundError:
        os.makedirs(f'../graphs/test_nmf_{observable}/N={N}/')
        plt.savefig(f'../graphs/test_nmf_{observable}/N={N}/{titles[i].lower()}.jpeg')
    
    print(f'Imagen guardada en ../graphs/test_nmf_{observable}/N={N}/{titles[i].lower()}.jpeg')
    plt.close()

# df_pprint(components)
# plotnsave_hist()


Imagen guardada en ../graphs/test_nmf_eta/N=6/component00.jpeg
Imagen guardada en ../graphs/test_nmf_eta/N=6/component01.jpeg
Imagen guardada en ../graphs/test_nmf_eta/N=6/component02.jpeg
Imagen guardada en ../graphs/test_nmf_eta/N=6/component03.jpeg
Imagen guardada en ../graphs/test_nmf_eta/N=6/component04.jpeg
Imagen guardada en ../graphs/test_nmf_eta/N=6/component05.jpeg


In [11]:
plt.plot(W[good[0]],'-r')
plt.plot(W[bad[0]],'-b')

IndexError: arrays used as indices must be of integer (or boolean) type