# Échantillonnage direct de l'espace des motifs

### BRUNEAU Richard - VASLIN Pierre





In [54]:
import numpy as np
import scipy.special as sps
import math
import matplotlib
import pandas as pd
import random

class DataSet():
    def __init__(self,df:pd.DataFrame):
        self.df = df
        self.sizes = np.zeros(self.df.shape[0],dtype=int)
        for i in range(self.df.shape[0]):
            self.sizes[i] = self.df.iloc[i].count()
        # Sizes est pour connaitre la taille d'une ligne en o(1)
        # Pandas gère mal la variation du nombre de colonne dans une ligne dans un dataFrame, par concéquent 
        # il recalcul à chaque fois le nombre d'élément non null o(n)


In [55]:
# Le dataframe pour les tests
df = pd.read_table("https://bitbucket.org/anesbendimerad/sigibbssamplingcode/raw/6699a50508fe177ee0c00dcc7d8e5390ee53688a/ItemsetDatasets/chess.txt", sep=" ",header=None)
del df[37]
ds = DataSet(df)
ds.df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,1,3,5,7,9,11,13,15,17,19,...,56,58,60,62,64,66,68,70,72,74
1,1,3,5,7,9,12,13,15,17,19,...,56,58,60,62,64,66,68,70,72,74
2,1,3,5,7,9,12,13,16,17,19,...,56,58,60,62,64,66,68,70,72,74
3,1,3,5,7,9,11,13,15,17,20,...,56,58,60,62,64,66,68,70,72,74
4,1,3,5,7,9,11,13,15,17,19,...,56,58,60,62,64,66,68,70,72,74


## Question 1

In [56]:
def algoFrequences(ds:DataSet,nb_pattern)-> tuple:
  df = ds.df
  sizes = ds.sizes
  R = []
  IsInR = set()
  P: dict = dict()
  w = np.zeros(df.shape[0])
  totalW = 0
    
  # set les probas
  for i in range(len(w)):
    w[i] = math.pow(2,sizes[i])
    totalW += w[i]
  
  # on selectionne 
  while len(R) < nb_pattern:
    random_row = random.uniform(0,totalW)
    # On cherche la ligne
    row = 0
    v = 0
    for i in range(len(w)):
      if  v > random_row:
        row = i - 1
        break
      v += w[i]
    # On selctionne un motif 
    pattern = np.array(df.iloc[row][:sizes[row]])
    random_v = random.randint(1, len(pattern) - 1 )
    for i in range(len(pattern)- random_v):
      pattern = np.delete(pattern, random.randint(0, len(pattern) - 1 ))
    # On ajoute seulement les motifs non présent dans l'ensemble R
    IsInR.add(np.array2string(pattern))
    if len(IsInR) != len(R):
      R.append(pattern)
  return R

In [57]:
algoFrequences(ds,10)

[array([ 2,  3,  5,  7, 12, 14, 18, 20, 22, 25, 27, 29, 32, 34, 36, 40, 42,
        44, 46, 48, 50, 52, 54, 58, 60, 62, 64, 66, 68, 71, 73, 74],
       dtype=int64),
 array([ 2,  3,  5,  7, 10, 11, 13, 16, 18, 20, 22, 23, 25, 27, 29, 31, 36,
        39, 40, 44, 46, 48, 51, 52, 55, 56, 58, 60, 62, 66, 69, 71, 72, 74],
       dtype=int64),
 array([ 1,  3,  7,  9, 11, 13, 15, 17, 19, 23, 25, 27, 36, 39, 42, 44, 46,
        48, 51, 52, 56, 58, 60, 62, 64, 66, 69, 72, 74], dtype=int64),
 array([ 2,  3, 11, 13, 23, 25, 28, 29, 42, 44, 50, 56, 60, 64, 70, 72],
       dtype=int64),
 array([ 2,  3,  5,  7,  9, 12, 14, 16, 17, 19, 22, 24, 25, 27, 29, 32, 34,
        36, 38, 40, 42, 44, 46, 48, 52, 54, 56, 58, 60, 62, 64, 69, 70, 73,
        74], dtype=int64),
 array([ 3,  8, 36, 52, 55, 66, 68, 72, 75], dtype=int64),
 array([ 1,  3,  5,  7, 10, 12, 14, 16, 20, 21, 23, 28, 29, 31, 34, 39, 40,
        42, 44, 46, 51, 58, 60, 62, 64, 66, 68, 71, 72, 75], dtype=int64),
 array([50, 52], dtype=int64),

## Question 2

In [115]:
def algoArea(ds:DataSet,nb_pattern)-> tuple:
  df = ds.df
  sizes = ds.sizes
  R = []
  IsInR = set()
  
  # set les probas
  w = np.zeros(df[0].count(),dtype=np.ulonglong)
  totalW:np.ulonglong= 0
  for i in range(1, len(w)):
    w[i] = np.multiply(sizes[i], np.power(2, (sizes[i] - 1)))
    totalW += w[i]
  
  # on selectionne 
  while len(R) < nb_pattern:
    # On cherche la ligne
    random_row = random.uniform(0,totalW)
    v, row = 0,0
    for i in range(len(w)):
      if v > random_row:
        row = i - 1
        break
      v += w[i]
      
    # On set les probabilités de k (taille du motif)
    # On souhaite que le sous-ensemble est une taille calculée 
    # proportionnellement avec les tailles des datarecords (ligne) 
    ks = np.zeros(int(sizes[row]))
    totalK = 0
    for i in range(len(ks)):
        ks[i] = sps.binom(len(ks), i + 1)
        totalK += ks[i]
    #totalK = (len(ks) * (len(ks)+1))/2
    random_kp = random.uniform(0, totalK)
    # On cherche la ligne
    k, v = 0,0
    for i in range(len(ks)):
      if v > random_kp:
        k = i - 1
        break
      v += ks[i]
    
    pattern = np.array(df.iloc[row][:sizes[row]])
    for i in range(len(pattern)- k):
      pattern = np.delete(pattern, random.randint(0, len(pattern) - 1 ))
    
    # On ajoute seulement les motifs non présent dans l'ensemble R
    IsInR.add(np.array2string(pattern))
    if len(IsInR) != len(R):
      R.append(pattern)
  return R

In [75]:
algoArea(ds,10)

[array([ 3,  7, 19, 29, 31, 36, 40, 42, 46, 50, 60, 62, 66, 70],
       dtype=int64),
 array([ 1,  7, 11, 13, 23, 25, 38, 40, 46, 56, 62, 68, 72], dtype=int64),
 array([ 1,  5,  9, 13, 27, 29, 31, 38, 44, 48, 60, 64, 74], dtype=int64),
 array([ 1,  3,  5,  9, 13, 23, 27, 34, 40, 42, 44, 50, 52, 56, 58, 64, 72,
        74], dtype=int64),
 array([ 1,  3,  5,  7, 15, 19, 36, 40, 42, 44, 46, 52, 54, 58, 60, 70, 72,
        74], dtype=int64),
 array([ 5, 11, 15, 19, 21, 25, 29, 34, 36, 42, 46, 54, 58, 60, 62, 70],
       dtype=int64),
 array([ 1,  5, 13, 23, 27, 29, 40, 42, 44, 48, 54, 56, 64, 68, 70, 74],
       dtype=int64),
 array([ 5,  7, 11, 13, 17, 19, 25, 27, 34, 36, 38, 40, 42, 44, 48, 50, 52,
        54, 56, 58, 60, 66, 68, 72], dtype=int64),
 array([ 5,  9, 11, 17, 19, 23, 29, 31, 34, 36, 42, 46, 48, 52, 54, 58, 70,
        72], dtype=int64),
 array([ 1,  3, 13, 15, 19, 21, 23, 25, 29, 42, 48, 52, 54, 62, 64, 66, 74],
       dtype=int64)]

## Question 3

In [60]:
def frequences(ds, patterns):
  df = ds.df
  sizes = ds.sizes
  frequencesP = np.zeros(len(patterns),dtype=float)
  indexs = np.zeros(len(patterns),dtype=int)
  lenPat = len(patterns)
  for i in range(df.shape[0]):
    indexs[True] = 0
    for j in range(ds.sizes[i]):
      for ip in range(lenPat):
        if (len(patterns[ip])!= indexs[ip] and 
            df.iloc[i][j] == patterns[ip][indexs[ip]]):
          indexs[ip] += 1
          if len(patterns[ip]) == indexs[ip]:
            frequencesP[ip] += 1.
  for i in range(len(frequencesP)):
    frequencesP[i] = (frequencesP[i])/float(df.shape[0])
  return frequencesP

In [61]:
patterns = algoFrequences(ds,10)

In [62]:
frequences(ds,patterns)

array([0.00876095, 0.00031289, 0.00156446, 0.01595745, 0.00093867,
       0.20025031, 0.00312891, 0.00125156, 0.01877347, 0.01220275])

In [63]:
def aire(ds, patterns):
  df = ds.df
  aireP = np.zeros(len(patterns),dtype=int)
  indexs = np.zeros(len(patterns),dtype=int)
  lenPat = len(patterns)
  for i in range(df.shape[0]):
    indexs[True] = 0
    for j in range(ds.sizes[i]):
      for ip in range(lenPat):
        if (len(patterns[ip])!= indexs[ip] and 
            df.iloc[i][j] == patterns[ip][indexs[ip]]):
          indexs[ip] += 1
          if len(patterns[ip]) == indexs[ip]:
            aireP[ip] += 1
  for i in range(len(aireP)):
    aireP[i] = aireP[i]*len(patterns[i])
  return aireP

In [64]:
patterns = algoArea(ds,10)

In [65]:
aire(ds,patterns)

array([ 672, 2318,  765,  480, 3056,  570,  323, 3120, 1660, 2680])

## Question 4

Nous allons maintenant tester nos algorithmes avec des jeux de données suggérés dans le sujet.

In [66]:
dataset_1 = pd.read_table("https://bitbucket.org/anesbendimerad/sigibbssamplingcode/raw/6699a50508fe177ee0c00dcc7d8e5390ee53688a/ItemsetDatasets/chess.txt", sep=" ",header=None)
del dataset_1[37]
ds1 = DataSet(dataset_1)
ds1.df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,1,3,5,7,9,11,13,15,17,19,...,56,58,60,62,64,66,68,70,72,74
1,1,3,5,7,9,12,13,15,17,19,...,56,58,60,62,64,66,68,70,72,74
2,1,3,5,7,9,12,13,16,17,19,...,56,58,60,62,64,66,68,70,72,74
3,1,3,5,7,9,11,13,15,17,20,...,56,58,60,62,64,66,68,70,72,74
4,1,3,5,7,9,11,13,15,17,19,...,56,58,60,62,64,66,68,70,72,74


In [67]:
frequences(ds1,algoFrequences(ds1,10))

array([1.25156446e-02, 3.12891114e-04, 5.31914894e-03, 2.47183980e-02,
       8.12578223e-01, 8.29161452e-02, 3.12891114e-04, 6.25782228e-04,
       3.53566959e-02, 3.44180225e-03])

In [68]:
aire(ds1,algoArea(ds1,10))

array([ 442,  440,  722,  665,  266, 1065,  816, 1598, 3689, 2496])

In [69]:
dataset_2 = pd.read_fwf("https://www.philippe-fournier-viger.com/spmf/datasets/LEVIATHAN.txt", sep=" ",header=None,)
dataset_2 = dataset_2[0].str.split(' ', expand=True)
ds2 = DataSet(dataset_2)
ds2.df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,176,177,178,179,180,181,182,183,184,185
0,36,-1,37,-1,38,-1,17,-1,39,-1,...,,,,,,,,,,
1,36,-1,64,-1,17,-1,8,-1,65,-1,...,,,,,,,,,,
2,78,-1,79,-1,80,-1,81,-1,82,-1,...,,,,,,,,,,
3,155,-1,8,-1,156,-1,14,-1,157,-1,...,,,,,,,,,,
4,1,-1,1,-1,1,-1,172,-1,173,-1,...,,,,,,,,,,


In [70]:
frequences(ds2, algoFrequences(ds2, 10))

array([0.00017141, 0.00017141, 0.00017141, 0.00017141, 0.00017141,
       0.00017141, 0.00017141, 0.00017141, 0.00017141, 0.00017141])

In [118]:
aire(ds2, algoArea(ds2, 10))

array([16, 14,  9, 15, 16, 13, 15, 12, 17, 12])

In [119]:
dataset_3 = pd.read_fwf("https://www.philippe-fournier-viger.com/spmf/datasets/FIFA.txt", sep=" ",header=None,)
dataset_3 = dataset_3[0].str.split(' ', expand=True)
ds3 = DataSet(dataset_3)
ds3.df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,3,-1,28,-1,58,-1,61,-1,64,-1,...,,,,,,,,,,
1,4,-1,7,-1,23,-1,255,-1,3,-1,...,,,,,,,,,,
2,8,-1,10,-1,13,-1,14,-1,21,-1,...,,,,,,,,,,
3,11,-1,25,-1,40,-1,54,-1,55,-1,...,,,,,,,,,,
4,12,-1,39,-1,63,-1,72,-1,3,-1,...,,,,,,,,,,


In [120]:
frequences(ds3, algoFrequences(ds3, 10))

array([4.88997555e-05, 4.88997555e-05, 4.88997555e-05, 3.86308068e-03,
       4.88997555e-05, 4.88997555e-05, 4.88997555e-05, 4.88997555e-05,
       4.88997555e-05, 4.88997555e-05])

In [121]:
aire(ds3, algoArea(ds3,10))

array([  15,  380,   18,   16,   30,   12, 3090,   13,   52,   22])

In [122]:
dataset_4 = pd.read_fwf("https://www.philippe-fournier-viger.com/spmf/datasets/BIBLE.txt", sep=" ",header=None,)
dataset_4 = dataset_4[0].str.split(' ', expand=True)
ds4 = DataSet(dataset_4)
ds4.df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,1,-1,2,-1,3,-1,4,-1,5,-1,...,,,,,,,,,,
1,40,-1,41,-1,42,-1,43,-1,44,-1,...,,,,,,,,,,
2,31,-1,50,-1,51,-1,15,-1,52,-1,...,,,,,,,,,,
3,69,-1,62,-1,70,-1,15,-1,71,-1,...,,,,,,,,,,
4,77,-1,78,-1,51,-1,79,-1,18,-1,...,,,,,,,,,,


In [123]:
frequences(ds4, algoFrequences(ds4, 10))

array([2.74959443e-05, 2.74959443e-05, 2.74959443e-05, 2.74959443e-05,
       2.74959443e-05, 2.74959443e-05, 2.74959443e-05, 2.74959443e-05,
       2.74959443e-05, 2.74959443e-05])

In [124]:
aire(ds4, algoArea(ds4,10))

array([18, 15, 17, 12, 14, 14, 11, 16, 17,  9])

## Question 5

### Etude emprique

Nous allons étudier les dataset ci-dessus.

### Dataset n°1

#### Algorithme de fréquence


Dans un premier temps, nous allons executer l'algorithme de la fréquence et chercher à obtenir 5 échantillons.

In [125]:
echantillons = algoFrequences(ds1, 5)
print(echantillons)

[array([ 2,  3,  5,  7,  9, 12, 18, 20, 22, 24, 25, 27, 29, 31, 34, 36, 39,
       40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 66, 68, 71, 72, 74],
      dtype=int64), array([ 2,  5, 11, 21, 25, 27, 34, 39, 47, 48, 50, 54, 56, 58, 62, 73, 74],
      dtype=int64), array([11, 16, 18, 21, 25, 33, 34, 42, 46, 48, 60, 68, 71], dtype=int64), array([ 2,  3,  9, 15, 23, 25, 28, 42, 44, 60, 62], dtype=int64), array([15, 17, 29, 31, 48, 52, 56, 60], dtype=int64)]


Nous obtenons donc nos 5 motifs que nous allons analyser avec la fonction implémentées à la question 3

In [126]:
frequences(ds1, algoFrequences(ds1, 5))

array([0.00250313, 0.00250313, 0.00187735, 0.0034418 , 0.00250313])

## Question 6

On a constaté que lorsque l'on avait une ligne beaucoup plus grande que les autres, les algorithmes de fréquence et d'aire favorisent la ligne la plus grande pour créer des motifs. On souhaite trouver des motifs fréquents, donc on souhaite éviter des motifs qui s'applique seulement dans des grandes lignes. Nous avons trouver trois solutions:

*   Supprimer les grandes lignes du dataset
*   Réduire la taille des grandes lignes
*   Modifier le poids accorder à la ligne la plus grande


In [127]:
dsetGL = pd.read_fwf("https://www.philippe-fournier-viger.com/spmf/datasets/FIFA.txt", sep=" ",header=None,)
df_gl= dsetGL[0].str.split(' ', expand=True)
ds_gl = DataSet(df_gl)
ds_gl.df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,3,-1,28,-1,58,-1,61,-1,64,-1,...,,,,,,,,,,
1,4,-1,7,-1,23,-1,255,-1,3,-1,...,,,,,,,,,,
2,8,-1,10,-1,13,-1,14,-1,21,-1,...,,,,,,,,,,
3,11,-1,25,-1,40,-1,54,-1,55,-1,...,,,,,,,,,,
4,12,-1,39,-1,63,-1,72,-1,3,-1,...,,,,,,,,,,


## Cas où le dataset est chargé dans un df
On va calculer la moyenne du nombre d'item par ligne et en fonction de cela on suprimera les colonnes supérieurs à la taille moyenne du nombre d'item/ligne
On ne regarde pas toute les lignes on en regarde que 40% (on pourras changer)

In [128]:
ds_gl.df.shape

(20450, 175)

In [129]:
def resizeDataSet(ds,percentage):
  df = ds.df
  shape = ds.df.shape
  sum = 0
  #for index, row in df.iterrows():
  for _ in range(int(shape[0]*percentage)):
    #if random.uniform(0.,100.) < 5.:
    sum += shape[1] - ds.sizes[random.randint(0,shape[0]-1)]
  mean = sum/(shape[0]*percentage)
  for i in range(shape[1]-1,int(mean),-1):
    del df[i]
  for i in range(shape[0]):
    if ds.sizes[i] > mean:
        ds.sizes[i] = math.ceil(mean)
  return df

In [130]:
resizeDataSet(ds_gl,0.4)
ds_gl.df.shape

(20450, 104)

## Troisième solution
Text  a mettre richard

Dans cette troisième approche, on va faire une moyenne mobile de la valeur de poid

[Info pour blabla](https://www.educatim.fr/tq/co/Module_TQ_web/co/moyenne_glissante.html)

In [131]:
def algoFrequencesSolutionA(ds,nb_pattern)-> tuple:
  df = ds.df
  sizes = ds.sizes
  R = []
  IsInR = set()
  P: dict = dict()
  w = np.zeros(df.shape[0])
  totalW = 0
    
  # set les probas
  for i in range(len(w)):
    w[i] = math.pow(2,sizes[i])
    if i >= 2:
      w[i] = (w[i] + w[i-1] + w[i-2])/ 3
    totalW += w[i]
  
  # on selectionne 
  while len(R) < nb_pattern:
    random_row = random.uniform(0,totalW)
    # On cherche la ligne
    row = 0
    v = 0
    for i in range(len(w)):
      if  v > random_row:
        row = i - 1
        break
      v += w[i]
    # On selctionne un motif 
    pattern = np.array(df.iloc[row][:sizes[row]])
    random_v = random.randint(1, len(pattern) - 1 )
    for i in range(len(pattern)- random_v):
      pattern = np.delete(pattern, random.randint(0, len(pattern) - 1 ))
    # On ajoute seulement les motifs non présent dans l'ensemble R
    IsInR.add(np.array2string(pattern))
    if len(IsInR) != len(R):
      R.append(pattern)
  return R

In [132]:
patterns = algoFrequencesSolutionA(ds_gl,10)
frequences(ds,patterns)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])