<a href="https://colab.research.google.com/github/louispaulet/pattern_mining_course/blob/main/PMSNA_HW1_oneFunction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This version is split into three parts to follow the assignment specification more closely:

*   imports section with all the needed libraries
*   functions section with all the function declarations
*   the main section, which sets the constants and calls the `minepi()` function

In [None]:
#imports
import pandas as pd
import re
from itertools import permutations
import statistics


In [None]:
#functions
def joinSeqs(Xseq, Yseq, maxSpan, minGap):
  """Forward-join two lists of occurrence times into (x, y) pairs.

  For each x in Xseq, Yseq is scanned in order and every y with
  y < x + minGap is skipped. The first y that is not skipped decides the
  outcome for this x: the pair (x, y) is kept when y - x <= maxSpan,
  otherwise x produces no pair. Either way the scan for this x stops there.

  When consecutive pairs end on the same y, only the last one is kept, so
  each run of pairs sharing an upper bound collapses to a single pair.

  NOTE: stopping at the first non-skipped y is only meaningful if Yseq is in
  ascending order. This is assumed, not checked.

  Args:
    Xseq: occurrence times of the first event type.
    Yseq: occurrence times of the second event type.
    maxSpan: largest allowed value of y - x for a kept pair.
    minGap: smallest allowed value of y - x for a kept pair.

  Returns:
    A list of (x, y) tuples, in Xseq order.
  """
  minimal_joinSeq = []

  for x_time in Xseq:
    for y_time in Yseq:
      if x_time + minGap > y_time:
        # y is too early to follow x: keep scanning
        continue

      if y_time - x_time <= maxSpan:
        pair = (x_time, y_time)
        # Compare against the last kept pair instead of a sentinel value, so
        # that no legitimate time (e.g. -1) can be mistaken for "no pair yet".
        if minimal_joinSeq and minimal_joinSeq[-1][1] == y_time:
          # same upper bound as the previous pair: squash it with this one
          minimal_joinSeq[-1] = pair
        else:
          minimal_joinSeq.append(pair)

      # Whether or not a pair was kept, we are done with this x.
      break

  return minimal_joinSeq

def length_of_seq(seq):
  """Return the sum of (upper bound - lower bound) over all pairs in seq.

  Each element of seq is indexed as pair[0] (lower bound) and pair[1]
  (upper bound). An empty seq gives 0.
  """
  return sum(pair[1] - pair[0] for pair in seq)

def frequency(head_ab, L):
  """Return head_ab / L, or None when L is 0 (division undefined)."""
  return None if L == 0 else head_ab / L

def confidence(head_ab, na):
  """Return head_ab / na, or None when na is 0 (division undefined)."""
  return None if na == 0 else head_ab / na

def recall(head_ab, nb):
  """Return head_ab / nb, or None when nb is 0 (division undefined)."""
  return None if nb == 0 else head_ab / nb

def minepi(minFrequency, maxSpan, minGap, data_path='YahooFinance.data'):
  """Mine ordered pairs of frequent event types and return the 10 best ones.

  The tab-separated file at data_path is read with pandas; its "Cotation"
  column holds the event-type name and its "ID_Time" column the occurrence
  time. Event types whose name contains "=0" are discarded, as are event
  types with fewer than minFrequency occurrences. Every ordered pair (X, Y)
  of the remaining event types is then joined with joinSeqs() and scored.

  Args:
    minFrequency: minimum number of occurrences for an event type to be kept.
    maxSpan: passed to joinSeqs() as the largest allowed y - x.
    minGap: passed to joinSeqs() as the smallest allowed y - x.
    data_path: path of the tab-separated input file.

  Returns:
    A DataFrame holding the 10 rows with the largest "hMeanConfRec", with
    columns:
      X, Y          event-type names of the ordered pair
      Xn, Yn        number of occurrences of X and of Y
      XYhead        number of pairs in XYjoinSeq
      frequency     frequency(XYhead, L); missing when L is 0
      confidence    confidence(XYhead, Xn)
      recall        recall(XYhead, Yn)
      Xseq, Yseq    occurrence times of X and of Y
      XYjoinSeq     result of joinSeqs(Xseq, Yseq, maxSpan, minGap)
      L             length_of_seq(XYjoinSeq)
      hMeanConfRec  harmonic mean of confidence and recall
  """
  # Only the "Cotation" and "ID_Time" columns are used below.
  df = pd.read_csv(data_path, sep='\t')

  # Matches event names that contain "=0" (the "no variation" events).
  # Raw string: "\=" is not a valid escape in an ordinary string literal.
  zero_variation = re.compile(r"^.*=0")

  # Occurrence times of every retained event type, in order of first
  # appearance in the file (dicts preserve insertion order).
  occurrences = {}
  for event_name in df.Cotation.unique().tolist():
    if zero_variation.search(event_name):
      continue
    event_times = df[df.Cotation == event_name].ID_Time.tolist()
    # We are only interested in event types seen at least minFrequency times.
    if len(event_times) >= minFrequency:
      occurrences[event_name] = event_times

  columns = ["X", "Y", "Xn", "Yn", "XYhead", "frequency", "confidence",
             "recall", "Xseq", "Yseq", "XYjoinSeq", "L", "hMeanConfRec"]

  # One row per ordered pair of distinct event types. Rows are built in plain
  # Python and turned into a DataFrame once, which avoids broadcasting
  # list-valued cells through .loc assignments.
  rows = []
  for x_name, y_name in permutations(occurrences, 2):
    x_seq = occurrences[x_name]
    y_seq = occurrences[y_name]

    join_seq = joinSeqs(x_seq, y_seq, maxSpan, minGap)
    head = len(join_seq)
    span = length_of_seq(join_seq)

    pair_confidence = confidence(head, len(x_seq))
    pair_recall = recall(head, len(y_seq))

    rows.append({
      "X": x_name,
      "Y": y_name,
      "Xn": len(x_seq),
      "Yn": len(y_seq),
      "XYhead": head,
      "frequency": frequency(head, span),
      "confidence": pair_confidence,
      "recall": pair_recall,
      "Xseq": x_seq,
      "Yseq": y_seq,
      "XYjoinSeq": join_seq,
      "L": span,
      # The ranking criterion is the tradeoff between confidence and recall.
      # Rows with a missing frequency are kept: frequency plays no part in
      # the ranking.
      "hMeanConfRec": statistics.harmonic_mean([pair_confidence, pair_recall]),
    })

  df_event_types = pd.DataFrame(rows, columns=columns)
  # Force a numeric dtype so nlargest() also works when there are no rows.
  df_event_types["hMeanConfRec"] = df_event_types["hMeanConfRec"].astype(float)

  return df_event_types.nlargest(10, 'hMeanConfRec')



The answer to question 2 is the first row of the output dataframe. For efficient trading, the other rules in the rows below it should also be considered.

In [None]:
#parameters and main call
minFrequency = 50  # minimum number of occurrences for an event type to be kept
maxSpan = 2        # largest allowed time distance between X and the Y that follows it
minGap = 1         # smallest allowed time distance between X and the Y that follows it

# Kept as a bare expression so its value remains the result of the cell.
minepi(minFrequency=minFrequency, maxSpan=maxSpan, minGap=minGap)

Unnamed: 0,X,Y,Xn,Yn,XYhead,frequency,confidence,recall,Xseq,Yseq,XYjoinSeq,L,hMeanConfRec
193,Boeing=-1,Boeing=1,269,300,119,0.798658,0.442379,0.396667,"[2, 7, 11, 12, 17, 18, 24, 34, 39, 53, 54, 62,...","[6, 8, 14, 23, 25, 29, 35, 37, 38, 46, 56, 60,...","[(7, 8), (12, 14), (24, 25), (34, 35), (54, 56...",149,0.418278
759,General_Motors=-1,Exxon_Mobil=1,266,263,108,0.755245,0.406015,0.410646,"[4, 6, 10, 12, 14, 15, 17, 26, 27, 30, 32, 33,...","[8, 13, 14, 16, 19, 20, 23, 27, 32, 41, 45, 57...","[(6, 8), (12, 13), (15, 16), (17, 19), (26, 27...",143,0.408318
1226,Microsoft=-1,Boeing=1,230,300,108,0.715232,0.469565,0.36,"[8, 9, 13, 16, 19, 22, 32, 33, 35, 45, 53, 61,...","[6, 8, 14, 23, 25, 29, 35, 37, 38, 46, 56, 60,...","[(13, 14), (22, 23), (33, 35), (35, 37), (45, ...",151,0.407547
1221,Microsoft=-1,General_Motors=-1,230,266,100,0.70922,0.434783,0.37594,"[8, 9, 13, 16, 19, 22, 32, 33, 35, 45, 53, 61,...","[4, 6, 10, 12, 14, 15, 17, 26, 27, 30, 32, 33,...","[(9, 10), (13, 14), (16, 17), (32, 33), (33, 3...",141,0.403226
765,General_Motors=-1,General_Motors=1,266,244,102,0.744526,0.383459,0.418033,"[4, 6, 10, 12, 14, 15, 17, 26, 27, 30, 32, 33,...","[11, 16, 19, 21, 23, 24, 28, 31, 36, 40, 42, 4...","[(10, 11), (15, 16), (17, 19), (27, 28), (30, ...",137,0.4
1362,AT&T=-1,Exxon_Mobil=1,227,263,97,0.697842,0.427313,0.368821,"[8, 9, 12, 13, 17, 22, 25, 34, 42, 46, 51, 54,...","[8, 13, 14, 16, 19, 20, 23, 27, 32, 41, 45, 57...","[(12, 13), (13, 14), (17, 19), (22, 23), (25, ...",139,0.395918
1455,Verizon=-1,American_Express=-1,250,230,95,0.766129,0.38,0.413043,"[9, 13, 16, 25, 27, 33, 39, 40, 42, 51, 54, 62...","[12, 20, 24, 25, 29, 33, 40, 42, 59, 71, 76, 8...","[(27, 29), (39, 40), (40, 42), (69, 71), (84, ...",124,0.395833
953,Boeing=1,Exxon_Mobil=-1,300,228,103,0.725352,0.343333,0.451754,"[6, 8, 14, 23, 25, 29, 35, 37, 38, 46, 56, 60,...","[2, 5, 6, 9, 10, 11, 17, 18, 29, 30, 31, 38, 3...","[(8, 9), (29, 30), (37, 38), (38, 39), (46, 47...",142,0.390152
1450,Verizon=-1,AT&T=-1,250,227,93,0.699248,0.372,0.409692,"[9, 13, 16, 25, 27, 33, 39, 40, 42, 51, 54, 62...","[8, 9, 12, 13, 17, 22, 25, 34, 42, 46, 51, 54,...","[(16, 17), (33, 34), (40, 42), (62, 63), (94, ...",133,0.389937
446,Intel=-1,General_Motors=-1,269,266,104,0.753623,0.386617,0.390977,"[2, 9, 10, 23, 25, 34, 35, 37, 38, 39, 49, 50,...","[4, 6, 10, 12, 14, 15, 17, 26, 27, 30, 32, 33,...","[(2, 4), (9, 10), (10, 12), (25, 26), (37, 38)...",138,0.388785
