<a href="https://colab.research.google.com/github/louispaulet/pattern_mining_course/blob/main/PMSNA_HW1_oneFunction_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This version is split into three parts to follow the assignment specification more closely:

*   imports section with all the needed libraries
*   functions section with all the function declarations
*   main section that sets the constants and calls the minepi() function

In [None]:
#imports
import pandas as pd
import re
from itertools import permutations
import statistics


In [None]:
#functions
def joinSeqs(Xseq, Yseq, maxSpan, minGap):
  """Forward-join two occurrence lists into a list of (x, y) pairs.

  A pair (x, y) is kept when y is at least x + minGap and y - x is at most
  maxSpan. For a given x, scanning of Yseq stops at the first y that is past
  the gap but exceeds maxSpan, so Yseq is expected to be in ascending order.

  Args:
    Xseq: occurrence times of the first event.
    Yseq: occurrence times of the second event.
    maxSpan: largest allowed value of y - x.
    minGap: smallest allowed value of y - x.

  Returns:
    The list of (x, y) tuples, ordered by x and then by position in Yseq.
  """
  pairs = []
  for x_time in Xseq:
    for y_time in Yseq:
      # y comes too early for this x: keep looking further in Yseq
      if x_time + minGap > y_time:
        continue
      if y_time - x_time <= maxSpan:
        pairs.append((x_time, y_time))
      else:
        # y is out of reach for this x, move on to the next x
        break
  return pairs

#length of a sequence is first time to last time observed
def length_of_seq(seq):
  """Return the time elapsed between the first and the last joined pair.

  Args:
    seq: list of (x_time, y_time) tuples, as produced by joinSeqs().

  Returns:
    The y_time of the last pair minus the x_time of the first pair, or 0
    when seq is empty (an empty join has no observation at all).
  """
  # An empty join previously raised IndexError; 0 lets frequency() report
  # "no frequency" through its own L == 0 check.
  if not seq:
    return 0
  return seq[-1][1] - seq[0][0]

def frequency(head_ab, L):
  """Return head_ab / L, or None when L equals 0 (division undefined)."""
  return None if L == 0 else head_ab / L

def confidence(head_ab, na):
  """Return head_ab / na, or None when na equals 0 (division undefined)."""
  return None if na == 0 else head_ab / na

def recall(head_ab, nb):
  """Return head_ab / nb, or None when nb equals 0 (division undefined)."""
  return None if nb == 0 else head_ab / nb

def minepi(minFrequency, maxSpan, minGap, dataPath='YahooFinance.data'):
  """Mine the ten best two-event episodes X -> Y from a tab-separated event log.

  The input file must provide the columns "ID_Sequence" (dropped), "ID_Time"
  and "Cotation" (the event name). Events whose name contains "=0" are
  ignored, as are events observed fewer than minFrequency times. Every
  ordered pair (X, Y) of the remaining events is joined with joinSeqs() and
  scored.

  Args:
    minFrequency: minimum number of occurrences for an event to be kept.
    maxSpan: largest time difference allowed between X and the following Y.
    minGap: smallest time difference required between X and the following Y.
    dataPath: path of the input file; defaults to 'YahooFinance.data'.

  Returns:
    A DataFrame with the columns X, Y, Xn, Yn, XYhead, frequency, confidence,
    recall, Xseq, Yseq, XYjoinSeq, L and hMeanConfRec, holding at most the ten
    pairs with the largest harmonic mean of confidence and recall. The frame
    is empty when fewer than two events are frequent.
  """
  columns = ["X", "Y", "Xn", "Yn", "XYhead", "frequency", "confidence", "recall",
             "Xseq", "Yseq", "XYjoinSeq", "L", "hMeanConfRec"]

  df = pd.read_csv(dataPath, sep='\t')
  #the "ID_Sequence" column only contains the value "1" so we drop it
  df = df.drop("ID_Sequence", axis=1)

  # An id-list is the list of all times at which an event was observed.
  # sort=False keeps the events in their order of first appearance.
  id_lists = {
    event_name: times.tolist()
    for event_name, times in df.groupby("Cotation", sort=False)["ID_Time"]
  }

  # Keep only real variations (names without "=0") that occur at least
  # minFrequency times. The pattern is a raw string: "\=" is not a valid
  # string escape.
  zero_variation = re.compile(r"^.*(=0)")
  frequent_id_lists = {
    event_name: times
    for event_name, times in id_lists.items()
    if not zero_variation.search(event_name) and len(times) >= minFrequency
  }

  # One row per ordered pair of frequent event types, with its cardinalities,
  # join sequence and statistics.
  rows = []
  for x_name, y_name in permutations(frequent_id_lists, 2):
    x_times = frequent_id_lists[x_name]
    y_times = frequent_id_lists[y_name]

    # Join Xseq and Yseq under the minGap / maxSpan constraints;
    # the number of joined pairs is XYhead.
    join_seq = joinSeqs(x_times, y_times, maxSpan, minGap)
    head = len(join_seq)

    # An empty join has no first or last observation, so its length is 0.
    length = length_of_seq(join_seq) if join_seq else 0

    conf = confidence(head, len(x_times))
    rec = recall(head, len(y_times))

    # The ranking score is the harmonic mean of confidence and recall.
    # Undefined statistics give NaN so the column stays numeric.
    if conf is None or rec is None:
      h_mean = float("nan")
    else:
      h_mean = statistics.harmonic_mean([conf, rec])

    rows.append({
      "X": x_name,
      "Y": y_name,
      "Xn": len(x_times),
      "Yn": len(y_times),
      "XYhead": head,
      # frequency() returns None when the length L is 0
      "frequency": frequency(head, length),
      "confidence": conf,
      "recall": rec,
      "Xseq": x_times,
      "Yseq": y_times,
      "XYjoinSeq": join_seq,
      "L": length,
      "hMeanConfRec": h_mean,
    })

  # Without any candidate pair there is nothing to rank.
  if not rows:
    return pd.DataFrame(columns=columns)

  df_event_types = pd.DataFrame(rows, columns=columns)
  return df_event_types.nlargest(10, 'hMeanConfRec')



The answer to question 2 is the first row of the output dataframe, i.e. the rule with the highest harmonic mean of confidence and recall. For efficient trading, the other rules listed below it should also be considered.

In [None]:
#parameters and main call
# minFrequency: minimum number of occurrences an event type needs to be kept
minFrequency = 50
# maxSpan: largest allowed time difference between an X and the following Y
maxSpan = 2
# minGap: smallest allowed time difference between an X and the following Y
minGap = 1
# the returned dataframe is the last expression of the cell,
# so the notebook displays it as the cell output
minepi(minFrequency, maxSpan, minGap)

Unnamed: 0,X,Y,Xn,Yn,XYhead,frequency,confidence,recall,Xseq,Yseq,XYjoinSeq,L,hMeanConfRec
193,Boeing=-1,Boeing=1,269,300,153,0.123288,0.568773,0.51,"[2, 7, 11, 12, 17, 18, 24, 34, 39, 53, 54, 62,...","[6, 8, 14, 23, 25, 29, 35, 37, 38, 46, 56, 60,...","[(7, 8), (12, 14), (24, 25), (34, 35), (54, 56...",1241,0.537786
759,General_Motors=-1,Exxon_Mobil=1,266,263,141,0.116529,0.530075,0.536122,"[4, 6, 10, 12, 14, 15, 17, 26, 27, 30, 32, 33,...","[8, 13, 14, 16, 19, 20, 23, 27, 32, 41, 45, 57...","[(6, 8), (12, 13), (12, 14), (14, 16), (15, 16...",1210,0.533081
765,General_Motors=-1,General_Motors=1,266,244,134,0.107631,0.503759,0.54918,"[4, 6, 10, 12, 14, 15, 17, 26, 27, 30, 32, 33,...","[11, 16, 19, 21, 23, 24, 28, 31, 36, 40, 42, 4...","[(10, 11), (14, 16), (15, 16), (17, 19), (26, ...",1245,0.52549
200,Boeing=-1,Exxon_Mobil=1,269,263,134,0.109121,0.498141,0.509506,"[2, 7, 11, 12, 17, 18, 24, 34, 39, 53, 54, 62,...","[8, 13, 14, 16, 19, 20, 23, 27, 32, 41, 45, 57...","[(7, 8), (11, 13), (12, 13), (12, 14), (17, 19...",1228,0.503759
1453,Verizon=-1,General_Motors=1,250,244,123,0.101737,0.492,0.504098,"[9, 13, 16, 25, 27, 33, 39, 40, 42, 51, 54, 62...","[11, 16, 19, 21, 23, 24, 28, 31, 36, 40, 42, 4...","[(9, 11), (27, 28), (39, 40), (40, 42), (42, 4...",1209,0.497976
1597,American_Express=-1,Du_Pont=1,230,234,115,0.094572,0.5,0.491453,"[12, 20, 24, 25, 29, 33, 40, 42, 59, 71, 76, 8...","[2, 20, 28, 29, 35, 41, 57, 58, 67, 70, 73, 77...","[(33, 35), (40, 41), (71, 73), (76, 77), (84, ...",1216,0.49569
195,Boeing=-1,Altria=1,269,231,123,0.100655,0.457249,0.532468,"[2, 7, 11, 12, 17, 18, 24, 34, 39, 53, 54, 62,...","[6, 8, 13, 15, 18, 19, 23, 26, 27, 32, 41, 44,...","[(7, 8), (11, 13), (12, 13), (17, 18), (17, 19...",1222,0.492
1226,Microsoft=-1,Boeing=1,230,300,130,0.105605,0.565217,0.433333,"[8, 9, 13, 16, 19, 22, 32, 33, 35, 45, 53, 61,...","[6, 8, 14, 23, 25, 29, 35, 37, 38, 46, 56, 60,...","[(13, 14), (22, 23), (33, 35), (35, 37), (45, ...",1231,0.490566
607,Du_Pont=-1,Citigroup=1,237,221,112,0.092639,0.472574,0.506787,"[3, 11, 12, 17, 21, 22, 26, 32, 33, 36, 39, 46...","[2, 7, 9, 14, 15, 18, 23, 28, 48, 52, 62, 69, ...","[(12, 14), (17, 18), (21, 23), (22, 23), (26, ...",1209,0.489083
950,Boeing=1,Boeing=-1,300,269,139,0.111557,0.463333,0.516729,"[6, 8, 14, 23, 25, 29, 35, 37, 38, 46, 56, 60,...","[2, 7, 11, 12, 17, 18, 24, 34, 39, 53, 54, 62,...","[(6, 7), (23, 24), (37, 39), (38, 39), (60, 62...",1246,0.488576
