# Exemple 1

In [366]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [368]:
# des données
data = {'ID':[1,2,3,4,5,6],
       'Onion':[1,0,0,1,1,1],
       'Potato':[1,1,0,1,1,1],
       'Burger':[1,1,0,0,1,1],
       'Milk':[0,1,1,1,0,1],
       'Beer':[0,0,1,0,1,0]}


In [370]:
# Création d'un DataFrame avec pandas à partir des données
df = pd.DataFrame(data, index=["Element 1", "Element 2","Element 3","Element 4","Element 5","Element 6"])

In [372]:

df = df[["ID", "Onion", "Potato", "Burger", "Milk", "Beer"]]
df

Unnamed: 0,ID,Onion,Potato,Burger,Milk,Beer
Element 1,1,1,1,1,0,0
Element 2,2,0,1,1,1,0
Element 3,3,0,0,0,1,1
Element 4,4,1,1,0,1,0
Element 5,5,1,1,1,0,1
Element 6,6,1,1,1,1,0


In [374]:
frequent_itemsets = apriori(df[['Onion', 'Potato', 'Burger', 'Milk', 'Beer' ]], min_support=0.50, use_colnames=True)



In [376]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(Onion)
1,0.833333,(Potato)
2,0.666667,(Burger)
3,0.666667,(Milk)
4,0.666667,"(Onion, Potato)"
5,0.5,"(Burger, Onion)"
6,0.666667,"(Burger, Potato)"
7,0.5,"(Potato, Milk)"
8,0.5,"(Burger, Onion, Potato)"


In [378]:
# Appliquer l'extraction des règles d'association à partir des itemsets fréquents
# Nous utilisons la métrique "lift" pour évaluer la qualité des règles. 
# Le lift mesure l'attractivité d'une règle, et nous définissons un seuil minimal de 1 pour le lift.
# `num_itemsets=1` n'est pas un paramètre valide pour association_rules. Cette ligne pourrait générer une erreur.
rules = association_rules(frequent_itemsets, 1, metric='lift', min_threshold=1.0)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,1.0,0.111111,inf,0.5,0.8,1.0,0.9
1,(Potato),(Onion),0.833333,0.666667,0.666667,0.8,1.2,1.0,0.111111,1.666667,1.0,0.8,0.4,0.9
2,(Burger),(Onion),0.666667,0.666667,0.5,0.75,1.125,1.0,0.055556,1.333333,0.333333,0.6,0.25,0.75
3,(Onion),(Burger),0.666667,0.666667,0.5,0.75,1.125,1.0,0.055556,1.333333,0.333333,0.6,0.25,0.75
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,1.0,0.111111,inf,0.5,0.8,1.0,0.9
5,(Potato),(Burger),0.833333,0.666667,0.666667,0.8,1.2,1.0,0.111111,1.666667,1.0,0.8,0.4,0.9
6,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,1.0,0.083333,inf,0.333333,0.6,1.0,0.8
7,"(Burger, Potato)",(Onion),0.666667,0.666667,0.5,0.75,1.125,1.0,0.055556,1.333333,0.333333,0.6,0.25,0.75
8,"(Onion, Potato)",(Burger),0.666667,0.666667,0.5,0.75,1.125,1.0,0.055556,1.333333,0.333333,0.6,0.25,0.75
9,(Burger),"(Onion, Potato)",0.666667,0.666667,0.5,0.75,1.125,1.0,0.055556,1.333333,0.333333,0.6,0.25,0.75


In [380]:
# Filtrer les règles d'association selon deux critères :
# 1. Le lift doit être supérieur à 1.125 (indiquant que la règle est plus intéressante que ce à quoi on s'attendait par hasard)
# 2. La confiance (probabilité de l'achat du conséquent) doit être supérieure à 0.8 (indiquant que la règle est forte)
rules [ (rules['lift'] >1.125)  & (rules['confidence']> 0.8)  ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Onion),(Potato),0.666667,0.833333,0.666667,1.0,1.2,1.0,0.111111,inf,0.5,0.8,1.0,0.9
4,(Burger),(Potato),0.666667,0.833333,0.666667,1.0,1.2,1.0,0.111111,inf,0.5,0.8,1.0,0.9
6,"(Burger, Onion)",(Potato),0.5,0.833333,0.5,1.0,1.2,1.0,0.083333,inf,0.333333,0.6,1.0,0.8


# Exemple 2

In [383]:
# exemple des données
retail_shopping_basket = {'ID':[1,2,3,4,5,6],
                         'Basket':[['Beer', 'Diaper', 'Pretzels', 'Chips', 'Aspirin'],
                                   ['Diaper', 'Beer', 'Chips', 'Lotion', 'Juice', 'BabyFood', 'Milk'],
                                   ['Soda', 'Chips', 'Milk'],
                                   ['Soup', 'Beer', 'Diaper', 'Milk', 'IceCream'],
                                   ['Soda', 'Coffee', 'Milk', 'Bread'],
                                   ['Beer', 'Chips']
                                  ]
                         }

In [385]:
# Création d'un DataFrame avec pandas à partir des données
retail = pd.DataFrame(retail_shopping_basket, index=["Element 1", "Element 2","Element 3","Element 4","Element 5","Element 6"])

In [387]:
retail = retail[['ID', 'Basket']]
retail # afficher la colonne ID et Basket

Unnamed: 0,ID,Basket
Element 1,1,"[Beer, Diaper, Pretzels, Chips, Aspirin]"
Element 2,2,"[Diaper, Beer, Chips, Lotion, Juice, BabyFood,..."
Element 3,3,"[Soda, Chips, Milk]"
Element 4,4,"[Soup, Beer, Diaper, Milk, IceCream]"
Element 5,5,"[Soda, Coffee, Milk, Bread]"
Element 6,6,"[Beer, Chips]"


In [389]:
from sklearn.preprocessing import MultiLabelBinarizer # Importer la classe MultiLabelBinarizer de la bibliothèque scikit-learn.
mlb = MultiLabelBinarizer() # Initialisation d'une instance de la classe MultiLabelBinarizer.
# Appliquer la transformation et convertir les étiquettes multi-catégories en format binaire. # Spécifier les noms des colonnes basés sur les classes identifiées par MultiLabelBinarizer.
pd.DataFrame(mlb.fit_transform(retail.Basket), columns=mlb.classes_ , index=["Element 1", "Element 2","Element 3","Element 4","Element 5","Element 6"])
# index= :  Définir les noms des lignes.

Unnamed: 0,Aspirin,BabyFood,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
Element 1,1,0,1,0,1,0,1,0,0,0,0,1,0,0
Element 2,0,1,1,0,1,0,1,0,1,1,1,0,0,0
Element 3,0,0,0,0,1,0,0,0,0,0,1,0,1,0
Element 4,0,0,1,0,0,0,1,1,0,0,1,0,0,1
Element 5,0,0,0,1,0,1,0,0,0,0,1,0,1,0
Element 6,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [391]:
retail = retail.drop('Basket' ,axis=1).join(retail.Basket.str.join(',').str.get_dummies(','))
# retail.drop('Basket' ,axis=1) : pour supprimer la colonne basket
# join(retail.Basket.str.join(',') : Convertir chaque liste dans 'Basket' en une chaîne de caractères, où les éléments sont séparés par des virgules.
# str.get_dummies(',') :  Créer des colonnes binaires pour chaque élément unique, en utilisant la virgule comme séparateur.
retail

Unnamed: 0,ID,Aspirin,BabyFood,Beer,Bread,Chips,Coffee,Diaper,IceCream,Juice,Lotion,Milk,Pretzels,Soda,Soup
Element 1,1,1,0,1,0,1,0,1,0,0,0,0,1,0,0
Element 2,2,0,1,1,0,1,0,1,0,1,1,1,0,0,0
Element 3,3,0,0,0,0,1,0,0,0,0,0,1,0,1,0
Element 4,4,0,0,1,0,0,0,1,1,0,0,1,0,0,1
Element 5,5,0,0,0,1,0,1,0,0,0,0,1,0,1,0
Element 6,6,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [393]:
frequent_itemsets_2 = apriori(retail.drop('ID',axis=1), use_colnames=True)
frequent_itemsets_2



Unnamed: 0,support,itemsets
0,0.666667,(Beer)
1,0.666667,(Chips)
2,0.5,(Diaper)
3,0.666667,(Milk)
4,0.5,"(Beer, Chips)"
5,0.5,"(Beer, Diaper)"


In [395]:
association_rules(frequent_itemsets_2, metric='lift',num_itemsets= 1)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Beer),(Chips),0.666667,0.666667,0.5,0.75,1.125,1.0,0.055556,1.333333,0.333333,0.6,0.25,0.75
1,(Chips),(Beer),0.666667,0.666667,0.5,0.75,1.125,1.0,0.055556,1.333333,0.333333,0.6,0.25,0.75
2,(Beer),(Diaper),0.666667,0.5,0.5,0.75,1.5,1.0,0.166667,2.0,1.0,0.75,0.5,0.875
3,(Diaper),(Beer),0.5,0.666667,0.5,1.0,1.5,1.0,0.166667,inf,0.666667,0.75,1.0,0.875


In [397]:
association_rules(frequent_itemsets_2, num_itemsets=1)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Diaper),(Beer),0.5,0.666667,0.5,1.0,1.5,1.0,0.166667,inf,0.666667,0.75,1.0,0.875


# Exemple 3

In [400]:
movies = pd.read_csv(r'C:\Users\yacine.medjbeur\Documents\ml-latest-small\ml-latest-small\movies.csv') # recupérer les données

In [402]:
movies.head(10) # afficher les 10 premier movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [404]:
movies_ohe = movies.join(movies.genres.str.get_dummies())
movies_ohe

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9739,193585,Flint (2017),Drama,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [406]:
pd.options.display.max_columns=100

In [408]:
movies_ohe.head()

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [410]:
stat1 = movies_ohe.drop(['title', 'movieId'],axis=1).apply(pd.value_counts)

  stat1 = movies_ohe.drop(['title', 'movieId'],axis=1).apply(pd.value_counts)
  stat1 = movies_ohe.drop(['title', 'movieId'],axis=1).apply(pd.value_counts)


TypeError: '<' not supported between instances of 'int' and 'str'

In [412]:
stat1 = stat1.transpose().drop(0,axis=1).sort_values(by=1, ascending=False).rename(columns={1:'No. of movies'})

KeyError: '[0] not found in axis'

In [414]:
stat2 = movies.join(movies.genres.str.split('|').reset_index().genres.str.len(), rsuffix='r').rename(columns={'genresr':'genre_count'}).drop(["title",	"movieId"],axis=1)

In [416]:
stat2 = stat2[stat2['genre_count']==1].drop([],axis=1).groupby('genres').sum().sort_values(by='genre_count', ascending=False)

In [418]:
stat = stat1.merge(stat2, how='left', left_index=True, right_index=True).fillna(0)

In [420]:
stat.genre_count=stat.genre_count.astype(int)
stat.rename(columns={'genre_count': 'No. of movies with only 1 genre'},inplace=True)

In [422]:
stat

Unnamed: 0,No. of movies,No. of movies with only 1 genre
Drama,4361,1053
Comedy,3756,946
Thriller,1894,84
Action,1828,60
Romance,1596,21
Adventure,1263,12
Crime,1199,12
Sci-Fi,980,37
Horror,978,167
Fantasy,779,4


In [449]:
import matplotlib.pyplot as plt
%matplotlib inline
movies_ohe.set_index(['movieId','title']).sum(axis=1).hist()
plt.title('distribution of number of genres')



KeyError: "None of ['movieId', 'title'] are in the columns"

# Let's get back to analysing the genre associations:

In [445]:
movies_ohe.set_index(["movieId","title"], inplace=True)

In [447]:
frequent_itemsets_movies = apriori(movies_ohe,use_colnames=True, min_support=0.025)



ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value Adventure|Animation|Children|Comedy|Fantasy

In [431]:
frequent_itemsets_movies

Unnamed: 0,support,itemsets
0,0.187641,(Action)
1,0.129645,(Adventure)
2,0.062718,(Animation)
3,0.068158,(Children)
4,0.385547,(Comedy)
5,0.123075,(Crime)
6,0.045165,(Documentary)
7,0.447649,(Drama)
8,0.079963,(Fantasy)
9,0.10039,(Horror)


In [433]:
rules_movies =  association_rules(frequent_itemsets_movies, metric='lift', min_threshold=1.25, num_itemsets=1)

In [435]:
rules_movies

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,1.0,0.038289,1.571224,0.702576,0.245869,0.363553,0.408338
1,(Action),(Adventure),0.187641,0.129645,0.062615,0.333698,2.57394,1.0,0.038289,1.306247,0.752735,0.245869,0.234448,0.408338
2,(Action),(Crime),0.187641,0.123075,0.042907,0.228665,1.857929,1.0,0.019813,1.136892,0.568426,0.160215,0.120409,0.288645
3,(Crime),(Action),0.123075,0.187641,0.042907,0.348624,1.857929,1.0,0.019813,1.247142,0.526575,0.160215,0.198167,0.288645
4,(Action),(Sci-Fi),0.187641,0.100595,0.046294,0.246718,2.452576,1.0,0.027419,1.193981,0.729069,0.191345,0.162466,0.353461
5,(Sci-Fi),(Action),0.100595,0.187641,0.046294,0.460204,2.452576,1.0,0.027419,1.504937,0.658508,0.191345,0.33552,0.353461
6,(Action),(Thriller),0.187641,0.194416,0.067235,0.358315,1.843034,1.0,0.030754,1.25542,0.563072,0.213564,0.203454,0.352072
7,(Thriller),(Action),0.194416,0.187641,0.067235,0.345829,1.843034,1.0,0.030754,1.241814,0.567807,0.213564,0.194726,0.352072
8,(Adventure),(Animation),0.129645,0.062718,0.025354,0.195566,3.118175,1.0,0.017223,1.165145,0.780486,0.151813,0.141737,0.299911
9,(Animation),(Adventure),0.062718,0.129645,0.025354,0.404255,3.118175,1.0,0.017223,1.460953,0.724755,0.151813,0.315515,0.299911


In [437]:
rules_movies[(rules_movies.conviction>1.25)] # mettre un condition que le soit le confiction > 1.25

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,1.0,0.038289,1.571224,0.702576,0.245869,0.363553,0.408338
1,(Action),(Adventure),0.187641,0.129645,0.062615,0.333698,2.57394,1.0,0.038289,1.306247,0.752735,0.245869,0.234448,0.408338
5,(Sci-Fi),(Action),0.100595,0.187641,0.046294,0.460204,2.452576,1.0,0.027419,1.504937,0.658508,0.191345,0.33552,0.353461
6,(Action),(Thriller),0.187641,0.194416,0.067235,0.358315,1.843034,1.0,0.030754,1.25542,0.563072,0.213564,0.203454,0.352072
9,(Animation),(Adventure),0.062718,0.129645,0.025354,0.404255,3.118175,1.0,0.017223,1.460953,0.724755,0.151813,0.315515,0.299911
11,(Children),(Adventure),0.068158,0.129645,0.032026,0.46988,3.62436,1.0,0.02319,1.641806,0.777052,0.193189,0.390915,0.358455
12,(Adventure),(Fantasy),0.129645,0.079963,0.034285,0.26445,3.307149,1.0,0.023918,1.250815,0.80154,0.19555,0.200521,0.346602
13,(Fantasy),(Adventure),0.079963,0.129645,0.034285,0.428755,3.307149,1.0,0.023918,1.52361,0.758257,0.19555,0.343664,0.346602
15,(Sci-Fi),(Adventure),0.100595,0.129645,0.03141,0.312245,2.408464,1.0,0.018369,1.265502,0.650205,0.157976,0.209799,0.277263
16,(Children),(Animation),0.068158,0.062718,0.031,0.454819,7.251799,1.0,0.026725,1.719213,0.925161,0.31038,0.418339,0.474545


In [439]:
rules_movies[(rules_movies.conviction>1.5)].sort_values(by=['lift','conviction'], ascending=False) 
# mettre un condition que le soit le confiction > 1.25
# sort_values(by=['lift','conviction'], ascending=False :  ordre de plus bas jusqu'a plus haut

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
17,(Animation),(Children),0.062718,0.068158,0.031,0.494272,7.251799,1.0,0.026725,1.842573,0.919791,0.31038,0.457281,0.474545
16,(Children),(Animation),0.068158,0.062718,0.031,0.454819,7.251799,1.0,0.026725,1.719213,0.925161,0.31038,0.418339,0.474545
11,(Children),(Adventure),0.068158,0.129645,0.032026,0.46988,3.62436,1.0,0.02319,1.641806,0.777052,0.193189,0.390915,0.358455
13,(Fantasy),(Adventure),0.079963,0.129645,0.034285,0.428755,3.307149,1.0,0.023918,1.52361,0.758257,0.19555,0.343664,0.346602
31,(Mystery),(Thriller),0.058817,0.194416,0.036338,0.617801,3.177729,1.0,0.024902,2.107761,0.728137,0.167534,0.525563,0.402354
0,(Adventure),(Action),0.129645,0.187641,0.062615,0.482977,2.57394,1.0,0.038289,1.571224,0.702576,0.245869,0.363553,0.408338
37,"(Crime, Drama)",(Thriller),0.065387,0.194416,0.031718,0.485086,2.495096,1.0,0.019006,1.564503,0.641136,0.139064,0.360819,0.324117
5,(Sci-Fi),(Action),0.100595,0.187641,0.046294,0.460204,2.452576,1.0,0.027419,1.504937,0.658508,0.191345,0.33552,0.353461
23,(Crime),(Thriller),0.123075,0.194416,0.058407,0.474562,2.440963,1.0,0.034479,1.533167,0.673177,0.225436,0.347755,0.387492
29,(Horror),(Thriller),0.10039,0.194416,0.047116,0.469325,2.414026,1.0,0.027598,1.518037,0.65112,0.19022,0.341255,0.355835


In [441]:
pd.options.display.max_rows=50

In [443]:
movies[(movies.genres.str.contains('Adventure')) & (movies.genres.str.contains('Children')) & (~movies.genres.str.contains('Animation'))]

Unnamed: 0,movieId,title,genres
1,2,Jumanji (1995),Adventure|Children|Fantasy
7,8,Tom and Huck (1995),Adventure|Children
53,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy
95,107,Muppet Treasure Island (1996),Adventure|Children|Comedy|Musical
109,126,"NeverEnding Story III, The (1994)",Adventure|Children|Fantasy
...,...,...,...
9428,166203,Sapphire Blue (2014),Adventure|Children|Fantasy|Sci-Fi
9565,173873,Gulliver's Travels (1996),Adventure|Children|Fantasy
9636,179401,Jumanji: Welcome to the Jungle (2017),Action|Adventure|Children
9697,184987,A Wrinkle in Time (2018),Adventure|Children|Fantasy|Sci-Fi
