In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import warnings
from collections import Counter
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv("Dataset-Exos2.csv")
df.head()

Unnamed: 0,Watcher,videoCategoryId,videoCategoryLabel,definition
0,Billy,29.0,Nonprofits & Activism,hd
1,Leila,22.0,People & Blogs,sd
2,Billy,22.0,People & Blogs,sd
3,Mark,24.0,Entertainment,hd
4,Billy,24.0,Entertainment,hd


### Jeu de données

#### Analyse du dataset

In [3]:
dataset_description = {}
dataset_description["Nombre de lignes"] = df.shape[0]
dataset_description["Nombre de colonnes"] = df.shape[1]
dataset_description["Usage en memoire"] = str(df.memory_usage(index=False).sum() / 1024) + " ko"
dataset_description["Type de donnees"] = list(map(str, df.dtypes.unique().tolist()))
pd.DataFrame.from_dict(dataset_description, orient='index', columns = [""])

Unnamed: 0,Unnamed: 1
Nombre de lignes,999
Nombre de colonnes,4
Usage en memoire,31.21875 ko
Type de donnees,"[object, float64]"


In [4]:
colonnes_description = []
for d in df:
    colonnes_description.append([d, df[d].count(), str(df.dtypes[d]), len(df[d].unique())])
pd.DataFrame(colonnes_description, columns = ["Nom","Valeur non null","Type", "Nombre de valeur unique"])

Unnamed: 0,Nom,Valeur non null,Type,Nombre de valeur unique
0,Watcher,115,object,40
1,videoCategoryId,115,float64,14
2,videoCategoryLabel,115,object,14
3,definition,112,object,3


In [5]:
d = df["definition"].fillna("no def")
df["definition"] = d
print(df["definition"].unique())

['hd' 'sd' 'no def']


#### Prétraitement des données

In [6]:
pd.DataFrame(list(df["videoCategoryLabel"].unique()), columns = ["videoCategoryLabel Unique"])

Unnamed: 0,videoCategoryLabel Unique
0,Nonprofits & Activism
1,People & Blogs
2,Entertainment
3,News & Politics
4,Science & Technology
5,Education
6,Music
7,Travel & Events
8,Film & Animation
9,Sports


In [7]:
df['videoCategoryLabel'] = df['videoCategoryLabel'].astype(str)

for d in df["videoCategoryLabel"].unique():
    dd = d.replace(" ", "_")
    df["videoCategoryLabel"] = df["videoCategoryLabel"].replace(d, dd)
pd.DataFrame(list(df["videoCategoryLabel"].unique()), columns = ["videoCategoryLabel Unique"])

Unnamed: 0,videoCategoryLabel Unique
0,Nonprofits_&_Activism
1,People_&_Blogs
2,Entertainment
3,News_&_Politics
4,Science_&_Technology
5,Education
6,Music
7,Travel_&_Events
8,Film_&_Animation
9,Sports


In [8]:
def create_data_table(df):
    for d in df["videoCategoryLabel"].unique():
        dd = d.replace(" ", "_")
        df["videoCategoryLabel"] = df["videoCategoryLabel"].replace(d, dd)

    data = dict()
    for d in df["Watcher"].unique():
        t = df.loc[df["Watcher"] == d]
        data[d] = list(set(t["videoCategoryLabel"]))

    table_data = []
    for d in data.items():
        table_data.append([d[0],d[1]])
    return table_data, data
table_data, data = create_data_table(df)
pd.DataFrame(table_data, columns = ["Watcher","Categories"])

Unnamed: 0,Watcher,Categories
0,Billy,"[Entertainment, Howto_&_Style, Science_&_Technology, Nonprofits_&_Activism, People_&_Blogs]"
1,Leila,"[Film_&_Animation, Comedy, Science_&_Technology, Nonprofits_&_Activism, People_&_Blogs]"
2,Mark,"[Entertainment, Education, Science_&_Technology, Nonprofits_&_Activism, People_&_Blogs]"
3,Jane,"[Entertainment, Education, People_&_Blogs]"
4,Babs,[People_&_Blogs]
5,Jeff,"[Entertainment, Gaming, News_&_Politics, Education, Science_&_Technology]"
6,Naomy,"[Nonprofits_&_Activism, Music, People_&_Blogs, Science_&_Technology]"
7,Flo,[Science_&_Technology]
8,Phoebe,"[People_&_Blogs, Science_&_Technology]"
9,Rachel,"[People_&_Blogs, Science_&_Technology]"


### Application de l'algorithme Apriori

In [9]:
# fonction qui retourne une liste d'item dans le meme format que dans lequel ils se trouvent dans le dictionaire de base
def item_format(item):
    item_list = list(item.split("'"))
    special_characters = "[', ']"
    item_list_format = [i for i in item_list if  i not in special_characters]
    return item_list_format

# fonction qui crée les tables C1, C2, C3,...,Ck
def create_ck_table(data, lk, k):
    c = Counter()
    if k == 1:
        item_set = list(set(sum(data.values(), [])))
    else:
        item_set = set()
        temp = list(lk)
        for i in range(0,len(temp)):
            for j in range(i+1,len(temp)):
                t = {z for z in item_format(temp[i])}.union({w for w in item_format(temp[j])})
                if(len(t) == k):
                    t = sorted(t)
                    item_set.add(str(t))
        item_set = list(item_set)
    for i in item_set:
        c[i] = 0
        for d in data.values():
            if all(item in d for item in item_format(i)):
                c[i] += 1
    return c

# fonction qui affiche la table Ck
def display_ck_table(ck, k):
    print("C"+str(k)+":")
    for i in ck:
        i_set = set()
        for it in item_format(i):
            i_set.add(it)
        print(str(i_set)+": "+str(ck[i]))
    print()

# fonction qui crée les tables L1, L2, L3,...,Lk
def create_lk_table(data, ck, k, s):
    l = Counter()
    for i in ck:
        if(ck[i] >= s):
            l[str(i)] += ck[i]
    return l

# fonction qui affiche la table Lk
def display_lk_table(lk, k):
    final = []
    print("L"+str(k)+":")
    for i in lk:
        i_set = set()
        for it in item_format(i):
            i_set.add(it)
        print(str(i_set)+": "+str(lk[i]))
        final.append(i_set)
    print()
    return final

In [10]:
def apriori(data,s):
    s = len(data) * s
    final = []
    ck = Counter()
    lk = Counter()
    for k in range(1,1000):
        ck = create_ck_table(data,lk,k)
        if len(ck) == 0:
            break
        display_ck_table(ck,k)
        lk = create_lk_table(data,ck,k,s)
        if len(lk) == 0:
            break
        l_items = display_lk_table(lk,k)
        for li in l_items:
            final.append(li)
    return final
final_items = apriori(data,0.2)
print("L = L1 U L2 U L3 :")
print(final_items)

C1:
{'Entertainment'}: 7
{'Film_&_Animation'}: 1
{'Gaming'}: 1
{'Travel_&_Events'}: 1
{'Howto_&_Style'}: 1
{'Comedy'}: 1
{'News_&_Politics'}: 5
{'Education'}: 8
{'Science_&_Technology'}: 24
{'Sports'}: 1
{'Nonprofits_&_Activism'}: 9
{'Music'}: 2
{'People_&_Blogs'}: 31

L1:
{'Education'}: 8
{'Science_&_Technology'}: 24
{'Nonprofits_&_Activism'}: 9
{'People_&_Blogs'}: 31

C2:
{'Nonprofits_&_Activism', 'People_&_Blogs'}: 8
{'Nonprofits_&_Activism', 'Education'}: 3
{'Education', 'Science_&_Technology'}: 5
{'Nonprofits_&_Activism', 'Science_&_Technology'}: 9
{'People_&_Blogs', 'Science_&_Technology'}: 17
{'People_&_Blogs', 'Education'}: 6

L2:
{'Nonprofits_&_Activism', 'People_&_Blogs'}: 8
{'Nonprofits_&_Activism', 'Science_&_Technology'}: 9
{'People_&_Blogs', 'Science_&_Technology'}: 17

C3:
{'Nonprofits_&_Activism', 'People_&_Blogs', 'Science_&_Technology'}: 8

L3:
{'Nonprofits_&_Activism', 'People_&_Blogs', 'Science_&_Technology'}: 8

L = L1 U L2 U L3 :
[{'Education'}, {'Science_&_Techno

In [11]:
def pair_up(items):
    pairs = []
    for i in range(len(items)):
        for j in range(len(items)):
            pairs.append((items[i],items[j]))
    return pairs

def make_rules(items):
    rules = pair_up(items)
    final_rules = list()
    for r in rules :
        X = list(r[0])
        Y = list(r[1])
        for x in X:
            if x in Y:
                Y.remove(x)
        if (X,Y) not in final_rules and len(Y) != 0:
            final_rules.append((X,Y))
    return final_rules

final_rules = make_rules(final_items)

table_rule = []
for fr in final_rules:
    table_rule.append(str(set(fr[0]))+" ---> "+str(set(fr[1])))
pd.DataFrame(table_rule, columns = ["Rule"])

Unnamed: 0,Rule
0,{'Education'} ---> {'Science_&_Technology'}
1,{'Education'} ---> {'Nonprofits_&_Activism'}
2,{'Education'} ---> {'People_&_Blogs'}
3,"{'Education'} ---> {'Nonprofits_&_Activism', 'People_&_Blogs'}"
4,"{'Education'} ---> {'Nonprofits_&_Activism', 'Science_&_Technology'}"
5,"{'Education'} ---> {'People_&_Blogs', 'Science_&_Technology'}"
6,"{'Education'} ---> {'Nonprofits_&_Activism', 'People_&_Blogs', 'Science_&_Technology'}"
7,{'Science_&_Technology'} ---> {'Education'}
8,{'Science_&_Technology'} ---> {'Nonprofits_&_Activism'}
9,{'Science_&_Technology'} ---> {'People_&_Blogs'}


In [12]:
def association_correlation_rules(items, min_conf):
    table = []
    rules = make_rules(items)
    min_c = min_conf * len(data.values())
    for fr in rules:
        x, y = fr
        xy = sum(fr,[])
        count_x, count_y, count_xy = 0, 0, 0
        str_x, str_y = str(set(x)).replace("_", " "), str(set(y)).replace("_", " ")
        rule = str_x +" ---> "+ str_y
        for d in data.values():
            if x[0] in d:
                count_x += 1
            if y[0] in d:
                count_y += 1
            check =  all(item in d for item in xy)
            if check:
                count_xy += 1
        support_x = count_x / len(data.values())
        support_y = count_y / len(data.values())
        support_xy = count_xy / len(data.values())
        conf = support_xy / support_x 
        lift = support_xy / (support_x * support_y)
        if (conf * len(data.values()) >= min_c):
            table.append([rule, str(int(conf*100))+"%", "{:.2f}".format(round(support_xy, 2))])
    return table
table = association_correlation_rules(final_items, 0.15)
pd.DataFrame(table, columns = ["Rule","Confidence","Lift"])



Unnamed: 0,Rule,Confidence,Lift
0,{'Education'} ---> {'Science & Technology'},62%,0.12
1,{'Education'} ---> {'Nonprofits & Activism'},37%,0.07
2,{'Education'} ---> {'People & Blogs'},74%,0.15
3,"{'Education'} ---> {'Nonprofits & Activism', 'People & Blogs'}",37%,0.07
4,"{'Education'} ---> {'Nonprofits & Activism', 'Science & Technology'}",37%,0.07
5,"{'Education'} ---> {'People & Blogs', 'Science & Technology'}",50%,0.1
6,"{'Education'} ---> {'Nonprofits & Activism', 'People & Blogs', 'Science & Technology'}",37%,0.07
7,{'Science & Technology'} ---> {'Education'},20%,0.12
8,{'Science & Technology'} ---> {'Nonprofits & Activism'},37%,0.23
9,{'Science & Technology'} ---> {'People & Blogs'},70%,0.42
