<a id='libraries'></a>
<h1 style="color:forestgreen" >Packages</h1> 

In [261]:
# ----------------------------
#BASE
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

# ----------------------------
# INSTALL
# !pip install mlxtend

# ----------------------------
#TRANSACTION ENCODER
from mlxtend.preprocessing import TransactionEncoder

# ----------------------------
#APRIORI FUNCTION 
from mlxtend.frequent_patterns import apriori, association_rules

# ----------------------------
#ITERTOOLS 
import itertools

# ----------------------------
#CONFIGURATION
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format

<a id='load'></a>
<h1 style="color:forestgreen" >Preprocessing of data</h1> 

In [262]:
df = pd.read_csv("labsup.csv")
df

Unnamed: 0,rownames,kids,morekids,boys2,girls2,boy1st,boy2nd,samesex,multi2nd,age,agefstm,black,hispan,worked,weeks,hours,labinc,faminc,nonmomi,educ,agesq
0,1,2,0,0,0,1,0,0,0,35,27,1,0,0,0,0,0.00,44.45,44.45,14,1225
1,2,3,1,0,0,0,1,0,0,32,20,1,0,1,40,40,14.70,38.05,23.34,12,1024
2,3,2,0,1,0,1,1,1,0,30,19,1,0,1,52,35,20.16,20.59,0.43,12,900
3,4,2,0,0,0,1,0,0,0,33,23,1,0,1,50,40,21.00,83.99,62.99,14,1089
4,5,2,0,0,1,0,0,1,0,31,26,1,0,1,26,48,21.86,88.83,66.96,12,961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31852,31853,5,1,0,1,0,0,1,0,35,19,1,0,1,40,40,14.70,56.70,41.99,12,1225
31853,31854,5,1,0,0,1,0,0,0,34,18,0,1,1,48,35,13.87,82.10,68.23,13,1156
31854,31855,3,1,0,0,1,0,0,0,31,21,0,1,1,52,35,9.46,30.46,21.00,12,961
31855,31856,3,1,1,0,1,1,1,0,27,19,1,0,1,25,32,4.21,46.20,41.99,12,729


In [263]:
df.shape

# to do for the report part:

# accroitre le min sup avec nbr d'itemset
# accroitre min sup nd check temps d'éxecution



(31857, 21)

In [264]:
# Define the categories and corresponding ranges
categories = ['teen','twenties', 'thirties', 'forties', 'fifties']
ranges = [(10,20),(20, 29), (30, 39), (40, 49), (50, 59)]

def categorize_age(df, col_name, categories, ranges):
    """
    This function creates new columns in a DataFrame and sets them to 1 if the value
    in the specified column falls within the corresponding category's range.

    Args:
    df (pandas.DataFrame): The DataFrame containing the data.
    col_name (str): The name of the column whose values will be used for categorization.
    categories (list): A list of category names to be created as columns.
    ranges (list of tuples): A list of tuples where each tuple represents the
    lower and upper bounds (inclusive) for a category.

    Returns:
    pandas.DataFrame: The DataFrame with the new categorical columns added.
    """

    # Create empty columns with initial values of 0
    for category in categories:
        df[category] = 0

    # Set values to 1 for rows meeting category conditions
    for i, (lower_bound, upper_bound) in enumerate(ranges):
        df.loc[df[col_name].between(lower_bound, upper_bound, inclusive='both'), categories[i]] = 1

    return df

df = categorize_age(df, 'age', categories, ranges)

In [265]:
def label_working_hours(df, hours_col):
    """
    This function categorizes working hours into part-time, full-time, and no work based on a threshold.
    It also adds three new columns: `partime`, `fulltime`, and `no_work` to indicate the category
    for each individual in the DataFrame.

    Args:
    df (pandas.DataFrame): The DataFrame containing the working hours data.
    hours_col (str): The name of the column containing working hours.

    Returns:
    pandas.DataFrame: The DataFrame with the new working hour category columns.
    """

    # Define threshold for full-time work (hours per week)
    threshold = 40

    # Create three new columns with initial values of 0
    df['partime'] = 0
    df['fulltime'] = 0
    df['no_work'] = 0

    # Assign values to new columns based on working hours
    df.loc[df[hours_col] >= threshold, 'fulltime'] = 1
    df.loc[(df[hours_col] < threshold) & (df[hours_col] > 0), 'partime'] = 1
    df.loc[df[hours_col] <= 0, 'no_work'] = 1

    return df

# Example usage
label_working_hours(df,'hours')
df = df.drop('worked', axis=1)  # Optionally drop the original hours column

In [266]:
# Categorize the family income

def label_income(df, income_col):
    """
    This function labels income based on its comparison to the mean and quartiles. It also 
    adds three new columns indicating whether the income is high, average, or low.

    Args:
    df (pandas.DataFrame): The DataFrame containing the income data.
    income_col (str): The name of the column containing income values.

    Returns:
    pandas.DataFrame: The DataFrame with the new income label and category columns.
    """

    # Calculate income quartiles for more informative labeling
    q1 = df[income_col].quantile(0.25)
    q3 = df[income_col].quantile(0.75)

    # Define thresholds based on quartiles
    high_threshold = 1.5 * q3
    low_threshold = 0.75 * q1
    below_average_threshold = q3
    above_average_threshold = q1

    # Create new columns for income categories
    df['high_income'] = 0
    df['average_income'] = 0
    df['low_income'] = 0

    # Label income based on quartiles and assign values to category columns
    df.loc[df[income_col] >= high_threshold, 'high_income'] = 1
    df.loc[df[income_col] <= low_threshold, 'low_income'] = 1
    df.loc[(df[income_col] >= below_average_threshold) &
    (df[income_col] <= above_average_threshold), 'average_income'] = 1
    df.loc[(df[income_col] >= above_average_threshold) &
    (df[income_col] < high_threshold), 'average_income'] = 1

    return df
df = label_income(df, 'faminc')
df = df.drop('faminc', axis=1)
df = df.drop('labinc', axis=1)

In [267]:
df = df.drop('hours', axis=1)
df = df.drop('agesq', axis=1)
df = df.drop('rownames', axis=1)
df = df.drop('nonmomi', axis=1)
df = df.drop('weeks', axis=1)
df = df.drop('educ', axis=1)
df = df.drop('kids', axis=1)
df = df.drop('age', axis=1)
df = df.drop('agefstm', axis=1)

dataset = df

dataset

Unnamed: 0,morekids,boys2,girls2,boy1st,boy2nd,samesex,multi2nd,black,hispan,teen,twenties,thirties,forties,fifties,partime,fulltime,no_work,high_income,average_income,low_income
0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0
1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
2,0,1,0,1,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0
4,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31852,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,0
31853,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
31854,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
31855,1,1,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0


In [268]:
df.to_csv('final_dataset.csv', encoding='utf-8')

**First Iteration:** Find support values for each variable.

- n: 20 (df.shape[0])

In [270]:
# Find Frequency of Items
df.sum()

morekids          15642
boys2              8281
girls2             7736
boy1st            16288
boy2nd            16114
samesex           16017
multi2nd            276
black             13156
hispan            18897
teen                  0
twenties          14054
thirties          17803
forties               0
fifties               0
partime            6296
fulltime          12493
no_work           13068
high_income        1773
average_income    22124
low_income         4732
dtype: int64

In [271]:
# Product Frequency / Total Sales
first = pd.DataFrame(df.sum() / df.shape[0], columns = ["Support"]).sort_values("Support", ascending = False)
first

Unnamed: 0,Support
average_income,0.69
hispan,0.59
thirties,0.56
boy1st,0.51
boy2nd,0.51
samesex,0.5
morekids,0.49
twenties,0.44
black,0.41
no_work,0.41


In [272]:
# Elimination by Support Value
first[first.Support >= 0.15]

Unnamed: 0,Support
average_income,0.69
hispan,0.59
thirties,0.56
boy1st,0.51
boy2nd,0.51
samesex,0.5
morekids,0.49
twenties,0.44
black,0.41
no_work,0.41


<a id='myfunction'></a>
<h1 style="color:forestgreen" >My Function</h1> 


In [275]:
def ar_iterations(data, num_iter = 1, support_value = 0.1, iterationIndex = None):
    
    # Next Iterations
    def ar_calculation(iterationIndex = iterationIndex): 
        # Calculation of support value
        value = []
        for i in range(0, len(iterationIndex)):
            result = data.T.loc[iterationIndex[i]].sum() 
            result = len(result[result == data.T.loc[iterationIndex[i]].shape[0]]) / data.shape[0]
            value.append(result)
        # Bind results
        result = pd.DataFrame(value, columns = ["Support"])
        result["index"] = [tuple(i) for i in iterationIndex]
        result['length'] = result['index'].apply(lambda x:len(x))
        result = result.set_index("index").sort_values("Support", ascending = False)
        # Elimination by Support Value
        result = result[result.Support > support_value]
        return result    
    
    # First Iteration
    first = pd.DataFrame(df.T.sum(axis = 1) / df.shape[0], columns = ["Support"]).sort_values("Support", ascending = False)
    first = first[first.Support > support_value]
    first["length"] = 1
    
    if num_iter == 1:
        res = first.copy()
        
    # Second Iteration
    elif num_iter == 2:
        
        second = list(itertools.combinations(first.index, 2))
        second = [list(i) for i in second]
        res = ar_calculation(second)
        
    # All Iterations > 2
    else:
        nth = list(itertools.combinations(set(list(itertools.chain(*iterationIndex))), num_iter))
        nth = [list(i) for i in nth]
        res = ar_calculation(nth)
    
    return res

In [276]:
iteration1 = ar_iterations(df, num_iter=1, support_value=0.1)
iteration1

Unnamed: 0,Support,length
average_income,0.69,1
hispan,0.59,1
thirties,0.56,1
boy1st,0.51,1
boy2nd,0.51,1
samesex,0.5,1
morekids,0.49,1
twenties,0.44,1
black,0.41,1
no_work,0.41,1


In [277]:
iteration2 = ar_iterations(df, num_iter=2, support_value=0.1)
iteration2

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(average_income, thirties)",0.40,2
"(average_income, hispan)",0.40,2
"(average_income, boy1st)",0.36,2
"(average_income, boy2nd)",0.35,2
"(average_income, samesex)",0.35,2
...,...,...
"(no_work, low_income)",0.10,2
"(boy1st, partime)",0.10,2
"(boy2nd, partime)",0.10,2
"(no_work, girls2)",0.10,2


In [278]:
iteration3 = ar_iterations(df, num_iter=3, support_value=0.01,
              iterationIndex=iteration2.index)
iteration3

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(boy2nd, samesex, boy1st)",0.26,3
"(boy2nd, boys2, boy1st)",0.26,3
"(boys2, samesex, boy1st)",0.26,3
"(boy2nd, boys2, samesex)",0.26,3
"(average_income, hispan, thirties)",0.23,3
...,...,...
"(fulltime, low_income, boy1st)",0.01,3
"(boys2, black, low_income)",0.01,3
"(black, girls2, low_income)",0.01,3
"(fulltime, thirties, low_income)",0.01,3


In [279]:
iteration4 = ar_iterations(df, num_iter=4, support_value=0.01,
              iterationIndex=iteration3.index)
iteration4

Unnamed: 0_level_0,Support,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1
"(boy2nd, boys2, samesex, boy1st)",0.26,4
"(average_income, boys2, samesex, boy1st)",0.18,4
"(boy2nd, average_income, boys2, boy1st)",0.18,4
"(boy2nd, average_income, boys2, samesex)",0.18,4
"(boy2nd, average_income, samesex, boy1st)",0.18,4
...,...,...
"(boys2, hispan, thirties, low_income)",0.01,4
"(hispan, thirties, girls2, low_income)",0.01,4
"(morekids, boys2, twenties, low_income)",0.01,4
"(black, samesex, thirties, low_income)",0.01,4


<a id='ar'></a>
<h1 style="color:forestgreen" >Association Rules</h1> 

There are two main functions here. 
- apriori() function evaluate support value for each product.
- association_rules() function help us to understand relationship between antecedents and consequences products. It gives some remarkable information about products.

Especially, "antecedent support", "consequent support", "support", "confidence" and	"lift" variables are very important to make some business decisions.

In [280]:
# Apriori
freq_items = apriori(df, min_support = 0.1, use_colnames = True, verbose = 1)
freq_items.sort_values("support", ascending = False)

Processing 126 combinations | Sampling itemset size 64


Unnamed: 0,support,itemsets
13,0.69,(average_income)
7,0.59,(hispan)
9,0.56,(thirties)
3,0.51,(boy1st)
4,0.51,(boy2nd)
...,...,...
187,0.10,"(boy2nd, fulltime, black)"
117,0.10,"(morekids, no_work, twenties)"
107,0.10,"(morekids, samesex, twenties)"
37,0.10,"(girls2, black)"


In [281]:
freq_items.sort_values("support", ascending = False).head(5)

Unnamed: 0,support,itemsets
13,0.69,(average_income)
7,0.59,(hispan)
9,0.56,(thirties)
3,0.51,(boy1st)
4,0.51,(boy2nd)


In [282]:
freq_items.sort_values("support", ascending = False).tail(5)

Unnamed: 0,support,itemsets
187,0.1,"(boy2nd, fulltime, black)"
117,0.1,"(morekids, no_work, twenties)"
107,0.1,"(morekids, samesex, twenties)"
37,0.1,"(girls2, black)"
147,0.1,"(samesex, girls2, black)"


In [283]:
# Association Rules & Info
df_ar = association_rules(freq_items, metric = "confidence", min_threshold = 0.5)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(boys2),(morekids),0.26,0.49,0.13,0.52,1.05,0.01,1.05
1,(girls2),(morekids),0.24,0.49,0.13,0.52,1.06,0.01,1.06
2,(morekids),(boy1st),0.49,0.51,0.25,0.51,1.00,0.00,1.00
3,(morekids),(boy2nd),0.49,0.51,0.25,0.50,1.00,-0.00,1.00
4,(morekids),(samesex),0.49,0.50,0.26,0.53,1.06,0.01,1.06
...,...,...,...,...,...,...,...,...,...
1104,"(average_income, samesex, boy1st)","(boy2nd, thirties, boys2)",0.18,0.14,0.11,0.58,4.04,0.08,2.05
1105,"(average_income, thirties, boy1st)","(boy2nd, samesex, boys2)",0.21,0.26,0.11,0.51,1.96,0.05,1.51
1106,"(samesex, thirties, boy1st)","(boy2nd, average_income, boys2)",0.14,0.18,0.11,0.73,4.04,0.08,3.04
1107,"(boys2, average_income)","(boy2nd, samesex, thirties, boy1st)",0.18,0.14,0.11,0.58,4.04,0.08,2.05


In [284]:
df_ar[(df_ar.support > 0.15) & (df_ar.confidence > 0.5)].sort_values("confidence", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
618,"(boys2, hispan)","(boy2nd, samesex)",0.16,0.26,0.16,1.00,3.85,0.12,inf
247,"(boy2nd, samesex)",(boy1st),0.26,0.51,0.26,1.00,1.96,0.13,inf
644,"(boy2nd, average_income, boys2)",(samesex),0.18,0.50,0.18,1.00,1.99,0.09,inf
199,"(boys2, hispan)",(boy2nd),0.16,0.51,0.16,1.00,1.98,0.08,inf
211,"(boys2, average_income)",(boy2nd),0.18,0.51,0.18,1.00,1.98,0.09,inf
...,...,...,...,...,...,...,...,...,...
354,"(average_income, fulltime)",(boy2nd),0.31,0.51,0.16,0.50,0.99,-0.00,0.99
116,"(morekids, hispan)",(boy2nd),0.30,0.51,0.15,0.50,0.99,-0.00,0.99
378,"(average_income, thirties)",(samesex),0.40,0.50,0.20,0.50,1.00,-0.00,1.00
57,(thirties),(samesex),0.56,0.50,0.28,0.50,1.00,-0.00,1.00
