In [20]:
# Includes
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
import warnings
warnings.simplefilter('ignore')
from mlxtend.frequent_patterns import apriori,association_rules
import numpy as np


# 1) Read the dataset, show its head, shape and description

In [2]:
df = pd.read_csv('./AirQualityUCI.csv', sep=';')

display(df.head())

print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns")

display(df.describe())

Unnamed: 0,Date,Time,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,X11,X12,X13,X14,X15
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,,
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,,
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,,
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,,
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,,


The dataset has 9471 rows and 18 columns


Unnamed: 0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,X11,X12,X13,X14,X15
count,7765.0,8991.0,914.0,9357.0,8991.0,7718.0,8991.0,7715.0,8991.0,8991.0,8991.0,8991.0,8991.0,0.0,0.0,0.0
mean,2.127521,1099.833166,218.811816,9.688704,939.153376,246.896735,835.493605,113.091251,1456.264598,1022.906128,18.317829,49.234201,1.02553,,,
std,1.463171,217.080037,204.459921,7.559785,266.831429,212.979168,256.81732,48.370108,346.206794,398.484288,8.832116,17.316892,0.403813,,,
min,0.0,647.0,7.0,0.0,383.0,2.0,322.0,2.0,551.0,221.0,-1.9,9.2,0.1847,,,
25%,1.0,937.0,67.0,4.0,734.5,98.0,658.0,78.0,1227.0,731.5,11.8,35.8,0.7368,,,
50%,1.8,1063.0,150.0,7.9,909.0,180.0,806.0,109.0,1463.0,963.0,17.8,49.6,0.9954,,,
75%,2.9,1231.0,297.0,13.6,1116.0,326.0,969.5,142.0,1674.0,1273.5,24.4,62.5,1.3137,,,
max,11.9,2040.0,1189.0,63.7,2214.0,1479.0,2683.0,340.0,2775.0,2523.0,44.6,88.7,2.231,,,


# 2) Eliminate totally null columns and totally null rows, eliminate columns with less than 1/3 of non null values; fill the remaining NaN values with the mean of the column

In [3]:
df.dropna(axis=0, how='all', inplace=True) # Drop rows with only NaN values
df.dropna(axis=1, how='all', inplace=True) # Drop cols with only NaN values

df.dropna(axis=1, inplace=True, thresh=df.shape[0]/3) # Drop cols with less than 1/3 NaN values

In [4]:
# Put mean of column in all other NaN values
# for c in df.columns :
#     for i in range(df.shape[0]) :
#         if pd.isna(df.loc[i, c]): df.loc[i, c]=df[c].mean()

for c in df.columns:
    if pd.api.types.is_numeric_dtype(df[c]):
        df[c].fillna(df[c].mean(), inplace=True)

# 3) Drop Time, convert Date from string to datetime and group by Date using mean as aggregate function.

In [5]:
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S')

df.drop(columns=['Date', 'Time'], inplace=True)

df = df.groupby(by='Datetime').mean().reset_index()

# 4) Preparation of the boolean matrix:
- Discretise continuous values with two bins, kmeans strategy and onehot-dense encoding
- Discretization/encoding generates 0/1 values; convert the binary values obtained into boolean, as requested by Apriori

In [6]:
## WTF ?? (made by CHATGPT) ##
# 1. Discretization with KMeans and Two Bins
kmeans = KMeans(n_clusters=2, random_state=42)
discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='kmeans')

for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Add a new column with discretized values
        df[f'{col}_discrete'] = discretizer.fit_transform(df[[col]])
        # Rename the resulting column
        df = df.rename(columns={f'{col}_discrete': col})

# 2. OneHot-Dense Encoding
encoder = OneHotEncoder(drop='if_binary', sparse=False)
encoded_data = encoder.fit_transform(df)

# 3. Convert Binary Values to Boolean
boolean_matrix = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(df.columns))
boolean_matrix = boolean_matrix.astype(bool)

# 5) Set the names of two columns generated by the discretisation of each attribute A to A_low, A_high (with discretisation/one-hot-encoding, each original column generates two columns, the first is for the low values, the second for the high values) 

In [7]:
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Extract original column name
        original_col_name = col[:-9]  # Remove '_discrete' from the column name
        
        # Set names for low and high values columns
        low_col_name = f'{original_col_name}_low'
        high_col_name = f'{original_col_name}_high'
        
        # Rename columns in boolean_matrix
        boolean_matrix = boolean_matrix.rename(columns={f'{col}_0': low_col_name, f'{col}_1': high_col_name})

# 6) find a value of min_support such that the Apriori algorithm generates at least 8 frequent itemsets with at least 2 items, output the result

In [30]:
# Put the limits
min_itemsets = 8
min_items_in_itemsets = 2
support_range = np.arange(1, 0, -0.01)

min_support = -1

for support in support_range:
    frequent_itemsets = apriori(boolean_matrix, min_support=support, use_colnames=True)

    c = sum([len(itemset) >= min_items_in_itemsets for itemset in frequent_itemsets.itemsets])

    if c >= min_itemsets : min_support = support

# Best result    
frequent_itemsets = apriori(boolean_matrix, min_support=min_support, use_colnames=True)
c = sum([len(itemset) >= min_items_in_itemsets for itemset in frequent_itemsets.itemsets])

# Output
print(f"The support {min_support} produced {len(frequent_itemsets)} itemsets and {c} of them had more than {min_items_in_itemsets} items")

The support 0.00999999999999912 produced 11889 itemsets and 11832 of them had more than 2 items


# 7) find the minimum metric threshold such that at least 100 association rules are extracted from the frequent itemsets found and show the metrics used and the best 10 rules by descending confidence and support

In [31]:
min_rules = 100
min_threshold = -1

for threshold in np.arange(1, 0, -0.01) :
    print(threshold)
    rules = association_rules(frequent_itemsets, min_threshold=threshold)
    if len(rules) >= min_rules :
        min_threshold = threshold

rules = association_rules(frequent_itemsets, min_threshold=min_threshold)
print(f"The support {min_threshold} produced {len(rules)} rules with more than {min_rules} items")

1.0
0.99
0.98
0.97
0.96
0.95
0.94
0.9299999999999999
0.9199999999999999
0.9099999999999999
0.8999999999999999
0.8899999999999999
0.8799999999999999
0.8699999999999999
0.8599999999999999
0.8499999999999999
0.8399999999999999
0.8299999999999998
0.8199999999999998
0.8099999999999998
0.7999999999999998
0.7899999999999998
0.7799999999999998
0.7699999999999998
0.7599999999999998
0.7499999999999998
0.7399999999999998
0.7299999999999998
0.7199999999999998
0.7099999999999997
0.6999999999999997
0.6899999999999997
0.6799999999999997
0.6699999999999997
0.6599999999999997
0.6499999999999997
0.6399999999999997
0.6299999999999997
0.6199999999999997
0.6099999999999997
0.5999999999999996
0.5899999999999996
0.5799999999999996
0.5699999999999996
0.5599999999999996
0.5499999999999996
0.5399999999999996
0.5299999999999996
0.5199999999999996
0.5099999999999996
0.49999999999999956
0.48999999999999955
0.47999999999999954
0.46999999999999953
0.4599999999999995
0.4499999999999995
0.4399999999999995
0.4299999999

In [32]:
sorted_rules = rules.sort_values(by=['confidence', 'support'], ascending=False)
sorted_rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
393,(X03_1.0),(X04_1.0),0.283531,0.360799,0.283531,1.0,2.771623,0.181233,inf,0.892154
3540,"(X03_1.0, X09_1.0)",(X04_1.0),0.245698,0.360799,0.245698,1.0,2.771623,0.157051,inf,0.847407
3534,"(X08_1.0, X03_1.0)",(X04_1.0),0.245378,0.360799,0.245378,1.0,2.771623,0.156846,inf,0.847047
3306,"(X01_1.0, X03_1.0)",(X04_1.0),0.236935,0.360799,0.236935,1.0,2.771623,0.151449,inf,0.837675
19765,"(X01_1.0, X03_1.0, X09_1.0)",(X04_1.0),0.224858,0.360799,0.224858,1.0,2.771623,0.14373,inf,0.824624
21039,"(X08_1.0, X03_1.0, X09_1.0)",(X04_1.0),0.209255,0.360799,0.209255,1.0,2.771623,0.133756,inf,0.808352
19750,"(X01_1.0, X03_1.0, X08_1.0)",(X04_1.0),0.206476,0.360799,0.206476,1.0,2.771623,0.13198,inf,0.805522
80099,"(X01_1.0, X03_1.0, X08_1.0, X09_1.0)",(X04_1.0),0.194827,0.360799,0.194827,1.0,2.771623,0.124534,inf,0.793868
243,(X05_246.8967349054159),(X07_113.09125081011017),0.175163,0.175484,0.175163,1.0,5.698538,0.144425,inf,0.999611
3090,"(X00_1.0, X03_1.0)",(X04_1.0),0.173132,0.360799,0.173132,1.0,2.771623,0.110666,inf,0.773039
