### **Data Mining Using Python**

In [1]:
# Set-up
import csv

In [2]:
# Upload data files (Shop.csv, Movies.csv) - must be on local computer
# The google.colab package is only importable inside Colab, so the import is
# guarded: on a local computer this cell becomes a no-op instead of raising,
# and the notebook still survives Restart & Run All.
# If running notebook on local computer:
#   Make sure data files are in same workspace as notebook
try:
    from google.colab import files
except ImportError:
    uploaded = {}  # placeholder so the name exists; later cells read the CSV files from disk
    print('Not running in Colab - expecting Shop.csv and Movies.csv in the notebook workspace')
else:
    uploaded = files.upload()

Saving Movies.csv to Movies.csv
Saving Shop.csv to Shop.csv


**Look at CSV files:** TID,item pairs

In [3]:
# Read shopping dataset from CSV file (one TID,item pair per row)
# Create dictionary "Sitems" with key = item and value = set of transactions
# Also set variable Snumtrans = number of transactions
Sitems = {}
trans = set()  # distinct TIDs seen so far, used to set Snumtrans
# newline='' is what the csv module asks for when it is given a file object
with open('Shop.csv', newline='') as f:
    rows = csv.DictReader(f)
    for r in rows:
        # setdefault supplies an empty set the first time an item shows up
        Sitems.setdefault(r['item'], set()).add(r['TID'])
        # a set ignores repeated TIDs by itself and avoids rescanning a list on every row
        trans.add(r['TID'])
Snumtrans = len(trans)
print('Number of transactions:', Snumtrans)
print('Number of distinct items:', len(Sitems))
print('Item dictionary:')
Sitems

Number of transactions: 5
Number of distinct items: 5
Item dictionary:


{'milk': {'1', '2', '4', '5'},
 'eggs': {'1', '3', '4'},
 'juice': {'1', '2', '5'},
 'cookies': {'2', '5'},
 'chips': {'3', '5'}}

In [None]:
# Read movies dataset from CSV file (one TID,item pair per row)
# Create dictionary "Mitems" with key = item and value = set of transactions
# Also set variable Mnumtrans = number of transactions
Mitems = {}
trans = set()  # distinct TIDs seen so far, used to set Mnumtrans
# newline='' is what the csv module asks for when it is given a file object
with open('Movies.csv', newline='') as f:
    rows = csv.DictReader(f)
    for r in rows:
        # setdefault supplies an empty set the first time an item shows up
        Mitems.setdefault(r['item'], set()).add(r['TID'])
        # a set ignores repeated TIDs by itself and avoids rescanning a list on every row
        trans.add(r['TID'])
Mnumtrans = len(trans)
print('Number of transactions (users):', Mnumtrans)
print('Number of distinct items (movies):', len(Mitems))
print('Item dictionary:')
Mitems.items()

### Some new Python features

In [5]:
# Iterating through dictionaries
# items() hands back each key together with its value, so no second lookup is needed
for item, tids in Sitems.items():
    print(item)
    print(tids)

milk
{'5', '4', '2', '1'}
eggs
{'4', '3', '1'}
juice
{'5', '2', '1'}
cookies
{'5', '2'}
chips
{'5', '3'}


In [6]:
# Intersecting sets
# How many transactions contain both eggs and milk?
# Look up both transaction sets, intersect them, then report all three.
set1 = Sitems['eggs']
set2 = Sitems['milk']
set3 = set1.intersection(set2)  # method spelling of set1 & set2
print('Transactions containing eggs:', set1)
print('Transactions containing milk:', set2)
print('Transactions containing both:', set3)
# print('Number of transactions containing both:', len(set3))

Transactions containing eggs: {'4', '3', '1'}
Transactions containing milk: {'5', '4', '2', '1'}
Transactions containing both: {'1', '4'}


## Shopping dataset - frequent item-sets

### Frequent item-sets of two

#### First compute all pairs of items and the number of transactions they occur together in (see what's wrong and fix it)

In [15]:
# Each entry is [item, item, number of transactions containing both].
# Comparing the names with < keeps one ordering of every pair and never
# pairs an item with itself.
pairs = [
    [i1, i2, len(Sitems[i1] & Sitems[i2])]
    for i1 in Sitems
    for i2 in Sitems
    if i1 < i2
]
pairs

[['eggs', 'milk', 2],
 ['eggs', 'juice', 1],
 ['juice', 'milk', 3],
 ['cookies', 'milk', 2],
 ['cookies', 'eggs', 0],
 ['cookies', 'juice', 2],
 ['chips', 'milk', 1],
 ['chips', 'eggs', 1],
 ['chips', 'juice', 1],
 ['chips', 'cookies', 1]]

#### Print pairs that meet support threshold

In [16]:
# Print every pair whose share of all transactions is above the support threshold
support = .3
for first, second, together in pairs:
    if together/Snumtrans > support:
        print(first, '|', second)

eggs | milk
juice | milk
cookies | milk
cookies | juice


In [None]:
# Fold previous two code boxes together into one program:
# count the shared transactions of each item pair and print the pair right
# away when its support is above the threshold (no intermediate list needed)
support = .3
for i1 in Sitems:
    for i2 in Sitems:
        # name order keeps one ordering of every pair and skips an item paired with itself
        if i1 < i2:
            common = len(Sitems[i1] & Sitems[i2])
            if common/Snumtrans > support:
                print(i1, '|', i2)

### Frequent item-sets of three

In [17]:
# Print every group of three items found together in more than a fraction
# `support` of all transactions
support = .3
for i1 in Sitems:
    for i2 in Sitems:
        for i3 in Sitems:
            # strictly increasing names: each triple is visited in one ordering only
            if not (i1 < i2 < i3):
                continue
            shared = Sitems[i1] & Sitems[i2] & Sitems[i3]
            if len(shared)/Snumtrans > support:
                print(i1, '|', i2, '|', i3)

cookies | juice | milk


### <font color = 'green'>**Your Turn - Movies dataset frequent item-sets**</font>

In [18]:
# Size of the Movies dataset: number of users first, then number of movies
for count, label in ((Mnumtrans, 'transactions (users)'),
                     (len(Mitems), 'distinct items (movies)')):
    print(count, label)

1382 transactions (users)
123 distinct items (movies)


#### Mine for frequent item-sets of three and four items in the Movies dataset. Find a single support threshold where the number of frequent item-sets of three items is more than 10 but less than 20, and the number of frequent item-sets of four items is more than 0.

In [None]:
# Frequent item-sets of four
# (the loops go four deep and every printed line lists four movies)
# Prints every group of four movies watched together by more than a fraction
# `support` of all users.
support = .03
for i1 in Mitems:
    for i2 in Mitems:
        # name order is checked as soon as both names are known, so
        # out-of-order combinations never reach the inner loops
        if not i1 < i2:
            continue
        users12 = Mitems[i1] & Mitems[i2]
        # adding a movie can only shrink the set of common users, so a pair at
        # or below the threshold cannot be part of a frequent set of four
        if len(users12)/Mnumtrans <= support:
            continue
        for i3 in Mitems:
            if not i2 < i3:
                continue
            users123 = users12 & Mitems[i3]
            if len(users123)/Mnumtrans <= support:
                continue
            for i4 in Mitems:
                if not i3 < i4:
                    continue
                common = len(users123 & Mitems[i4])
                if common/Mnumtrans > support:
                    print(i1, '|', i2, '|', i3, '|', i4)
                   



Boyhood | Gone Girl | Inside Out | The Imitation Game
Big Hero 6 | Boyhood | Gone Girl | The Imitation Game
Big Hero 6 | Gone Girl | Inside Out | The Imitation Game


In [None]:
# Frequent item-sets of four
support = INSERT VALUE HERE (SAME VALUE AS IN PREVIOUS CELL)
YOUR CODE HERE

## Shopping dataset - association rules

### Association rules with one item on the left-hand side

#### First compute frequent item-sets of one item, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
# Candidate left-hand sides: [item, number of transactions containing it]
# for every item whose share of all transactions is above `support`
support = .5
frequentLHS = [
    [item, len(tids)]
    for item, tids in Sitems.items()
    if len(tids)/Snumtrans > support
]
print(frequentLHS)

#### Now find right-hand side items with sufficient confidence (see what's wrong and fix it)

In [None]:
# Print rules lhs -> i where more than a fraction `confidence` of the
# transactions containing lhs also contain i
confidence = .5
for lhs in frequentLHS:  # lhs is [item, number of transactions containing item]
    for i in Sitems:
        # An item always co-occurs with itself, so without this check a
        # trivial rule such as milk -> milk is printed for every left-hand side
        if i != lhs[0]:
            common = len(Sitems[lhs[0]] & Sitems[i])
            if common/lhs[1] > confidence:
                print(lhs[0], '->', i)

### Association rules with two items on the left-hand side

#### First compute frequent item-sets of two items, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
# Candidate two-item left-hand sides: [item, item, number of shared transactions]
# for every pair found together in more than a fraction `support` of all transactions
support = .5
frequentLHS = []
for i1 in Sitems:
    for i2 in Sitems:
        # keep one ordering of each pair; also rules out i1 == i2
        if not i1 < i2:
            continue
        common = len(Sitems[i1] & Sitems[i2])
        if common/Snumtrans > support:
            frequentLHS.append([i1, i2, common])
print(frequentLHS)

#### Now find right-hand side items with sufficient confidence

In [None]:
# Print rules (a | b) -> i where more than a fraction `confidence` of the
# transactions containing both a and b also contain i
confidence = .5
for lhs in frequentLHS:  # lhs is [item, item, number of shared transactions]
    # transactions containing both left-hand items; the same for every candidate i
    lhs_trans = Sitems[lhs[0]] & Sitems[lhs[1]]
    for i in Sitems:
        if i in lhs:
            continue  # the right-hand side must be a different item
        common = len(lhs_trans & Sitems[i])
        if common/lhs[2] > confidence:
            print(lhs[0], '|', lhs[1], '->', i)

## Shopping dataset - association rules with lift instead of confidence

### Association rules with one item on the left-hand side

#### First compute frequent item-sets of one item, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
# Collect [item, transaction count] for each item that appears in more than
# a fraction `support` of all transactions
support = .5
frequentLHS = []
for item, tids in Sitems.items():
    count = len(tids)
    if count/Snumtrans > support:
        frequentLHS.append([item, count])
print(frequentLHS)

#### Now find right-hand side items with sufficient lift

In [None]:
# Print rules lhs -> i whose lift is above `liftthresh`.
# lift = (share of lhs's transactions that contain i)
#        / (share of all transactions that contain i)
liftthresh = 1
for lhs in frequentLHS:  # lhs is [item, number of transactions containing item]
    lhs_item, lhs_count = lhs
    for i in Sitems:
        if i in lhs:
            continue  # the right-hand side must be a different item
        common = len(Sitems[lhs_item] & Sitems[i])
        rule_share = common/lhs_count
        overall_share = len(Sitems[i])/Snumtrans
        lift = rule_share / overall_share
        if lift > liftthresh:
            print(lhs_item, '->', i, 'with lift', lift)

### Association rules with two items on the left-hand side

#### First compute frequent item-sets of two items, as candidate left-hand sides of association rules. Include the number of transactions the items occur in.

In [None]:
# Collect [item, item, shared transaction count] for each pair of items found
# together in more than a fraction `support` of all transactions
support = .5
frequentLHS = []
for i1 in Sitems:
    # partners whose name sorts after i1: one ordering per pair, never i1 with itself
    later_items = [i2 for i2 in Sitems if i1 < i2]
    for i2 in later_items:
        common = len(Sitems[i1] & Sitems[i2])
        if common/Snumtrans > support:
            frequentLHS.append([i1, i2, common])
print(frequentLHS)

#### Now find right-hand side items with sufficient lift

In [None]:
# Print rules (a | b) -> i whose lift is above `liftthresh`.
# lift = (share of the transactions containing a and b that also contain i)
#        / (share of all transactions that contain i)
liftthresh = 1
for lhs in frequentLHS:  # lhs is [item, item, number of shared transactions]
    for i in Sitems:
        if i not in lhs:  # the right-hand side must be a different item
            common = len(Sitems[lhs[0]] & Sitems[lhs[1]] & Sitems[i])
            lift = (common/lhs[2]) / (len(Sitems[i])/Snumtrans)
            # compare against the variable rather than a literal 1, so that
            # changing liftthresh above actually changes the result
            if lift > liftthresh:
                print(lhs[0], '|', lhs[1], '->', i, 'with lift', lift)

### <font color = 'green'>**Your Turn - Movies dataset association rules**</font>

#### Mine for association rules in the Movies dataset with three items on the left-hand side. Find support and confidence thresholds (need not be the same) so the number of association rules is more than 10 but less than 20.

In [None]:
# Association rules with three items on the left-hand side
support = INSERT VALUE HERE
confidence = INSERT VALUE HERE
YOUR CODE HERE

#### Mine for association rules in the Movies dataset with three items on the left-hand side. Find support and lift thresholds so the number of association rules is more than 10 but less than 20.


In [None]:
# Association rules with three items on the left-hand side
support = INSERT VALUE HERE
liftthresh = INSERT VALUE HERE
YOUR CODE HERE