## 2. 트랜잭션 데이터

In [1]:
import csv
with open('basket.csv', 'r', encoding='utf-8') as cf:
    transactions = []
    r = csv.reader(cf)
    for row in r:
        transactions.append(row)

In [2]:
transactions

[['소주', '콜라', '와인'],
 ['소주', '오렌지주스', '콜라'],
 ['콜라', '맥주', '와인'],
 ['소주', '콜라', '맥주'],
 ['오렌지주스', '와인']]

## 3. 연관분석

### 3.1 연관 규칙 생성

In [4]:
from apyori import apriori

rules = apriori(transactions, min_support=0.1, min_confidence=0.1)
results = list(rules)
results[0]

RelationRecord(items=frozenset({'맥주'}), support=0.4, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'맥주'}), confidence=0.4, lift=1.0)])

In [5]:
results[10]

RelationRecord(items=frozenset({'콜라', '소주'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'콜라', '소주'}), confidence=0.6, lift=1.0), OrderedStatistic(items_base=frozenset({'소주'}), items_add=frozenset({'콜라'}), confidence=1.0, lift=1.25), OrderedStatistic(items_base=frozenset({'콜라'}), items_add=frozenset({'소주'}), confidence=0.7499999999999999, lift=1.2499999999999998)])

### 3.2 연관 규칙 조회

In [19]:
import pandas as pd

result_df = pd.DataFrame(None, columns=['lhs', 'rhs', 'support', 'confidence', 'lift'])

In [20]:
index = 0

for result in results:
    support = result[1]
    for ordered_item in result[2]:
        lhs = ','.join(ordered_item[0])
        rhs = ','.join(ordered_item[1])
        confidence = ordered_item[2]
        lift = ordered_item[3]

        result_df.loc[index] = [lhs, rhs, support, confidence, lift]
        index += 1

result_df.head(10)

Unnamed: 0,lhs,rhs,support,confidence,lift
0,,맥주,0.4,0.4,1.0
1,,소주,0.6,0.6,1.0
2,,오렌지주스,0.4,0.4,1.0
3,,와인,0.6,0.6,1.0
4,,콜라,0.8,0.8,1.0
5,,"맥주,소주",0.2,0.2,1.0
6,맥주,소주,0.2,0.5,0.833333
7,소주,맥주,0.2,0.333333,0.833333
8,,"맥주,와인",0.2,0.2,1.0
9,맥주,와인,0.2,0.5,0.833333


### 3.3 연관 규칙 평가

In [21]:
result_df.loc[(result_df.lhs == '오렌지주스') & (result_df.rhs == '와인')]

Unnamed: 0,lhs,rhs,support,confidence,lift
24,오렌지주스,와인,0.2,0.5,0.833333


## 4. 뉴스 기사 연관분석 실습

In [61]:
import requests
from bs4 import BeautifulSoup

url = 'http://fs.jtbc.joins.com/RSS/economy.xml'
news = requests.get(url)
news

<Response [200]>

In [62]:
news_list = BeautifulSoup(news.content, 'xml')
link_list = news_list.select('item > link')
link_list[0]

<link>https://news.jtbc.joins.com/article/article.aspx?news_id=NB12041654</link>

In [63]:
from konlpy.tag import Kkma
kkma = Kkma()

In [64]:
import re
num_pattern = re.compile(r'[0-9]+')

transaction = []
for link in link_list:
    news_link = link.text
    news_link_res = requests.get(news_link)
    news_soup = BeautifulSoup(news_link_res.content, 'html.parser')
    news_content = news_soup.select_one('#articlebody > .article_content')

    news_content_text = news_content.text
    news_content_noun = kkma.nouns(news_content_text)
    news_content_noun = [i for i in news_content_noun if i != '앵커' and i != '기자' and num_pattern.match(i) == None]
    filtered_noun_list = list(filter(lambda word: len(word) > 1, news_content_noun[:70]))

    transaction.append(filtered_noun_list)

transaction

[['미래',
  '기술',
  '세계',
  '최대',
  '규모',
  '가전',
  '가전전시회',
  '전시회',
  '미국',
  '일상',
  '로봇',
  '우주',
  '건강',
  '관련',
  '올해',
  '현지',
  '김영민',
  '이곳',
  '재개',
  '네바다',
  '네바다주',
  '라스',
  '라스베이거스',
  '베이',
  '거스',
  '프레스',
  '컨퍼런스',
  '시작',
  '기업',
  '대면',
  '혼합',
  '단계',
  '진화',
  '지난해',
  '머스크',
  '베이조스',
  '조스',
  '장자',
  '우주전쟁',
  '전쟁',
  '격화',
  '우주항공',
  '항공',
  '발사대',
  '필요',
  '미니',
  '우주선',
  '공개',
  '비행사',
  '자율',
  '자율주행',
  '주행',
  '모드',
  '우주정거장',
  '정거장',
  '택시',
  '이동',
  '회사'],
 ['종부',
  '분납',
  '신청자',
  '지난해',
  '종합',
  '종합부동산세',
  '부동',
  '산세',
  '고지',
  '사람',
  '가운데',
  '명이',
  '신청',
  '가량',
  '종부세액',
  '세액',
  '상대적',
  '납세자',
  '분석',
  '현대',
  '현대차',
  '기아',
  '미국',
  '역대',
  '최다',
  '판매',
  '판매현대차',
  '대차',
  '시장',
  '기록',
  '진출',
  '처음',
  '연간',
  '판매량',
  '일본',
  '혼다',
  '카페',
  '일회용',
  '사용',
  '금지',
  '금지카페',
  '플라스틱',
  '플라스틱컵',
  '코로나',
  '때문',
  '일시적',
  '허용',
  '규제',
  '일회용품',
  '용품',
  '품목',
  '적용',
  '업종',
  '종이컵'],
 ['세상',
  '세계',
  '최대',
  '가전',
  '전시회',


In [65]:
from apyori import apriori

rules = apriori(transaction, min_support=0.2, min_confidence=0.2)
results = list(rules)
results[0]

RelationRecord(items=frozenset({'관련'}), support=0.25, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'관련'}), confidence=0.25, lift=1.0)])

In [66]:
import pandas as pd

result_df = pd.DataFrame(None, columns=['lhs', 'rhs', 'support', 'confidence', 'lift'])
index = 0

for res in results:
    support = res[1]
    ordered_item = res[2]
    for item in ordered_item:
        lhs = ','.join(item[0])
        rhs = ','.join(item[1])
        confidence = item[2]
        lift = item[3]

        result_df.loc[index] = [lhs, rhs, support, confidence, lift]
        index += 1

result_df

Unnamed: 0,lhs,rhs,support,confidence,lift
0,,관련,0.25,0.25,1.0
1,,규모,0.25,0.25,1.0
2,,금융,0.20,0.20,1.0
3,,기술,0.20,0.20,1.0
4,,기업,0.20,0.20,1.0
...,...,...,...,...,...
245,"미국,기업,세계,전시회",최대,0.20,1.00,5.0
246,"미국,기업,세계,최대",전시회,0.20,1.00,5.0
247,"최대,미국,기업,전시회",세계,0.20,1.00,4.0
248,"최대,기업,세계,전시회",미국,0.20,1.00,4.0


In [68]:
result_df[result_df['lhs'] == ''].sort_values('support', ascending=False).head(10)

Unnamed: 0,lhs,rhs,support,confidence,lift
22,,지난해,0.55,0.55,1.0
14,,올해,0.35,0.35,1.0
11,,시장,0.3,0.3,1.0
0,,관련,0.25,0.25,1.0
21,,지금,0.25,0.25,1.0
1,,규모,0.25,0.25,1.0
5,,때문,0.25,0.25,1.0
6,,미국,0.25,0.25,1.0
59,,"시장,지난해",0.25,0.25,1.0
8,,세계,0.25,0.25,1.0


In [72]:
result_df[result_df.lhs.str.contains('미국')].sort_values('lift', ascending=False)

Unnamed: 0,lhs,rhs,support,confidence,lift
168,"미국,최대","기업,세계",0.2,1.0,5.0
164,"미국,기업","세계,최대",0.2,1.0,5.0
170,"미국,기업,세계",최대,0.2,1.0,5.0
173,"미국,세계,최대",기업,0.2,1.0,5.0
179,"미국,기업","전시회,최대",0.2,1.0,5.0
...,...,...,...,...,...
176,미국,"기업,전시회,최대",0.2,0.8,4.0
171,"미국,기업,최대",세계,0.2,1.0,4.0
156,"미국,기업,전시회",세계,0.2,1.0,4.0
31,미국,기업,0.2,0.8,4.0


## 5. 연습문제

In [93]:
import csv
with open('mybasket.csv', 'r', encoding='utf-8') as file:
    file_content = csv.reader(file)
    content_list = []
    for row in file_content:
        content_list.append(row)

content_list[:5]

[['clothes', 'frozen', 'snack'],
 ['frozen', 'toiletry'],
 ['clothes', 'alcohol', 'toiletry', 'snack'],
 ['clothes', 'milk', 'bakery'],
 ['clothes']]

In [94]:
# 전체 트랜잭션 개수
len(content_list)

786

In [97]:
# 규칙 순서 상관없이 최소 지지도, 신뢰도 각각 10% 일 경우 규칙 수
from apyori import apriori

res = apriori(content_list, min_support=0.1, min_confidence=0.1)
result = list(res)
len(result)

53

In [103]:
# 순서 있는 전체 연관 규칙 출력
filtered_res = []

for item in result:
    ordered_items = item[2]
    for ordered_item in ordered_items:
        if ordered_item[0] != '' and ordered_item[1] != '':
            filtered_res.append(item)

len(filtered_res)

269

In [116]:
# 향상도가 가장 높은 연관 규칙
result_df = pd.DataFrame(None, columns=['lhs', 'rhs', 'support', 'confidence', 'lift'])
index = 0

for res in filtered_res:
    support = res[1]
    ordered_item = res[2]
    for item in ordered_item:
        lhs = ','.join(item[0])
        rhs = ','.join(item[1])
        confidence = item[2]
        lift = item[3]

        result_df.loc[index] = [lhs, rhs, support, confidence, lift]
        index += 1

max_lift = result_df['lift'].max()
result_df[result_df['lift'] == max_lift][['lhs', 'rhs', 'lift']]

Unnamed: 0,lhs,rhs,lift
934,milk,"bakery,deco",2.343877
935,"bakery,deco",milk,2.343877
941,milk,"bakery,deco",2.343877
942,"bakery,deco",milk,2.343877
948,milk,"bakery,deco",2.343877
949,"bakery,deco",milk,2.343877
955,milk,"bakery,deco",2.343877
956,"bakery,deco",milk,2.343877
962,milk,"bakery,deco",2.343877
963,"bakery,deco",milk,2.343877


In [129]:
# 가장 많이 팔린 두 제품
a = result_df.loc[result_df.lhs == ''].sort_values('support', ascending=False).iloc[:2]['rhs'].values
two_top_sold = ', '.join(a)
two_top_sold

'clothes, snack'

In [139]:
# 가장 많이 팔린 두 제품 사이의 지지도, 신뢰도, 향상도
a = result_df[(result_df.lhs.str.contains('clothes')) & (result_df.rhs == 'snack')][['support', 'confidence', 'lift']]
a.sort_values('lift', ascending=False)

Unnamed: 0,support,confidence,lift
1945,0.101781,0.695652,1.465905
1840,0.101781,0.695652,1.465905
1735,0.101781,0.695652,1.465905
1750,0.101781,0.695652,1.465905
1765,0.101781,0.695652,1.465905
1780,0.101781,0.695652,1.465905
1795,0.101781,0.695652,1.465905
1810,0.101781,0.695652,1.465905
1825,0.101781,0.695652,1.465905
1855,0.101781,0.695652,1.465905
