### Copyright (C) 2020 Sobhan Moradiyan Daghigh - All Rights Reserved

## Data Mining UniProj - no.2
#### 12/11/2020


In [1]:
# !pip install pyfpgrowth

In [2]:
# !pip install networkx

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyfpgrowth
import networkx as nx

### Reading the databases

In [4]:
orders = pd.read_csv(r"./dataset/orders.csv")

In [5]:
orders.head()

Unnamed: 0,ID_Order,ID_Customer,ID_Item,DateTime_CartFinalize,Amount_Gross_Order,city_name_fa,Quantity_item
0,2714054,469662,21386,2015-10-15 08:50:56.000,597982.0,محمود آباد,1.0
1,11104039,3063877,248497,2018-02-11 00:29:26.000,980000.0,خرمدره,1.0
2,4228130,3184893,50144,2016-06-14 00:30:08.000,229358.0,قرچک,1.0
3,22225624,6888562,70208,2018-09-03 14:37:19.000,16514.0,قم,1.0
4,4068771,2533490,67627,2016-05-21 11:51:02.000,133028.0,تهران,1.0


In [6]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID_Order               200000 non-null  int64  
 1   ID_Customer            200000 non-null  int64  
 2   ID_Item                200000 non-null  int64  
 3   DateTime_CartFinalize  200000 non-null  object 
 4   Amount_Gross_Order     200000 non-null  float64
 5   city_name_fa           200000 non-null  object 
 6   Quantity_item          200000 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 10.7+ MB


### Ok now try to put each item in its transaction and make a list of transactions

In [7]:
grps = orders.groupby(by=['ID_Customer'])
transactions = []
for grp, pdf in grps:
    transactions.append(pdf['ID_Item'].tolist())
    

print("count of transactions:", len(transactions))
transactions[5:15]

count of transactions: 151634


[[2015151],
 [189732, 4631],
 [767994],
 [61363, 91566, 20173],
 [180116],
 [294942],
 [107583],
 [41842],
 [254822],
 [106303]]

### Cleaning the transactions for finding FP

In [8]:
trans = []
for tr in transactions:
    if len(tr) > 1:
        trans.append(tr)
        
print("Whole transactions      : ", len(transactions))
print("Prepared for finding FP : ", len(trans))

Whole transactions      :  151634
Prepared for finding FP :  28700


### Frequent patterns with support_threshold of 3

In [9]:
pattern = pyfpgrowth.find_frequent_patterns(trans, 3)
{k: pattern[k] for k in list(pattern)[:20]}

{(369749,): 3,
 (189732,): 3,
 (114994,): 3,
 (175451,): 3,
 (22105,): 3,
 (66491,): 3,
 (43533,): 3,
 (134937,): 3,
 (1150874,): 3,
 (161233,): 3,
 (388826,): 3,
 (295644,): 3,
 (169960,): 3,
 (10373,): 3,
 (199429,): 3,
 (44445,): 3,
 (478445,): 3,
 (51734,): 3,
 (299504,): 3,
 (424039,): 3}

### Generate association rules with confidence_threshold of 0.7

In [10]:
rules = pyfpgrowth.generate_association_rules(pattern, 0.7)
rules

{(875835,): ((294942,), 0.75),
 (31619,): ((294942,), 1.0),
 (388778,): ((104031,), 1.0),
 (12011, 12011): ((90776,), 1.0),
 (12011, 90776): ((), 1.0)}

### find the most repeated items in transactions which means the best seller items

In [11]:
mostsellers = pyfpgrowth.find_frequent_patterns(transactions, 150)
mostsellers

{(294943,): 155,
 (22839,): 160,
 (19890,): 168,
 (416448,): 175,
 (8289,): 233,
 (45121,): 259,
 (51778,): 288,
 (36871,): 367,
 (294942,): 481}

### Cities

In [12]:
grps = orders.groupby(by=['city_name_fa'])
city_transactions = []
cities =[]

for grp, pdf in grps:
    city_transactions.append(pdf['ID_Item'].tolist())
    cities.append(pdf['city_name_fa'].tolist()[0])
    
print("count of transactions ~ count of cities:", len(city_transactions))
city_transactions[7:10]

count of transactions ~ count of cities: 906


[[388022, 843970, 528564],
 [398970, 1130710],
 [277717,
  395375,
  3285,
  33295,
  66753,
  391883,
  23540,
  436980,
  954042,
  20368,
  1533025,
  98975,
  853043,
  168754,
  1415447,
  245531,
  197732]]

In [13]:
max(len(l) for l in city_transactions)

108306

### Find the count of transactions for each city and sort them decreasing.

In [14]:
ct = []
for ele in city_transactions:
    ct.append(len(ele))
 
ctt = ct.copy()
ctt.sort()

### Get top 10

In [15]:
top_ten_cities = []
for i in range(905,895, -1):
    top_ten_cities.append(cities[ct.index(ctt[i])])
    print(ctt[i])
    print(top_ten_cities[-1])
    print()

108306
تهران

6024
مشهد

5992
اصفهان

5934
کرج

4313
اهواز

4065
شیراز

3204
رشت

2630
تبریز

2388
قم

2248
کرمانشاه



### FP for each city

In [16]:
# If we use this segmentation, nothing will change --> grps = orders.groupby(by=['city_name_fa', 'ID_Order'])
grps = orders.groupby(by=['city_name_fa', 'ID_Customer'])
grps.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,ID_Order,ID_Item,DateTime_CartFinalize,Amount_Gross_Order,Quantity_item
city_name_fa,ID_Customer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
آب بر,717511,1770542,44653,2015-02-22 08:08:40.000,260000.0,1.0
آب بر,969034,1516093,30185,2014-11-16 01:17:29.000,5638889.0,1.0
آب بر,1322101,5758863,246225,2016-12-19 17:53:19.000,145872.0,1.0
آب بر,1334942,1610510,22350,2014-12-29 19:47:52.000,4372222.0,1.0
آب بر,1354061,10615150,369278,2018-01-15 19:26:15.000,1620000.0,1.0
...,...,...,...,...,...,...
یزد,7254438,23378599,1049842,2018-10-18 12:34:54.000,810000.0,1.0
یزد,7256386,13639372,1086119,2018-06-22 19:23:41.000,1400000.0,1.0
یزد,7260340,21799890,9537,2018-08-20 00:19:00.000,366055.0,3.0
یزد,7269652,23208268,1697103,2018-10-11 10:42:10.000,95000.0,2.0


### Now try to seperate each city and its transactions

In [28]:
city_trans = {}

for grp, pdf in grps:
    if grp[0] in top_ten_cities:
        if city_trans.get(grp[0]) is None:
            city_trans[grp[0]] = [pdf['ID_Item'].tolist(), ]
        else:
            city_trans.get(grp[0]).append(pdf['ID_Item'].tolist())
    
print("count of cities:", len(city_trans))

# For example and in purpose to decrease the size of file, just print top 20 of Tabriz city
{k: city_trans[k][:20] for k in list(city_trans)[2:3]}

count of cities: 10


{'تبریز': [[43418],
  [91566],
  [207302],
  [200741],
  [88455],
  [529154],
  [851195, 290091],
  [23796, 45425],
  [377709, 545767, 753005, 87867],
  [90249, 86404],
  [193304],
  [39074],
  [912411],
  [106865, 377876],
  [151640, 969126],
  [99005, 716003, 907503, 150390, 20139],
  [191185, 165978],
  [179979],
  [451371, 424203],
  [165712, 1302611]]}

In [32]:
for city in city_trans:
    print(city, ":")
    
    if city == "تهران":
        pattern = pyfpgrowth.find_frequent_patterns(city_trans.get(city), 3)
        print({k: pattern[k] for k in list(pattern)[:200]})
        print("(more...)")
    else:
        pattern = pyfpgrowth.find_frequent_patterns(city_trans.get(city), 2)
        print(pattern)
    
    print("\n")
    rules = pyfpgrowth.generate_association_rules(pattern, 0.6)
    print(rules)
    print("\n\n\n")    
        

اصفهان :
{(185457,): 2, (40581,): 2, (163217,): 2, (123814,): 2, (12532,): 2, (11276,): 2, (23644,): 2, (759277,): 2, (98867,): 2, (65410,): 2, (273260,): 2, (132860,): 2, (545362,): 2, (20506,): 2, (252435,): 2, (290395,): 2, (7560,): 2, (38223,): 2, (159660,): 2, (22839,): 2, (523555,): 2, (165336,): 2, (83398,): 2, (152030,): 2, (67958,): 2, (117345,): 2, (257666,): 2, (223768,): 2, (349967,): 2, (405312,): 2, (46962,): 2, (113738,): 2, (82822,): 2, (69058,): 2, (89179,): 2, (9956,): 2, (190759,): 2, (87576,): 2, (67584,): 2, (84790,): 2, (124160,): 2, (77547,): 2, (42124,): 2, (426944,): 2, (11723,): 2, (1150876,): 2, (50150,): 2, (199409,): 2, (28761,): 2, (385695,): 2, (209751,): 2, (825106,): 2, (86629,): 2, (43691,): 2, (85010,): 2, (39471,): 2, (7561,): 2, (11654,): 2, (399801,): 2, (8095,): 2, (180096,): 2, (1287685,): 2, (281815,): 2, (394626,): 2, (130772,): 2, (96861,): 2, (663602,): 2, (157354,): 2, (79294,): 2, (208526,): 2, (208934,): 2, (158752,): 2, (283096,): 2, (288

{(369749,): 3, (767994,): 3, (1379404,): 3, (172604,): 3, (117773,): 3, (7736,): 3, (213347,): 3, (110865,): 3, (11758,): 3, (169960,): 3, (10373,): 3, (77972,): 3, (478445,): 3, (199899,): 3, (51734,): 3, (299504,): 3, (380021,): 3, (81752,): 3, (129852,): 3, (8643,): 3, (247063,): 3, (34891,): 3, (34913,): 3, (735681,): 3, (186214,): 3, (534288,): 3, (478098,): 3, (19438,): 3, (468454,): 3, (208576,): 3, (129042,): 3, (34890,): 3, (37684,): 3, (247845,): 3, (423885,): 3, (751731,): 3, (142673,): 3, (450997,): 3, (462003,): 3, (515807,): 3, (366780,): 3, (161545,): 3, (38325,): 3, (268493,): 3, (33126,): 3, (388937,): 3, (84517,): 3, (316968,): 3, (177820,): 3, (94472,): 3, (154323,): 3, (78346,): 3, (636951,): 3, (732425,): 3, (546413,): 3, (262701,): 3, (377277,): 3, (89710,): 3, (353753,): 3, (232274,): 3, (419137,): 3, (21163,): 3, (239285,): 3, (52415,): 3, (17403,): 3, (161116,): 3, (1355605,): 3, (40502,): 3, (45928,): 3, (87478,): 3, (39470,): 3, (179093,): 3, (20349,): 3, (34