In [1]:
import pandas as pd

log_df = pd.read_csv("access_log.csv", parse_dates=["date"])
print(len(log_df))
log_df.head()

325908


Unnamed: 0,user_id,item_id,date
0,4,205587,2015-07-04
1,4,748683,2015-07-04
2,4,790055,2015-07-04
3,4,790055,2015-07-04
4,4,764638,2015-07-04


In [2]:
log_df["user_id"].value_counts().describe()

count    31443.000000
mean        10.365042
std         16.023399
min          2.000000
25%          3.000000
50%          5.000000
75%         11.000000
max        632.000000
Name: count, dtype: float64

In [3]:
log_df["item_id"].value_counts().describe()

count    87611.000000
mean         3.719944
std          8.802572
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max        941.000000
Name: count, dtype: float64

In [4]:
log_df["date"].value_counts()

date
2015-07-03    45441
2015-07-02    45394
2015-07-01    44163
2015-07-04    43804
2015-07-08    39933
2015-07-05    39932
2015-07-07    33930
2015-07-06    33311
Name: count, dtype: int64

In [5]:
import datetime

start_date = datetime.datetime(2015, 7, 1)
end_date = datetime.datetime(2015, 7, 7)
target_date = datetime.datetime(2015, 7, 8)

In [6]:
x_df = log_df[(start_date <= log_df["date"]) & (log_df["date"] <= end_date)]
print(len(x_df))
x_df.head(3)

285975


Unnamed: 0,user_id,item_id,date
0,4,205587,2015-07-04
1,4,748683,2015-07-04
2,4,790055,2015-07-04


In [7]:
y_df = log_df[log_df["date"] == target_date]
print(len(y_df))
y_df.head()

39933


Unnamed: 0,user_id,item_id,date
103,94,603852,2015-07-08
104,94,28600,2015-07-08
105,94,987320,2015-07-08
106,94,109924,2015-07-08
107,94,886214,2015-07-08


In [8]:
U2I2Rcens = {}
for row in x_df.itertuples():
    rcen = (target_date - row.date).days

    U2I2Rcens.setdefault(row.user_id, {})
    U2I2Rcens[row.user_id].setdefault(row.item_id, [])
    U2I2Rcens[row.user_id][row.item_id].append(rcen)

In [9]:
U2I2Rcens[2497]

{400521: [4, 2, 2, 2, 1], 678277: [4], 687963: [2], 178138: [1]}

In [10]:
Rows1 = []
for user_id, I2Rcens in U2I2Rcens.items():
    for item_id, Rcens in I2Rcens.items():
        freq = len(Rcens)
        rcen = min(Rcens)
        Rows1.append((user_id, item_id, rcen, freq))
UI2RF_df = pd.DataFrame(Rows1, columns=["user_id", "item_id", "rcen", "freq"])
print(len(UI2RF_df))
UI2RF_df.head()

204661


Unnamed: 0,user_id,item_id,rcen,freq
0,4,205587,4,1
1,4,748683,4,1
2,4,790055,4,3
3,4,764638,4,2
4,4,492434,4,1


In [11]:
y_df = y_df.drop_duplicates()
print(len(y_df))
y_df["pv_flag"] = 1
y_df.head()

29651


Unnamed: 0,user_id,item_id,date,pv_flag
103,94,603852,2015-07-08,1
104,94,28600,2015-07-08,1
105,94,987320,2015-07-08,1
106,94,109924,2015-07-08,1
107,94,886214,2015-07-08,1


In [12]:
UI2RFP_df = pd.merge(
    UI2RF_df,
    y_df[["user_id", "item_id", "pv_flag"]],
    how="left",
    on=["user_id", "item_id"],
)
UI2RFP_df["pv_flag"].fillna(0, inplace=True)
print(len(UI2RFP_df))
UI2RFP_df.head()

204661


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  UI2RFP_df["pv_flag"].fillna(0, inplace=True)


Unnamed: 0,user_id,item_id,rcen,freq,pv_flag
0,4,205587,4,1,0.0
1,4,748683,4,1,0.0
2,4,790055,4,3,0.0
3,4,764638,4,2,0.0
4,4,492434,4,1,0.0


In [13]:
print(sorted(UI2RFP_df["rcen"].unique()))
print(sorted(UI2RFP_df["freq"].unique()))

[1, 2, 3, 4, 5, 6, 7]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 31, 32, 34, 35, 41, 43, 58, 63, 118]


In [14]:
tar_df = UI2RFP_df[UI2RFP_df["freq"] <= 7]
print(len(tar_df))
tar_df.head()

203456


Unnamed: 0,user_id,item_id,rcen,freq,pv_flag
0,4,205587,4,1,0.0
1,4,748683,4,1,0.0
2,4,790055,4,3,0.0
3,4,764638,4,2,0.0
4,4,492434,4,1,0.0


In [15]:
print(tar_df["pv_flag"].sum())

2038.0


In [None]:
rcen_df = pd.crosstab(index=tar_df["rcen"], columns=tar_df["pv_flag"])
rcen_df = rcen_df.rename(columns={0: "neg", 1: "pos"})
rcen_df