In [176]:
# Get list of dictionaries
# List index: ICP index
# List entry: Dictionary of ICP information
    # Industry
    # Num. text, image, video
    # Bytes text, image, video

# Only keep the most recent data for each ICP
    # 1. Sort dataframe by timestamp (most recent first)
    # 2. Iterate rows --> keep a row's values only if that ICP has no entry yet

In [177]:
# How to compare timestamps:
# strings compare lexicographically, so fixed-width YYYYMMDD strings order
# the same way the dates do.
print("string" > "str")
print("20150507" > "20170629")

True
False


In [178]:
import pickle

# Load the interaction matrix and the index <-> name mappings saved earlier.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
interactions_pickle = '20170629-interactions-mappings.pkl'
with open(interactions_pickle, 'rb') as pickle_file:
    loaded_mappings = pickle.load(pickle_file)

(interactions, iidx_to_cdn, cdn_to_iidx, uidx_to_icp, icp_to_uidx) = loaded_mappings

In [179]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import csv

In [180]:
# Read in icpclassify file (comma-separated, no header row)
icpclassify_filepath = 'icpstatistic/icpclassify.txt'
icpclassify_header = ['industry', 'icp']
# Every column is read as a string, so industry codes keep their leading zeros.
icpclassify_datatypes = dict.fromkeys(icpclassify_header, str)

icpclassify_df = pd.read_csv(
    icpclassify_filepath,
    sep=',',
    header=None,
    names=icpclassify_header,
    dtype=icpclassify_datatypes,
)

In [181]:
# Preview the first rows of the classification table.
icpclassify_df.head()

Unnamed: 0,industry,icp
0,600,www.baidu.com
1,100,www.qq.com
2,300,www.taobao.com
3,100,www.sina.com.cn
4,500,www.weibo.com


In [182]:
# Sanity checks on the classification table:
# distinct industry codes, whether any full row is duplicated, and the
# number of distinct ICPs.
print(icpclassify_df['industry'].unique())
print(icpclassify_df.duplicated().unique())
print(len(icpclassify_df['icp'].unique()))

['0600' '0100' '0300' '0500' '0200' '0400']
[False]
2000


In [183]:
# Keep the ICP names from the RecSys mapping, and how many there are, handy.
icp_list = icp_to_uidx.keys()
num_icps = len(icp_list)

In [184]:
# List every uidx that more than one ICP maps to (expected: []).
# The values are materialised once instead of being rebuilt for every
# element; wrapping in list() also keeps .count() available when values()
# returns a view rather than a list.
uidx_values = list(icp_to_uidx.values())
[x for x in uidx_values if uidx_values.count(x) >= 2]
# No duplicate mappings. Just to check.

[]

In [185]:
# Next step: try something like this instead
# Index by ICP, then transpose so that to_dict() yields
# {icp: {'industry': code}}.
icpclassify_by_icp = icpclassify_df.set_index('icp')
icpclassify_dict = icpclassify_by_icp.T.to_dict()

In [186]:
# NOTE(review): this displays the entire dict (one entry per ICP);
# consider showing only a handful of items.
icpclassify_dict

{'www.guokr.com': {'industry': '0600'},
 'www.51yes.com': {'industry': '0600'},
 'www.ctrip.com': {'industry': '0600'},
 'www.pingan.com': {'industry': '0600'},
 'www.youwo.com': {'industry': '0400'},
 'www.joqoo.com': {'industry': '0400'},
 'www.39.net': {'industry': '0600'},
 'www.texnet.com.cn': {'industry': '0600'},
 'www.k73.com': {'industry': '0400'},
 'www.zei6.com': {'industry': '0600'},
 'www.kan300.com': {'industry': '0600'},
 'www.kukudm.com': {'industry': '0600'},
 'www.lawtime.cn': {'industry': '0100'},
 'www.williamlong.info': {'industry': '0600'},
 'www.51auto.com': {'industry': '0600'},
 'www.fzdm.com': {'industry': '0600'},
 'www.gxnews.com.cn': {'industry': '0100'},
 'www.tvmao.com': {'industry': '0100'},
 'www.laawoo.com': {'industry': '0600'},
 'www.tuolar.com': {'industry': '0300'},
 'www.veryeast.cn': {'industry': '0600'},
 'www.51zjxm.com': {'industry': '0600'},
 'www.guolairen.com': {'industry': '0600'},
 'www.nipic.com': {'industry': '0600'},
 'www.lenovo.com.c

In [187]:
# Keep only the classified ICPs that also appear in the RecSys mapping.
# The inner feature dicts are the same objects held by icpclassify_dict
# (no copy is made), so later in-place additions are visible through both.
icp_feature_dict_prelim = dict(
    (icp, features)
    for icp, features in icpclassify_dict.iteritems()
    if icp in icp_to_uidx
)
print(len(icp_feature_dict_prelim))
print(icp_feature_dict_prelim)

1151
{'www.yintai.com': {'industry': '0300'}, 'www.ccidnet.com': {'industry': '0600'}, 'www.cnxianzai.com': {'industry': '0100'}, 'www.guokr.com': {'industry': '0600'}, 'www.xiujue.cc': {'industry': '0600'}, 'www.xilu.com': {'industry': '0100'}, 'www.kiees.cn': {'industry': '0600'}, 'www.51.com': {'industry': '0500'}, 'www.ctrip.com': {'industry': '0600'}, 'www.pingan.com': {'industry': '0600'}, 'www.youwo.com': {'industry': '0400'}, 'www.51fashion.com.cn': {'industry': '0600'}, 'www.demohour.com': {'industry': '0100'}, 'www.yeah.net': {'industry': '0600'}, 'www.39.net': {'industry': '0600'}, 'www.i21st.cn': {'industry': '0600'}, 'www.baofeng.com': {'industry': '0200'}, 'www.yiqifa.com': {'industry': '0600'}, 'www.k73.com': {'industry': '0400'}, 'www.paipai.com': {'industry': '0300'}, 'www.jczqw.com': {'industry': '0600'}, 'www.cehome.com': {'industry': '0600'}, 'www.aicai.com': {'industry': '0600'}, 'www.upyun.com': {'industry': '0600'}, 'www.ys137.com': {'industry': '0600'}, 'www.qih

In [188]:
# Check all ICPs in dict belong in RecSys
assert all(icp in icp_to_uidx for icp in icp_feature_dict_prelim)

# Check all ICPs in RecSys matrix represented in feature dict
assert all(icp in icp_feature_dict_prelim for icp in icp_to_uidx)

In [189]:
# For quick testing
def get_industry(icp):
    """Return the industry code recorded for `icp`.

    Returns the string 'ICP not in RecSys list' when `icp` is not in
    `icp_list`.
    """
    if icp not in icp_list:
        return 'ICP not in RecSys list'
    return icp_feature_dict_prelim[icp]['industry']

In [190]:
# Spot-check the helper with a single hostname.
get_industry('www.17ok.com')

'ICP not in RecSys list'

In [191]:
# TODO: Add num page elements, bytes data

# To sort this list based on mapping indices: 
# https://stackoverflow.com/questions/72899/how-do-i-sort-a-list-of-dictionaries-by-values-of-the-dictionary-in-python

In [192]:
# Read in icpstatistic file (comma-separated, no header row)
icpstatistic_filepath = 'icpstatistic/all_icp_statistics.txt'

# Column layout: ICP name, four element counts, four byte totals, two timestamps.
count_columns = ['textnum', 'imagenum', 'videonum', 'unknownnum']
byte_columns = ['textbytes', 'imagebytes', 'videobytes', 'unknownbytes']
text_columns = ['icp', 'createtime', 'ts']
icpstatistic_header = ['icp'] + count_columns + byte_columns + ['createtime', 'ts']

# Counts and byte totals are 64-bit integers; the name and both timestamps
# stay as strings.
icpstatistic_dtypes = dict.fromkeys(count_columns + byte_columns, np.int64)
icpstatistic_dtypes.update(dict.fromkeys(text_columns, str))

icpstatistic_df = pd.read_csv(
    icpstatistic_filepath,
    sep=',',
    header=None,
    names=icpstatistic_header,
    dtype=icpstatistic_dtypes,
)

In [193]:
# Preview the first rows of the statistics table.
icpstatistic_df.head()

Unnamed: 0,icp,textnum,imagenum,videonum,unknownnum,textbytes,imagebytes,videobytes,unknownbytes,createtime,ts
0,1000eb.com,52,1,0,0,437337,4286,0,0,20150312095107,20150102
1,2pcw.cn,2135,201,0,12,37115444,11604039,0,196756,20150312095108,20150102
2,360boclub.com,3312,93,7,9,143260320,2739491,78912,94623,20150312095108,20150102
3,365jia.cn,2607,5067,43,134,153051095,199486613,1402569193,7573289,20150312095108,20150102
4,365jilin.com,15,19,0,4,367434,255625,0,100506,20150312095108,20150102


In [194]:
# Sort: recent first.
# 'ts' and 'createtime' are strings, so this is a lexicographic sort; it
# matches chronological order as long as the timestamps are fixed-width.
recency_keys = ['ts', 'createtime']
icpstatistic_sorted = icpstatistic_df.sort_values(by=recency_keys, ascending=False)
icpstatistic_sorted.head()

Unnamed: 0,icp,textnum,imagenum,videonum,unknownnum,textbytes,imagebytes,videobytes,unknownbytes,createtime,ts
117957,1000eb.com,19364,5,0,0,370553410,21430,0,0,20150430003613,20150429
117958,2pcw.cn,4625,446,0,24,79126492,26706230,0,393512,20150430003613,20150429
117959,360boclub.com,176568,883,41,116,8153276319,32099895,464946,1928660,20150430003613,20150429
117960,365jia.cn,223157,199273,817,718,12508850846,8603710019,25164893261,903447074,20150430003613,20150429
117961,365jilin.com,78,55,0,5,1856159,224145,0,144808,20150430003613,20150429


In [195]:
icpstatistic_sorted.describe()

# Web data features: bin by quartiles?
# So five bins: zero bytes, then 1st quartile (>0), 2nd quartile, 3rd, 4th

Unnamed: 0,textnum,imagenum,videonum,unknownnum,textbytes,imagebytes,videobytes,unknownbytes
count,142189.0,142189.0,142189.0,142189.0,142189.0,142189.0,142189.0,142189.0
mean,52812.73,16943.68,173.407289,1685.686,1802321000.0,887223400.0,1070706000.0,891797500.0
std,185019.7,185824.7,3046.352977,16129.16,7427256000.0,9001758000.0,28635220000.0,17792610000.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1541.0,59.0,0.0,11.0,23326350.0,720029.0,0.0,138859.0
50%,19781.0,1016.0,0.0,62.0,442765900.0,26599600.0,0.0,1174268.0
75%,72178.0,10149.0,6.0,257.0,1752729000.0,399562400.0,482742.0,8281184.0
max,12975650.0,13546390.0,212220.0,1043031.0,484349600000.0,672854100000.0,1846369000000.0,1065438000000.0


In [265]:
# Define quantile cut points for the data feature bins.
# *_q1 is the "no bytes at all" cutoff; *_q2..*_q4 are the lower edges of the
# next three bins, taken from the distribution over all statistics rows.
quartile_levels = [.25, .50, .75]

text_q1 = 0
text_q2, text_q3, text_q4 = icpstatistic_sorted['textbytes'].quantile(quartile_levels)

image_q1 = 0
image_q2, image_q3, image_q4 = icpstatistic_sorted['imagebytes'].quantile(quartile_levels)

# Video uses the 70th/80th/90th percentiles instead of quartiles --
# presumably because most rows carry little or no video (the median of
# videobytes is 0 in the describe() output).
video_levels = [.70, .80, .90]
video_q1 = 0
video_q2, video_q3, video_q4 = icpstatistic_sorted['videobytes'].quantile(video_levels)

In [266]:
# Exploratory probe: value of videobytes at the 54th percentile.
icpstatistic_sorted['videobytes'].quantile(.54)

150.0

In [267]:
# Quantile-bin functions.
# Label naming is offset by one from the cut-point names: 'q0' means "at or
# below *_q1" (the zero cutoff), 'q1' is the open range (*_q1, *_q2), 'q2' is
# [*_q2, *_q3), 'q3' is [*_q3, *_q4) and 'q4' is everything from *_q4 upwards.
def _quantile_bin(value, zero_cut, low_cut, mid_cut, high_cut):
    """Return the bin label ('q0'..'q4') for `value` given four cut points.

    Shared implementation behind text_bin, image_bin and video_bin.
    Returns 'Error' when `value` falls in none of the ranges (for example
    when it is NaN, which fails every comparison).
    """
    if value <= zero_cut:
        return 'q0'
    if zero_cut < value < low_cut:
        return 'q1'
    if low_cut <= value < mid_cut:
        return 'q2'
    if mid_cut <= value < high_cut:
        return 'q3'
    if value >= high_cut:
        return 'q4'
    return 'Error'


def text_bin(bytes):
    """Bin a text byte count using the module-level text_q1..text_q4 cut points."""
    # The parameter name shadows the builtin `bytes`; kept for compatibility.
    return _quantile_bin(bytes, text_q1, text_q2, text_q3, text_q4)


def image_bin(bytes):
    """Bin an image byte count using the module-level image_q1..image_q4 cut points."""
    return _quantile_bin(bytes, image_q1, image_q2, image_q3, image_q4)


def video_bin(bytes):
    """Bin a video byte count using the module-level video_q1..video_q4 cut points."""
    return _quantile_bin(bytes, video_q1, video_q2, video_q3, video_q4)

In [268]:
# Spot-check video_bin with a single value.
video_bin(21175721.0)

'q4'

In [196]:
# Add web content statistics to feature dict.
# Rows are visited in icpstatistic_sorted order and setdefault() never
# overwrites, so each ICP keeps the values from the first row seen for it.
# (feature name, position in the itertuples() row); position 0 is the frame
# index, position 1 the ICP name, and position 10 (createtime) is not stored.
stat_positions = [
    ('textnum', 2), ('imagenum', 3), ('videonum', 4), ('unknownnum', 5),
    ('textbytes', 6), ('imagebytes', 7), ('videobytes', 8), ('unknownbytes', 9),
    ('ts', 11),
]

for entry in icpstatistic_sorted.itertuples():
    icp = entry[1]

    # Hasn't used a CDN
    if icp not in icp_to_uidx:
        continue

    # Fill in only the fields that have no entry yet
    icp_features = icp_feature_dict_prelim[icp]
    for feature_name, position in stat_positions:
        icp_features.setdefault(feature_name, entry[position])

In [197]:
# Print an ICP once for every expected feature it is still missing.
expected_features = (
    'industry',
    'textnum', 'imagenum', 'videonum', 'unknownnum',
    'textbytes', 'imagebytes', 'videobytes', 'unknownbytes',
)
for icp, features in icp_feature_dict_prelim.iteritems():
    for feature_name in expected_features:
        if feature_name not in features:
            print(icp)

www.chinacourt.org
www.chinacourt.org
www.chinacourt.org
www.chinacourt.org
www.chinacourt.org
www.chinacourt.org
www.chinacourt.org
www.chinacourt.org


In [198]:
# Manually adding info for www.chinacourt.org - got listed as "chinacourt.org"
# (no "www." prefix) in icpstatistic, so the name-based merge did not fill it.
# NOTE(review): the match is a substring test, so any ICP whose name contains
# 'chinacourt.org' qualifies; only the first matching row in sorted order is used.
chinacourt_positions = [
    ('textnum', 2), ('imagenum', 3), ('videonum', 4), ('unknownnum', 5),
    ('textbytes', 6), ('imagebytes', 7), ('videobytes', 8), ('unknownbytes', 9),
    ('ts', 11),
]

for entry in icpstatistic_sorted.itertuples():
    icp_raw = entry[1]
    if 'chinacourt.org' not in icp_raw:
        continue

    icp = 'www.chinacourt.org'
    chinacourt_features = icp_feature_dict_prelim[icp]
    # setdefault() only fills fields that are still missing
    for feature_name, position in chinacourt_positions:
        chinacourt_features.setdefault(feature_name, entry[position])

    break

In [199]:
# Re-run the completeness check: print an ICP once per feature still missing.
expected_features = (
    'industry',
    'textnum', 'imagenum', 'videonum', 'unknownnum',
    'textbytes', 'imagebytes', 'videobytes', 'unknownbytes',
)
for icp, features in icp_feature_dict_prelim.iteritems():
    for feature_name in expected_features:
        if feature_name not in features:
            print(icp)

In [200]:
# Inspect the merged preliminary features for one ICP.
icp_feature_dict_prelim['www.qq.com']

{'imagebytes': 51454,
 'imagenum': 40,
 'industry': '0100',
 'textbytes': 79575808,
 'textnum': 2716,
 'ts': '20150429',
 'unknownbytes': 3711144,
 'unknownnum': 160,
 'videobytes': 0,
 'videonum': 0}

In [201]:
# import pickle
# with open('20170703-icp-features-prelim.pkl', 'wb') as output:
#     pickle.dump(icp_feature_dict_prelim, output, -1)

In [288]:
# Final per-ICP features: keep the hostname and industry, and replace the raw
# byte totals with their bin labels.
icp_feature_dict_final = {}
for icp, features in icp_feature_dict_prelim.iteritems():
    icp_feature_dict_final[icp] = {
        'icp': icp,
        'industry': features['industry'],
        'text_bin': text_bin(features['textbytes']),
        'image_bin': image_bin(features['imagebytes']),
        'video_bin': video_bin(features['videobytes']),
    }

In [289]:
# Collect the ICPs whose video volume falls in the top bin.
# (The loop variable is called `cdn`, but the keys are ICP hostnames.)
video_heavy = []
for cdn, feature in icp_feature_dict_final.iteritems():
    if feature['video_bin'] == 'q4':
        video_heavy.append(cdn)
print(len(video_heavy))
print(video_heavy)

154
['www.oppo.com', 'www.guolairen.com', 'www.nipic.com', 'www.dayoo.com', 'www.esteelauder.com.cn', 'www.hebei.com.cn', 'www.taisha.org', 'www.titan24.com', 'www.abbottmama.com.cn', 'www.icbc.com.cn', 'www.17173.com', 'www.go108.com.cn', 'www.313.com', 'www.cztv.com', 'www.9978.cn', 'nut.com.cn', 'www.nanhutravel.com', 'www.wifigx.com', 'www.dqdaily.com', 'www.cpic.com.cn', 'www.foodmate.net', 'www.ofweek.com', 'www.safehoo.com', 'www.55188.com', 'www.gxnews.com.cn', 'www.bmw.com.cn', 'www.sf-express.com', 'www.smzy.com', 'www.xa999.com', 'www.xiami.com', 'www.ce.cn', 'www.smartshe.com', 'www.gq.com.cn', 'www.koolearn.com', 'www.juooo.com', 'www.yesky.com', 'www.ffpic.com', 'www.feel-bar.com', 'www.kuwo.cn', 'www.sznews.com', 'www.pcgames.com.cn', 'www.voc.com.cn', 'www.hqew.com', 'www.cnnb.com.cn', 'www.0731fdc.com', 'www.xmhouse.com', 'www.6eat.com', 'www.xywy.com', 'www.chinanews.com', 'www.tingroom.com', 'www.weather.com.cn', 'www.zxart.cn', 'www.3dmgame.com', 'www.infinitus.com.

In [290]:
# Create a list of ICP feature dicts
# Ordered by uidx (user/ICP index), so position i holds the features of
# uidx_to_icp[i].
icp_feature_list = []
for uidx in range(num_icps):
    icp_feature_list.append(icp_feature_dict_final[uidx_to_icp[uidx]])

In [291]:
# NOTE(review): this displays the whole list (one dict per ICP);
# consider icp_feature_list[:5] instead.
icp_feature_list

[{'icp': '365jilin.com',
  'image_bin': 'q1',
  'industry': '0100',
  'text_bin': 'q1',
  'video_bin': 'q0'},
 {'icp': 'www.100ec.cn',
  'image_bin': 'q4',
  'industry': '0100',
  'text_bin': 'q4',
  'video_bin': 'q2'},
 {'icp': 'www.163.com',
  'image_bin': 'q1',
  'industry': '0100',
  'text_bin': 'q2',
  'video_bin': 'q0'},
 {'icp': 'www.21cbh.com',
  'image_bin': 'q2',
  'industry': '0100',
  'text_bin': 'q4',
  'video_bin': 'q0'},
 {'icp': 'www.21cn.com',
  'image_bin': 'q3',
  'industry': '0100',
  'text_bin': 'q2',
  'video_bin': 'q2'},
 {'icp': 'www.2500sz.com',
  'image_bin': 'q3',
  'industry': '0100',
  'text_bin': 'q4',
  'video_bin': 'q3'},
 {'icp': 'www.315che.com',
  'image_bin': 'q2',
  'industry': '0100',
  'text_bin': 'q2',
  'video_bin': 'q0'},
 {'icp': 'www.3dmgame.com',
  'image_bin': 'q4',
  'industry': '0100',
  'text_bin': 'q4',
  'video_bin': 'q4'},
 {'icp': 'www.4hw.com.cn',
  'image_bin': 'q2',
  'industry': '0100',
  'text_bin': 'q2',
  'video_bin': 'q1'},
 

In [285]:
# Make sure indices match up: uidx 1 should name the same ICP as the
# 'icp' field of icp_feature_list[1].
uidx_to_icp[1]

'www.100ec.cn'

In [292]:
# Vectorize! (One-hot encodings of each ICP)
# DictVectorizer one-hot encodes string-valued features. Each feature dict
# also carries its own 'icp' hostname, so every ICP contributes a column of
# its own in addition to the shared industry / *_bin columns.
from sklearn.feature_extraction import DictVectorizer
icp_vectorizer = DictVectorizer()
icp_feature_vectors = icp_vectorizer.fit_transform(icp_feature_list)

In [293]:
# (number of ICP feature dicts, number of one-hot feature columns)
icp_feature_vectors.shape

(1151, 1172)

In [295]:
1172-1151
# Columns left over after the 1151 per-ICP one-hot columns:
# 6 industries + 3 content types x 5 bins (q0-q4) = 21

21

In [296]:
# Show the sparse matrix summary (shape, dtype, stored elements).
icp_feature_vectors

<1151x1172 sparse matrix of type '<type 'numpy.float64'>'
	with 5755 stored elements in Compressed Sparse Row format>

In [297]:
import pickle
# Persist the one-hot feature matrix.
# Protocol -1 selects the highest available pickle protocol, which is a
# binary format, so the file must be opened in binary mode ('wb'). Text mode
# can corrupt the stream on platforms that translate newlines, and is
# rejected outright when the file object only accepts text.
with open('20170703-icp-feature-vectors.pkl', 'wb') as output:
    pickle.dump(icp_feature_vectors, output, -1)