In [11]:
'''
In this example, the website has a number of topics,
and there is a fixed number of categories under each topic.

The user's recent visiting history is memorized and
used to predict the topic the user is most likely interested in.
Based on the predicted topic, the host can offer/recommend any websites under the categories that belong to
that topic.

'''

from pandas import DataFrame, read_csv
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

import pickle
from sklearn.ensemble import GradientBoostingClassifier

## Load the category spreadsheet ('xls' file)
# Only the first `n_categories_in_topic` rows are read into the DataFrame
n_categories_in_topic = 6
filepath = 'category.xls'

df = pd.read_excel(filepath, nrows=n_categories_in_topic)
print(df)




                     Arts       Automobile              Business  \
0            Architecture            Autos           Advertising   
1                  Awards         Aviation          Associations   
2                   Humor          Boating          Construction   
3         Performing Arts             Cars      Customer Service   
4             Photography      Motorcycles            E-Commerce   
5      Visual Arts Design           Trains             Logistics   

           Career                     Computer              Education  \
0        Business      Graphics and Multimedia      Business Training   
1            Jobs                   Networking               Colleges   
2      Employment                  Programming        Human Resources   
3          Salary                     Security           Universities   
4         Company                     Software                 School   
5       Positions                      Devices               Research   

   Entertai

In [2]:
### Convert categories into user profile representation
# Each topic (one spreadsheet column) is a class
Topics = df.columns
print ("The class of user: ", Topics)
print ("The number of classes: %s" %len(Topics))

# Create the category vector by laying the topic columns end to end, in column order.
# Building it in one call avoids relying on a specific first column name ('Arts'),
# which previously left `category_array` undefined (or stale on a re-run) when the
# spreadsheet started with a different topic.
category_array = np.concatenate(
    [df[topic].astype(str).to_numpy() for topic in Topics]
)

# Drop the first 4 characters of every category name.
# NOTE(review): presumably the spreadsheet cells carry a fixed 4-character prefix -- confirm.
category_prefix_len = 4
category_array = np.array(
    [category[category_prefix_len:] for category in category_array], dtype=object
)

print ("The categories on the website: ", category_array)
print ("The number of categories: %s" %len(category_array))


The class of user:  Index(['Arts', 'Automobile', 'Business', 'Career', 'Computer', 'Education',
       'Entertainment', 'Finance', 'Fitness', 'Gambling', 'Games',
       'Government', 'Health', 'Hobbies', 'Home', 'Industry',
       'Information Technology', 'News and Media', 'People and Society',
       'Reference', 'Science', 'Shopping', 'Sports', 'Travel'],
      dtype='object')
The number of classes: 24
The categories on the website:  ['Architecture' 'Awards' 'Humor' 'Performing Arts' 'Photography'
 'Visual Arts Design' 'Autos' 'Aviation' 'Boating' 'Cars' 'Motorcycles'
 'Trains' 'Advertising' 'Associations' 'Construction' 'Customer Service'
 'E-Commerce' 'Logistics' 'Business' 'Jobs' 'Employment' 'Salary'
 'Company' 'Positions' 'Graphics and Multimedia' 'Networking'
 'Programming' 'Security' 'Software' 'Devices' 'Business Training'
 'Colleges' 'Human Resources' 'Universities' 'School' 'Research'
 'Animation' 'Comics' 'Fashion' 'Modeling' 'News' 'Movies' 'Accounting'
 'Banking' 'Cred

In [19]:
## Functions
def create_user_profile(f,m):
    '''
    Build one random binary visiting profile.

    f : feature length (i.e., the number of categories)
    m : memory size (how many entries are set to 1)

    Returns a float array of length f whose first m entries were set to 1
    and which was then shuffled in place with NumPy's global random state.
    '''
    profile = np.zeros(f)
    # Mark m visited slots first, then scatter them at random
    profile[:m] = 1
    np.random.shuffle(profile)
    return profile


def generate_label(usr_data, Topics, n_categories_in_topic):
    '''
    Return the topic with the highest visiting score for one user profile.

    usr_data : 1-D sequence of visit indicators, laid out topic by topic
               (n_categories_in_topic consecutive entries per topic)
    Topics : indexable sequence of topic names, in the same order as the layout
    n_categories_in_topic : number of categories belonging to each topic

    The score of a topic is the sum of ALL of its category entries. The previous
    slice ended at n_categories_in_topic*(idx+1)-1 and therefore ignored the last
    category of every topic. Ties are resolved in favour of the first topic
    (np.argmax behaviour).
    '''
    score_array = np.zeros(len(Topics))
    for idx in range(len(Topics)):
        start = n_categories_in_topic * idx
        stop = start + n_categories_in_topic  # exclusive end: covers the whole topic
        score_array[idx] = np.sum(usr_data[start:stop])

    score_idx = np.argmax(score_array)
    return Topics[score_idx]



def prediction_accracy(y_pred, y_truth):
    '''
    Percentage of positions at which y_pred equals y_truth.

    y_pred : indexable sequence of predicted labels
    y_truth : indexable sequence of true labels (indexed up to len(y_pred))

    Returns the accuracy as a value between 0 and 100.
    '''
    n_samples = len(y_pred)
    n_hits = sum(1 for pos in range(n_samples) if y_pred[pos] == y_truth[pos])
    return n_hits/n_samples*100
        

In [4]:
## Parameters
memory_size = 50      # number of visited categories kept per user profile
n_train = 150_000     # number of training profiles
n_test = 50_000       # number of test profiles

In [5]:
## Create user profile with binary user profile representation
'''
1 : The user has visited this category
0 : The user has never visited this category
memory_size = the number of saved visited categories
Let us assume that only the 'memory_size' most recently visited categories are saved
(i.e., the total number of 1s is the same as 'memory_size')
'''

## Create 200,000 user profiles (150,000: training, 50,000:test)
## Then assign into dataframe and save into csv file
feat_len = len(category_array)
user_train_array = np.zeros((n_train, feat_len))
user_test_array = np.zeros((n_test, feat_len))


# Create array: one random binary profile per row
for idx in tqdm(range(n_train)):
    user_train_array[idx,:] = create_user_profile(feat_len,memory_size)

for idx in tqdm(range(n_test)):
    user_test_array[idx,:] = create_user_profile(feat_len,memory_size)


# Assign to df
df_train = pd.DataFrame(user_train_array, columns=category_array)
# The test frame must carry the generated test profiles; it was previously
# created with column names only, so an empty CSV was written.
df_test = pd.DataFrame(user_test_array, columns=category_array)


# Save to the files (skipped when the training file can already be opened)
# NOTE: the arrays above are regenerated on every run, so existing CSV files
# are not guaranteed to match the in-memory profiles.
filename_train = 'user_train.csv'
filename_test = 'user_test.csv'
try:
    # Probe for existence; the context manager closes the handle immediately
    with open(filename_train):
        pass
except IOError:
    df_train.to_csv(filename_train)
    df_test.to_csv(filename_test)
else:
    print("Files already exist")
    






100%|██████████| 150000/150000 [00:04<00:00, 32624.47it/s]
100%|██████████| 50000/50000 [00:01<00:00, 31893.18it/s]

Files already exist





In [9]:
## Create labels for the training
'''
In real data, the label of a user should be the topic the user is interested in.
This kind of data can be obtained by survey or by recording the websites actually visited after the 50 memorized visits.

However, in this synthetic data, the label (the user's possible topic of interest in the future) is
the highest-scoring topic in his/her memorized visiting history.
The score is simply the sum of all the ones in the topic.

'''
# Generate training labels: one topic per training profile
labels_train = [
    generate_label(usr_data=user_train_array[row, :], Topics=Topics,
                   n_categories_in_topic=n_categories_in_topic)
    for row in tqdm(range(n_train))
]

# Generate ground truth the same way for the test profiles
g_truth_test = [
    generate_label(usr_data=user_test_array[row, :], Topics=Topics,
                   n_categories_in_topic=n_categories_in_topic)
    for row in tqdm(range(n_test))
]

# Convert list to array
labels_train_array = np.asarray(labels_train)
g_truth_test_array = np.asarray(g_truth_test)
print (len(labels_train_array))
print (labels_train_array)


100%|██████████| 150000/150000 [00:06<00:00, 23663.29it/s]
100%|██████████| 50000/50000 [00:02<00:00, 23402.22it/s]

150000
['Gambling' 'Fitness' 'Arts' ... 'Entertainment' 'People and Society'
 'Home']





In [13]:
## Train any machine learning model
'''
Any machine learning model can be used if we have labeled training samples.
Here, a gradient boosting (GB) classifier is used as an example.
'''
filename = 'GBC_web_model.sav'
train_X = user_train_array
train_y = labels_train_array
try:
    # Reuse a previously trained model when the file can be opened.
    # NOTE(security): pickle.load can execute arbitrary code; only load model
    # files that were produced by this notebook / a trusted source.
    with open(filename, 'rb') as model_file:
        print("Trained model already exists")
        my_model = pickle.load(model_file)

except IOError:
    # No cached model available: train from scratch
    my_model = GradientBoostingClassifier(random_state=21, learning_rate=0.1, n_estimators=50, verbose=1)
    print("Training in progress...")
    my_model.fit(train_X, train_y)
    # Save trained model; the context manager guarantees the file is flushed and closed
    with open(filename, 'wb') as model_file:
        pickle.dump(my_model, model_file)



Training in progress...
      Iter       Train Loss   Remaining Time 
         1      397282.2364           29.28m
         2      349539.6366           28.64m
         3      316409.9929           28.10m
         4      290489.7934           27.47m
         5      269260.2127           26.89m
         6      251704.2178           26.36m
         7      236590.9209           25.80m
         8      223536.8647           25.20m
         9      211921.9729           24.66m
        10      201559.0870           24.09m
        20      137435.6275           18.37m
        30      107879.3536           12.38m
        40       90805.7148            6.21m
        50       79110.6472            0.00s


In [21]:
## Test on trained model
'''
The goal of this machine learning model is to predict which topic the user is interested in
when his/her visiting history is given.

In other words, the AI learns the relation between the 'visiting history' and the 'future interest' of users.
In this example, the relation is a simple rule (highest score).

Hence, the machine learning model learns this simple rule from the data and gives its output based on it.
If the model predicts a topic as an output, then the categories/websites in this topic can be offered to the user.

'''
# Predict a topic for every test profile
test_X = user_test_array
test_result = my_model.predict(test_X)

# Show ground truth next to the prediction for the first 1000 users
pd.set_option('display.max_rows', 1000)
test_print = pd.DataFrame({
    'user_intesert': g_truth_test_array.flatten(),
    'prediction(offer)': test_result.flatten(),
})
print (test_print.head(1000))


# Overall accuracy (in percent) on the test set
pred_accuracy = prediction_accracy(test_result, g_truth_test_array)
print ("prediction accuracy is %s%%" %pred_accuracy)

              user_intesert       prediction(offer)
0                Automobile              Automobile
1                      Arts                    Arts
2                Automobile              Automobile
3        People and Society      People and Society
4                      Arts                    Arts
5                   Finance                 Finance
6                  Gambling                Gambling
7                  Gambling                Gambling
8                  Shopping                Shopping
9                Government              Government
10                Education               Education
11       People and Society      People and Society
12            Entertainment           Entertainment
13                 Gambling                Gambling
14                 Business                Business
15                     Arts                    Arts
16                   Career                  Career
17                    Games                   Games
18          

In [None]:
'''
The prediction accuracy is 99.19% even though the training loss has not converged
(it may reach 100 percent because the underlying logic is very simple).
In a real case, the relation (between data and label) may not follow a simple visiting score.
Contextual information beyond the binary visiting history should be considered, and
a more complex learning model would also be needed for learning and prediction.

'''