In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the three raw tables from CSV files in the working directory.
user_data, ads_data, user_messages = (
    pd.read_csv(csv_path)
    for csv_path in ('user_data.csv', 'ads_data.csv', 'user_messages.csv')
)

In [None]:
# Replace each timestamp column with its proleptic Gregorian ordinal (an integer
# day number), so the time-of-day part is discarded.
import datetime as dt

for frame, time_column in ((user_data, 'event_time'), (ads_data, 'creation_time')):
    frame[time_column] = pd.to_datetime(frame[time_column]).map(dt.datetime.toordinal)


In [None]:
# Label-encode the categorical `event` column.
# NOTE: afterwards `user_data['event']` holds integer codes, not the original strings.
from sklearn import preprocessing

le = preprocessing.LabelEncoder().fit(user_data['event'])
# Lookup from each original event label to its integer code.
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
user_data['event'] = le.transform(user_data['event'])

In [None]:
# Label-encode the categorical `channel` column.
# This rebinds `le` and `le_name_mapping`, so whatever they held before is replaced.
le = preprocessing.LabelEncoder().fit(user_data['channel'])
# Lookup from each original channel label to its integer code.
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
user_data['channel'] = le.transform(user_data['channel'])

In [None]:
# Inner-join every user event with the attributes of the ad it refers to.
combined = user_data.merge(ads_data, on='ad_id')

In [None]:
# Build the positive-label lookup: one "<user_id>|<ad_id>" key (value 1) for every
# ad listed for a user in `user_messages`.
user_ad_dict = {}
for index, row in user_messages.iterrows():
    user_prefix = str(row['user_id']) + '|'
    # `ads` is a bracketed, comma-separated string. Strip the whitespace around each
    # id so that "[1, 2]" yields "1" and "2" (not " 2"); otherwise the keys would
    # never match the str(ad_id) used for lookups.
    ads = [ad.strip() for ad in row['ads'].replace('[','').replace(']','').split(',')]
    for ad in ads:
        # An empty list "[]" splits into a single empty token; skip it.
        if ad:
            user_ad_dict[user_prefix + ad] = 1
        
def isRecommended(row):
    """Return 1 when the row's "<user_id>|<ad_id>" key is in `user_ad_dict`, else 0."""
    lookup_key = '|'.join((str(row['user_id']), str(row['ad_id'])))
    return user_ad_dict.get(lookup_key, 0)

# Label every merged event row with the result of the lookup.
combined['isRecommended'] = combined.apply(isRecommended, axis=1)

In [None]:
# Order the merged event rows by `event_time`, ascending.
combined = combined.sort_values('event_time')

# Recommending the most popular

In [None]:
# Per ad, count the rows with a non-null `ad_messages` value, then keep the ids
# of the 10 ads with the highest count.
df = user_data.groupby('ad_id', as_index=False)[['ad_messages']].count()
most_popular_ads = df.nlargest(10, 'ad_messages')['ad_id'].tolist()

In [None]:
# Model 1: recommend the same globally most popular ads to every user.
# Work on a copy: a plain assignment would only alias `user_messages`, so adding
# the `recommend` column would silently modify the source table as well.
sub1 = user_messages.copy()
sub1['recommend'] = str(most_popular_ads)
sub1.head()

# Model Evaluation

In [None]:
def evaluate_accuracy(df):
    """Count how many of the actual ads appear among the recommended ads.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have string columns ``ads`` and ``recommend``, each holding a
        bracketed, comma-separated list of ad ids such as ``"[1, 2, 3]"``.

    Returns
    -------
    int
        Number of hits summed over all rows (a raw count, not a ratio).
    """
    def _parse_ids(text):
        # Drop the brackets, split on commas, and strip each token so that
        # "[1, 2]" and "[1,2]" parse identically; empty tokens (from "[]") are dropped.
        tokens = text.replace('[', '').replace(']', '').split(',')
        return [token.strip() for token in tokens if token.strip()]

    count = 0
    for index, row in df.iterrows():
        recommended_ads = set(_parse_ids(row['recommend']))
        # Every occurrence of an actual ad that was recommended counts as one hit.
        count += sum(1 for ad in _parse_ids(row['ads']) if ad in recommended_ads)

    return count
                

In [None]:
def evaluate_precision(df):
    """Mean average precision of the recommendations over the rows of ``df``.

    For each row, every recommended ad that is one of the row's actual ads
    contributes the precision at its rank (hits so far / rank). The
    contributions are summed and divided by the number of distinct actual ads,
    and the per-row values are then averaged. Each row is scored independently,
    so one row's list length never rescales another row's score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have string columns ``ads`` and ``recommend``, each holding a
        bracketed, comma-separated list of ad ids such as ``"[1, 2, 3]"``.

    Returns
    -------
    float
        Mean of the per-row scores, in [0, 1]; 0.0 when ``df`` has no rows.
    """
    def _parse_ids(text):
        # Drop the brackets, split on commas, and strip each token so that
        # "[1, 2]" and "[1,2]" parse identically; empty tokens (from "[]") are dropped.
        tokens = text.replace('[', '').replace(']', '').split(',')
        return [token.strip() for token in tokens if token.strip()]

    row_scores = []
    for index, row in df.iterrows():
        ads = set(_parse_ids(row['ads']))
        recommended_ads = _parse_ids(row['recommend'])

        hits = 0
        precision_sum = 0.0
        already_counted = set()
        for rank, ad in enumerate(recommended_ads, start=1):
            # A repeated recommendation must not be rewarded twice.
            if ad in ads and ad not in already_counted:
                already_counted.add(ad)
                hits += 1
                precision_sum += hits / float(rank)

        # A row without actual ads cannot be hit; score it 0 instead of dividing by zero.
        row_scores.append(precision_sum / len(ads) if ads else 0.0)

    return sum(row_scores) / len(row_scores) if row_scores else 0.0
                

In [None]:
# Score model 1 with both metrics.
model1_accuracy = evaluate_accuracy(sub1)
print('accuracy of model 1 : ', model1_accuracy)
model1_precision = evaluate_precision(sub1)
print('precision of model 1 : ', model1_precision)

# Recommending category-wise most popular elements

In [None]:
# Sum `ad_views` per ad and keep the ids of the 10 ads with the largest total.
df = user_data.groupby('ad_id', as_index=False)[['ad_views']].sum()
most_popular_ads = df.nlargest(10, 'ad_views')['ad_id'].tolist()

In [None]:
# Attach each ad's `category_id` to the `ad_views` column of the event rows
# (inner join on `ad_id`).
df1 = user_data.loc[:, ['ad_id', 'ad_views']]
df2 = ads_data.loc[:, ['ad_id', 'category_id']]
merged_data = df1.merge(df2, on='ad_id')

In [None]:
# Per (category, ad) pair, count the rows with a non-null `ad_views` value.
most_popular_ads_catwise = (
    merged_data
    .groupby(['category_id', 'ad_id'], as_index=False)
    .count()
)

In [None]:
# Distinct category ids found in the per-category counts, in order of first appearance.
df = most_popular_ads_catwise
cats = df['category_id'].unique()
cats

In [None]:
# For every category, the ids of its 10 ads with the highest `ad_views` count.
cat_dict = {
    cat: df.loc[df['category_id'] == cat].nlargest(10, 'ad_views')['ad_id'].tolist()
    for cat in cats
}

print(cat_dict)

In [None]:
def catWisePopular(row):
    """Return, as a string, the ad list that `cat_dict` holds for the row's `category_id`."""
    category = row['category_id']
    top_ads = cat_dict[category]
    return str(top_ads)

# Model 2: recommend the most popular ads of each row's category.
# Work on a copy: a plain assignment would only alias `user_messages`, so writing
# `recommend` would also overwrite that column on every other alias of the table.
sub2 = user_messages.copy()
sub2['recommend'] = sub2.apply(catWisePopular, axis = 1)
sub2.head()

In [None]:
# Score model 2 with both metrics.
model2_accuracy = evaluate_accuracy(sub2)
print('accuracy of model 2 : ', model2_accuracy)
model2_precision = evaluate_precision(sub2)
print('precision of model 2 : ', model2_precision)

# Classification-based model

In [None]:
#columns = [col for col in combined.columns if col not in ['user_lat', 
#'user_long', 'lat', 'long', 'title', 'description', 'origin', 'source']]

#combined = combined[columns]

In [None]:
import numpy as np
# Turn +/-infinity into NaN, then fill every NaN with 0.
combined = (
    combined
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the merged rows for validation; the fixed seed keeps the split reproducible.
train, val = train_test_split(combined, random_state=0, test_size=0.2)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit a depth-1 decision tree on the `lat` and `user_lat` columns to predict `isRecommended`.
features = ['lat','user_lat']
clf = DecisionTreeClassifier(random_state=0, max_depth=1)
clf.fit(train[features], train['isRecommended'])

# Store the prediction for every merged row (training and validation rows alike).
combined['pred'] = clf.predict(combined[features])

train_accuracy = accuracy_score(train['isRecommended'], clf.predict(train[features]))
print("Traning Accuracy :" + str(train_accuracy))
val_accuracy = accuracy_score(val['isRecommended'], clf.predict(val[features]))
print("Validation Accuracy :" + str(val_accuracy))


In [None]:
# Per "<user_id>|<category_id>" key, collect the ads the classifier predicted as positive.
# Ids are kept in first-seen order without duplicates. The values are lists (not sets)
# because getRecommendations concatenates them with other lists using `+`.
predicted_positive = combined.loc[combined['pred'] == 1, ['user_id', 'category_id', 'ad_id']]

# Insertion-ordered dicts act as ordered sets while collecting.
ads_by_key = {}
for user_id, category_id, ad_id in zip(predicted_positive['user_id'],
                                       predicted_positive['category_id'],
                                       predicted_positive['ad_id']):
    key = str(user_id) + "|" + str(category_id)
    ads_by_key.setdefault(key, {})[ad_id] = None

personalized_dict = {key: list(ads) for key, ads in ads_by_key.items()}

# Merged Approach

In [None]:
# Per user, collect the ads they viewed and the ads they sent a first message about.
#
# `user_data['event']` was replaced by integer codes when it was label-encoded, so it
# can no longer be compared with the label strings 'view' / 'first_message'. The raw
# labels are therefore re-read from the CSV and matched to `user_data` by row position
# (assumes `user_data` still has the file's rows in the file's order).
raw_events = pd.read_csv('user_data.csv', usecols=['event'])['event']
if len(raw_events) != len(user_data):
    raise ValueError('user_data.csv no longer lines up row-for-row with user_data')

# Insertion-ordered dicts act as ordered sets: each ad is kept once per user,
# in first-seen order.
viewed_by_user = {}
messaged_by_user = {}
for event_name, user_id, ad_id in zip(raw_events, user_data['user_id'], user_data['ad_id']):
    if event_name == 'first_message':
        messaged_by_user.setdefault(user_id, {})[ad_id] = None
    elif event_name == 'view':
        viewed_by_user.setdefault(user_id, {})[ad_id] = None

# Plain lists, because getRecommendations concatenates them with `+`.
user_dict_view = {user_id: list(ads) for user_id, ads in viewed_by_user.items()}
user_dict_fmsg = {user_id: list(ads) for user_id, ads in messaged_by_user.items()}
    

In [None]:
def getRecommendations(row):
    """Return up to 10 candidate ads for one row, as the string form of a list.

    Candidates are gathered in this priority order and de-duplicated, so a
    repeated ad does not take up more than one of the 10 slots:

    1. ads from `user_dict_view` for the row's user,
    2. ads from `personalized_dict` for the row's "<user_id>|<category_id>" key,
    3. ads from `user_dict_fmsg` for the row's user,
    4. ads from `cat_dict` for the row's category.

    A user or category missing from a lookup contributes no candidates.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide ``user_id`` and ``category_id``.

    Returns
    -------
    str
        ``str()`` of the list holding the first 10 distinct candidates.
    """
    user = str(row['user_id'])
    cat = str(row['category_id'])

    # The per-user dicts are looked up by integer id. Convert the raw value directly
    # (so a float such as 12.0 also works) and fall back to the raw value otherwise.
    try:
        user_key = int(row['user_id'])
    except (TypeError, ValueError):
        user_key = row['user_id']

    personalized_ads = personalized_dict.get(user + '|' + cat, [])
    user_most_viewed = user_dict_view.get(user_key, [])
    user_messaged = user_dict_fmsg.get(user_key, [])

    most_popular = cat_dict.get(row['category_id'], [])

    candidates = user_most_viewed + personalized_ads + user_messaged + most_popular
    # dict.fromkeys removes duplicates while keeping the priority order.
    allAds = list(dict.fromkeys(candidates))
    # here we can write some scoring logic for choosing the one to recommend among these
    return str(allAds[0:10])
    

In [None]:
# Merged model: build each row's recommendations with getRecommendations, which
# combines the per-user, classifier-based, and category-popularity candidates.
# Work on a copy so that `user_messages` itself is not modified.
sub = user_messages.copy()
sub['recommend'] = sub.apply(getRecommendations, axis = 1)

print('accuracy of model : ',evaluate_accuracy(sub))
print('precision of model : ',evaluate_precision(sub))