In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors 

from scipy.sparse import csr_matrix


In [None]:
final_dataset = pd.read_csv('cf_final_dataset.csv',sep=',')
final_dataset

In [73]:
final_dataset= final_dataset[['bundle_name','id_bundle','msisdn_id_x','sum_subscriptions']]
df_user_item_matrix = final_dataset.pivot(index='id_bundle',columns='msisdn_id_x',values='sum_subscriptions').fillna(0)

In [74]:
#Creation Matrice
sample = np.array([[0,0,3,0,0],[4,0,0,0,2],[0,0,0,0,1]])
sparsity = 1.0 - ( np.count_nonzero(sample) / float(sample.size) )
csr_sample = csr_matrix(sample)
csr_data = csr_matrix(df_user_item_matrix.values)
df_user_item_matrix.reset_index(inplace=True)

In [75]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [76]:
# Function to recommend bundle with K-NN
def get_bundle_recommendation(movie_name):
                n_movies_to_reccomend = 10
                movie_list = final_dataset[final_dataset['bundle_name'].str.contains(movie_name)]  
                if len(movie_list):        
                    movie_idx= movie_list.iloc[0]['id_bundle']
                    movie_idx = df_user_item_matrix[df_user_item_matrix['id_bundle'] == movie_idx].index[0]

                    distances , indices = knn.kneighbors(csr_data[movie_idx],n_neighbors=n_movies_to_reccomend+1)    
                    rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),\
                                               key=lambda x: x[1])[:0:-1]
        
                    recommend_frame = []

                    for val in rec_movie_indices:
                            movie_idx = df_user_item_matrix.iloc[val[0]]['id_bundle']
                            idx = final_dataset[final_dataset['id_bundle'] == movie_idx].index
                            recommend_frame.append({'bundle_name':final_dataset.iloc[idx]['bundle_name'].values[0],'Distance':val[1]})
                    dfx = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
                    return dfx
    
                else:
                    return "No movies found. Please check your input"

In [77]:
get_bundle_recommendation('BTL 4G Bundle 8 Weekly')

Unnamed: 0,bundle_name,Distance
1,BTL 4G Bundle 10 Monthly,0.989342
2,BTL 4G Bundle 9 Monthly,0.986853
3,BTL 4G Bundle 7 Weekly,0.986292
4,BTL 4G Bundle 11 Monthly,0.985987
5,BTL 4G Bundle 2 Weekly,0.984845
6,BTL 4G Bundle 16 Monthly,0.983001
7,BTL 4G Bundle 12 Monthly,0.980844
8,BTL 4G Bundle 6 Monthly,0.979135
9,BTL 4G Bundle 3 Weekly,0.977822
10,BTL 4G Bundle 1 Daily,0.973917


# surprise_knn_recommendation

In [78]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold
from surprise import Reader
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns
from surprise.model_selection import GridSearchCV

In [79]:
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data=Dataset.load_from_df(final_dataset[['msisdn_id_x', 'id_bundle', 'sum_subscriptions']], reader)
anti_set = data.build_full_trainset().build_anti_testset()

In [80]:
#anti_set

In [81]:
bundles = final_dataset[['bundle_name' , 'id_bundle']].drop_duplicates(['bundle_name' , 'id_bundle'])
users = final_dataset[['msisdn_id_x']].drop_duplicates(['msisdn_id_x'])


### Modeling


In [82]:
sim_options = { 'name': 'cosine' ,'user_based':  False}
kf = KFold(n_splits=5)
algo = KNNWithMeans(k =3 , sim_options = sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_rmse= rmse
        best_algo = algo
        best_pred = predictions

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3340
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.2977
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.6675
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.5877
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.7336


In [83]:
best_pred=pd.DataFrame(best_pred)
best_pred

Unnamed: 0,uid,iid,r_ui,est,details
0,472935,5,11.0,1.646329,"{'was_impossible': True, 'reason': 'User and/o..."
1,497706,18,2.0,1.646329,"{'was_impossible': True, 'reason': 'User and/o..."
2,371806,6,1.0,1.000000,"{'actual_k': 1, 'was_impossible': False}"
3,1772,0,4.0,1.646329,"{'was_impossible': True, 'reason': 'User and/o..."
4,375945,5,1.0,1.646329,"{'was_impossible': True, 'reason': 'User and/o..."
...,...,...,...,...,...
5010,24567,16,1.0,1.646329,"{'was_impossible': True, 'reason': 'User and/o..."
5011,414803,5,1.0,1.646329,"{'was_impossible': True, 'reason': 'User and/o..."
5012,440315,10,1.0,1.646329,"{'was_impossible': True, 'reason': 'User and/o..."
5013,246745,5,1.0,1.116729,"{'actual_k': 1, 'was_impossible': False}"


In [84]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25076 entries, 0 to 25075
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   bundle_name        25076 non-null  object
 1   id_bundle          25076 non-null  int64 
 2   msisdn_id_x        25076 non-null  int64 
 3   sum_subscriptions  25076 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 783.8+ KB


In [85]:
pred_df = pd.DataFrame(best_pred).merge(final_dataset , left_on = ['iid', 'uid'], right_on = ['id_bundle','msisdn_id_x'])
pred_df[['uid', 'iid', 'msisdn_id_x', 'bundle_name', 'id_bundle', 'est','sum_subscriptions']]


Unnamed: 0,uid,iid,msisdn_id_x,bundle_name,id_bundle,est,sum_subscriptions
0,472935,5,472935,BTL 4G Bundle 13 Daily,5,1.646329,11
1,497706,18,497706,BTL 4G Bundle 9 Monthly,18,1.646329,2
2,371806,6,371806,BTL 4G Bundle 14 Weekly,6,1.000000,1
3,1772,0,1772,BTL 4G Bundle 1 Daily,0,1.646329,4
4,375945,5,375945,BTL 4G Bundle 13 Daily,5,1.646329,1
...,...,...,...,...,...,...,...
5010,24567,16,24567,BTL 4G Bundle 7 Weekly,16,1.646329,1
5011,414803,5,414803,BTL 4G Bundle 13 Daily,5,1.646329,1
5012,440315,10,440315,BTL 4G Bundle 2 Weekly,10,1.646329,1
5013,246745,5,246745,BTL 4G Bundle 13 Daily,5,1.116729,1


In [87]:
anti_pre = best_algo.test(anti_set)
pred_df = pd.DataFrame(anti_pre).merge(bundles , left_on = ['iid'], right_on = ['id_bundle'])
pred_df = pd.DataFrame(pred_df).merge(users , left_on = ['uid'], right_on = ['msisdn_id_x'])

In [88]:
pred_df

Unnamed: 0,uid,iid,r_ui,est,details,bundle_name,id_bundle,msisdn_id_x
0,36,0,1.639416,5.000000,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 1 Daily,0,36
1,36,11,1.639416,5.000000,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 3 Weekly,11,36
2,36,4,1.639416,4.940382,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 12 Monthly,4,36
3,36,8,1.639416,4.854685,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 15 Monthly,8,36
4,36,6,1.639416,5.000000,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 14 Weekly,6,36
...,...,...,...,...,...,...,...,...
295582,487116,1,1.639416,1.637444,"{'actual_k': 2, 'was_impossible': False}",BTL 4G Bundle 10 Monthly,1,487116
295583,487116,2,1.639416,1.642752,"{'actual_k': 2, 'was_impossible': False}",BTL 4G Bundle 10 Monthly,2,487116
295584,487116,9,1.639416,1.624926,"{'actual_k': 2, 'was_impossible': False}",BTL 4G Bundle 16 Monthly,9,487116
295585,487116,18,1.639416,1.715564,"{'actual_k': 2, 'was_impossible': False}",BTL 4G Bundle 9 Monthly,18,487116


### finding Recommendations for a user


We can decide based on below that we 'll recommend a bundle to the users if the estimated sum_subscriptions is more than 3. based on the above following are going to be the recommendations for the user 36



In [93]:
pred_df[(pred_df['est']>3.0)&(pred_df['msisdn_id_x']==36)]


Unnamed: 0,uid,iid,r_ui,est,details,bundle_name,id_bundle,msisdn_id_x
0,36,0,1.639416,5.0,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 1 Daily,0,36
1,36,11,1.639416,5.0,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 3 Weekly,11,36
2,36,4,1.639416,4.940382,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 12 Monthly,4,36
3,36,8,1.639416,4.854685,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 15 Monthly,8,36
4,36,6,1.639416,5.0,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 14 Weekly,6,36
5,36,7,1.639416,5.0,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 14 Weekly,7,36
6,36,12,1.639416,4.893182,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 4 Monthly,12,36
7,36,13,1.639416,4.895045,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 4 Monthly,13,36
8,36,14,1.639416,4.882489,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 5 Monthly,14,36
9,36,5,1.639416,5.0,"{'actual_k': 1, 'was_impossible': False}",BTL 4G Bundle 13 Daily,5,36


### Finding nearest neighbours of an Bundle


In [None]:
Above are the nearest neighbours for the bundle_id15 (BTL 4G Bundle 6 Monthly) as per our model.


In [102]:
tsr_inner_id = best_algo.trainset.to_inner_iid(15)
tsr_neighbors = best_algo.get_neighbors(tsr_inner_id, k=3)
bundles[bundles.id_bundle.isin([algo.trainset.to_raw_iid(inner_id)
                           for inner_id in tsr_neighbors])]


Unnamed: 0,bundle_name,id_bundle
2254,BTL 4G Bundle 1 Daily,0
5502,BTL 4G Bundle 3 Weekly,11
22016,BTL 4G Bundle 2 Weekly,10
