In [1]:
#Step 1 : Import required packages
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [2]:
#Step 2 : Reading Dataset
# Load the training interactions (user_id / article_id / rating rows).
train_df = pd.read_csv('/Users/paramanandbhat/Downloads/Article_Recommendation 2/train.csv')

# Preview the first rows, then the column labels (same text as two prints).
print(train_df.head(), train_df.columns, sep='\n')

# Load the per-article metadata table.
article_info_df = pd.read_csv('/Users/paramanandbhat/Downloads/Article_Recommendation 2/article_info.csv')

# Same preview for the metadata: head first, column labels second.
print(article_info_df.head(), article_info_df.columns, sep='\n')



   user_id  article_id  rating
0        1         456       1
1        1        2934       1
2        1          82       1
3        1        1365       1
4        1         221       1
Index(['user_id', 'article_id', 'rating'], dtype='object')
   article_id     website                                              title  \
0        1025  uxmovement  Comment concevoir une procédure pas à pas que ...   
1        2328    endeavor  Ressources humaines? Seulement si vous optez p...   
2        2469    linkedin           Deux motions de vente différentes. . . .   
3        2590  googleblog  Apprentissage large et profond: mieux avec Ten...   
4         697       infoq              Agile: manque de compétences en tests   

                                             content  
0  par anthony le 18/07/16 à 8h02 Si une nouvelle...  
1  «Ambassadeurs», «avocats», «porte-parole» d'un...  
2  J'ai passé pas mal de temps récemment avec des...  
3  "Apprenez les règles comme un pro, afin de pou...  

In [3]:
# Load the (user_id, article_id) pairs of the test set.
test_df = pd.read_csv('/Users/paramanandbhat/Downloads/Article_Recommendation 2/test.csv')

# Preview the first rows, then the column labels (same text as two prints).
print(test_df.head(), test_df.columns, sep='\n')


   user_id  article_id
0        1        2607
1        1        1445
2        1         911
3        1         857
4        1        2062
Index(['user_id', 'article_id'], dtype='object')


In [4]:
#Step 2 (continued): Data Preparation
# Attach the article metadata from `article_info_df` to every training row so
# the article details travel with each interaction.
#
# The result is assigned back to `train_df`, so a plain merge is not safe to
# re-run: a second execution would merge the metadata again and produce
# suffixed duplicates (`website_x`, `website_y`, ...). Only the metadata columns
# that `train_df` does not carry yet are merged, which makes the cell idempotent
# while leaving the first run unchanged.
missing_article_cols = [
    col
    for col in article_info_df.columns
    if col != 'article_id' and col not in train_df.columns
]
if missing_article_cols:
    # Left join: every training row is kept, even if its article has no metadata.
    train_df = train_df.merge(
        article_info_df[['article_id'] + missing_article_cols],
        on='article_id',
        how='left',
    )

print(train_df.columns)


Index(['user_id', 'article_id', 'rating', 'website', 'title', 'content'], dtype='object')


In [5]:
# Attach the article metadata from `article_info_df` to every test row.
#
# The result is assigned back to `test_df`, so a plain merge is not safe to
# re-run: a second execution would merge the metadata again and produce
# suffixed duplicates (`website_x`, `website_y`, ...). Only the metadata columns
# that `test_df` does not carry yet are merged, which makes the cell idempotent
# while leaving the first run unchanged.
missing_article_cols = [
    col
    for col in article_info_df.columns
    if col != 'article_id' and col not in test_df.columns
]
if missing_article_cols:
    # Left join: every test row is kept, even if its article has no metadata.
    test_df = test_df.merge(
        article_info_df[['article_id'] + missing_article_cols],
        on='article_id',
        how='left',
    )

print(test_df.columns)


Index(['user_id', 'article_id', 'website', 'title', 'content'], dtype='object')


In [6]:
# Collect a head() preview of each merged frame under a readable label and
# print the resulting dictionary.
merged_data_samples = {
    label: frame.head()
    for label, frame in (
        ("Merged Training Data", train_df),
        ("Merged Test Data", test_df),
    )
}
print(merged_data_samples)

{'Merged Training Data':    user_id  article_id  rating     website  \
0        1         456       1      medium   
1        1        2934       1   thestreet   
2        1          82       1    facebook   
3        1        1365       1  techcrunch   
4        1         221       1    geekwire   

                                               title  \
0  Obtenez 6 mois d'accès à Pluralsight, la plus ...   
1  La plateforme cloud de Google est désormais un...   
2        La technologie derrière les photos d'aperçu   
3  Les VM préemptives de Google Cloud Platform so...   
4  Ray Kurzweil: Le monde ne se détériore pas - n...   

                                             content  
0  Obtenez 6 mois d'accès à Pluralsight, la plus ...  
1  Bien que la plate-forme Google Cloud (GCP) ne ...  
2  Les premières impressions comptent, que vous s...  
3  Pendant un certain temps, Google, Amazon et Mi...  
4  Ray Kurzweil, l'auteur, inventeur, informatici...  , 'Merged Test Data':    user_id

In [7]:
# Summary of the columns in the two merged frames. The string is the cell's only
# expression, so the notebook echoes it back as the cell output.
'''Merged Training Data: Contains user_id, article_id, rating, website, title, and content.
Merged Test Data: Contains user_id, article_id, website, title, and content, but lacks the rating '''


'Merged Training Data: Contains user_id, article_id, rating, website, title, and content.\nMerged Test Data: Contains user_id, article_id, website, title, and content, but lacks the rating '

In [8]:
#Step 3: Model Adaptation
from sklearn.model_selection import train_test_split

# Hold out 25% of the merged training rows as a validation set; the fixed
# random_state keeps the split reproducible from run to run.
train_data, validation_data = train_test_split(
    train_df,
    random_state=42,
    test_size=0.25,
)


In [9]:
# Function to compute the RMSE
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two rating sequences.

    Args:
        y_true: Observed ratings.
        y_pred: Predicted ratings, in the same order as ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)



In [10]:
# Build the ratings matrix from the training split: one row per user_id, one
# column per article_id, NaN where the pair has no rating. pivot_table's default
# aggregation (mean) is applied if a pair occurs more than once.
r_matrix = pd.pivot_table(
    train_data,
    index='user_id',
    columns='article_id',
    values='rating',
)

# User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, article_id):
    """Predict a rating as the article's mean rating in the training matrix.

    ``user_id`` is accepted so the function fits the ``model(user, article)``
    call made by ``rmse_score``, but it does not influence the prediction.

    Returns:
        The mean of the article's column in ``r_matrix``; when the article has
        no column there, the mean of all ratings in ``train_data`` instead.
    """
    # Article never seen in the training matrix: use the global mean rating.
    if article_id not in r_matrix:
        return train_data['rating'].mean()
    return r_matrix[article_id].mean()

# RMSE Score Function
def rmse_score(model, data):
    """Score a rating model on a labelled frame and return its RMSE.

    Args:
        model: Callable invoked as ``model(user_id, article_id)`` that returns
            one predicted rating.
        data: Frame providing ``user_id``, ``article_id`` and ``rating`` columns.

    Returns:
        ``rmse`` of the true ratings against the per-row predictions.
    """
    # map() walks the two id columns in lockstep, one model call per row.
    predictions = np.array(list(map(model, data['user_id'], data['article_id'])))
    targets = np.array(data['rating'])
    return rmse(targets, predictions)

# Score the article-mean baseline on the held-out validation split.
rmse_mean_model = rmse_score(model=cf_user_mean, data=validation_data)

print("Mean model rmse score", rmse_mean_model)


Mean model rmse score 1.0420497615166175


In [11]:
# User-to-user Pearson similarity. DataFrame.corr correlates columns pairwise,
# so the ratings matrix is transposed first to put the users on the columns.
pearson_corr = r_matrix.transpose().corr(method='pearson')

# User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, article_id):
    """Predict a rating from positively correlated users' mean-centred ratings.

    The prediction is the active user's mean rating plus the similarity-weighted
    average of how far each neighbour's rating of the article deviates from that
    neighbour's own mean. Neighbours are the users whose Pearson correlation
    with the active user (``pearson_corr``) is positive and who have a rating
    for the article in ``r_matrix``.

    Fallbacks:
        * article has no column in ``r_matrix``: mean of all ratings in
          ``train_data``;
        * user has no row in ``r_matrix`` / no column in ``pearson_corr``
          (cold start): mean rating of the article. This case used to raise
          ``KeyError``;
        * no positively correlated user has rated the article: mean rating of
          the article.

    Args:
        user_id: Identifier of the user to predict for.
        article_id: Identifier of the article to predict.

    Returns:
        The predicted rating as a float.
    """
    # Unknown article: nothing article-specific to go on.
    if article_id not in r_matrix:
        return train_data['rating'].mean()

    article_ratings = r_matrix[article_id]

    # Cold-start user: there is no personal mean and no similarity column, so
    # use the article mean instead of failing on the label lookups below.
    if user_id not in r_matrix.index or user_id not in pearson_corr:
        return article_ratings.mean()

    # Mean rating of the active user.
    ra = r_matrix.loc[user_id].mean()

    # Keep only positive correlations (NaN correlations are dropped as well,
    # because the comparison is False for them).
    sim_scores = pearson_corr[user_id]
    sim_scores_pos = sim_scores[sim_scores > 0]

    # Ratings of the article by those users, minus the ones who did not rate it.
    neighbour_ratings = article_ratings.loc[sim_scores_pos.index].dropna()
    if neighbour_ratings.empty:
        return article_ratings.mean()

    # Select the weights by the surviving labels so both vectors share one order.
    weights = sim_scores_pos.loc[neighbour_ratings.index]
    # Centre every neighbour's rating on that neighbour's own mean rating.
    deviations = neighbour_ratings - r_matrix.loc[neighbour_ratings.index].mean(axis=1)

    weighted_deviation = np.dot(weights.to_numpy(), deviations.to_numpy()) / weights.sum()
    return ra + weighted_deviation


In [12]:
# Score the similarity-weighted model on the held-out validation split.
rmse_wmean_model = rmse_score(model=cf_user_wmean, data=validation_data)
print("User rmse",rmse_wmean_model)

# Predict a rating for every (user_id, article_id) row of the test set with the
# weighted-mean model. `assign` returns a new frame, so `test_df` itself is left
# untouched.
test_predictions_wmean = test_df.assign(
    predicted_rating=lambda frame: frame.apply(
        lambda row: cf_user_wmean(row['user_id'], row['article_id']),
        axis=1,
    )
)

# Expose the predictions under the column name 'rating'.
test_predictions_wmean_renamed = test_predictions_wmean.rename(
    columns={'predicted_rating': 'rating'}
)


User rmse 1.0950466136265755


In [14]:
# Destination CSV for the test-set predictions.
output_file_path_wmean = '/Users/paramanandbhat/Downloads/ImplementationforItemBasedCollaborativeFiltering-201024-234420 (1)/user_collab_filter_predicted_ratings_wmean.csv'

# Write the frame without its positional index.
# NOTE(review): the frame still carries the merged website/title/content
# columns, so they are written too - confirm that is the intended file layout.
test_predictions_wmean_renamed.to_csv(path_or_buf=output_file_path_wmean, index=False)

print(output_file_path_wmean)



/Users/paramanandbhat/Downloads/ImplementationforItemBasedCollaborativeFiltering-201024-234420 (1)/user_collab_filter_predicted_ratings_wmean.csv
