**Recommendation Systems in Bioinformatics: A Comprehensive Tutorial**
--
Melza Rensiana
--

NPM :2306174980
--


# Content-Based Filtering

##**Initial Setup**

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

##**Data Preparation**

In [23]:
# Toy catalogue: three made-up genes, each with a free-text description.
gene_names = ['Gene1', 'Gene2', 'Gene3']
gene_descriptions = [
    'This gene is related to function X and protein Y',
    'Functionality of this gene is associated with process A',
    'Protein B interacts with this gene, leading to process C',
]
genes_data = {'Gene': gene_names, 'Description': gene_descriptions}

# One row per gene; columns follow the key order of the dict above.
genes_df = pd.DataFrame(genes_data)

##**Text Preprocessing**

In [24]:
# Lowercase every description and store the result in a separate column,
# leaving the original 'Description' text untouched.
lowercased_descriptions = genes_df['Description'].str.lower()
genes_df['Clean_Description'] = lowercased_descriptions

##**Feature Extraction**

In [25]:
# Vectorise the cleaned descriptions with TF-IDF, dropping English stop words.
clean_descriptions = genes_df['Clean_Description']
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and produce the document-term matrix in one pass
# (one row per gene).
tfidf_matrix = tfidf.fit_transform(clean_descriptions)

##**Building Content-Based Model**

In [26]:
# Pairwise dot products between every pair of gene vectors. Entry [i, j] is
# used below as the similarity of gene i to gene j.
# NOTE(review): this equals cosine similarity only while the TF-IDF rows are
# L2-normalised (believed to be the TfidfVectorizer default) - confirm if
# `norm` is ever changed.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

##**Recommendation for Users**

In [27]:
gene_index = 0  # Row position (in genes_df) of the reference gene

# Pair every gene's row position with its similarity to the reference gene,
# then order the (position, score) pairs from most to least similar.
similar_genes = list(enumerate(cosine_sim[gene_index]))
similar_genes.sort(key=lambda position_score: position_score[1], reverse=True)

##**Display Recommendations**

In [28]:
# Print the other genes, most similar first.
#
# The reference gene is skipped by its row position, not by assuming it sits
# at the head of the sorted list. That assumption fails when another gene
# ties its score (the sort is stable, so a lower-index duplicate comes first)
# or when its own similarity is 0 (description made only of stop words).
print("Recommended Genes for the Reference Gene:")
for candidate_index, _similarity in similar_genes:
    if candidate_index == gene_index:
        continue  # Never recommend the reference gene to itself
    print(genes_df.iloc[candidate_index]['Gene'])

Recommended Genes for the Reference Gene:
Gene3
Gene2


# Collaborative Filtering

In [29]:
!pip install scikit-surprise



##**Make a synthetic dataset**

In [30]:
import pandas as pd
import numpy as np

# Build a synthetic table of user-gene interactions:
# 100 users, 50 genes and 500 randomly drawn ratings on a 0-5 integer scale.
num_users = 100
num_genes = 50
num_interactions = 500  # Adjust to the size of your own dataset

users = [f'User{i}' for i in range(1, num_users + 1)]
genes = [f'Gene{i}' for i in range(1, num_genes + 1)]

# Fix the seed, then draw users, genes and ratings in this exact order so the
# generated table is reproducible.
np.random.seed(42)
sampled_users = np.random.choice(users, size=num_interactions)
sampled_genes = np.random.choice(genes, size=num_interactions)
sampled_ratings = np.random.randint(0, 6, size=num_interactions)  # upper bound is exclusive -> 0..5

interaction_data = {
    'User': sampled_users,
    'Gene': sampled_genes,
    'Rating': sampled_ratings,
}

# Persist the interactions so the modelling cells can reload them from disk.
interaction_df = pd.DataFrame(interaction_data)
interaction_df.to_csv('interaction_data.csv', index=False)

##**Initial Setup**

In [31]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

##**Data Preparation**

In [32]:
# Reload the saved interactions and wrap them in a Surprise dataset.
interaction_data = pd.read_csv('interaction_data.csv')

# Ratings in the file lie on a 0-5 scale; the Reader is told that range.
reader = Reader(rating_scale=(0, 5))

# The three columns are passed in the order user, item (gene), rating.
rating_columns = ['User', 'Gene', 'Rating']
data = Dataset.load_from_df(interaction_data[rating_columns], reader)

##**Building Collaborative Filtering Model**

In [33]:
# SVD recommender with default hyper-parameters, scored with 5-fold
# cross-validation on RMSE and MAE. verbose=True prints the per-fold table.
model = SVD()
evaluation_measures = ['RMSE', 'MAE']
results = cross_validate(
    model,
    data,
    measures=evaluation_measures,
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9092  1.7403  1.7287  1.7934  1.7445  1.7832  0.0668  
MAE (testset)     1.6587  1.4927  1.4738  1.5629  1.4894  1.5355  0.0688  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


##**Model Training and Prediction**

In [34]:
# Refit the model on every available interaction (no hold-out) before
# producing recommendations.
trainset = data.build_full_trainset()
model.fit(trainset)

user_id = 'User1'  # Replace with a specific user ID

# Rows of the interaction table that belong to the chosen user. This was
# previously undefined in the notebook, so the cell failed on a fresh kernel.
user1_interactions = interaction_data[interaction_data['User'] == user_id]
genes_rated_by_user1 = user1_interactions['Gene'].tolist()  # Genes already rated by the user

# Candidate pool: every gene that appears anywhere in the interaction table
# (also previously undefined).
genes_for_user1 = interaction_data['Gene']

# Keep only genes the user has not rated yet, without repeats.
genes_to_recommend = genes_for_user1[~genes_for_user1.isin(genes_rated_by_user1)]
unique_genes_to_recommend = genes_to_recommend.unique()

# Estimated rating of the user for each unseen gene, in the same order as
# unique_genes_to_recommend.
user_recommendations = [model.predict(user_id, gene).est for gene in unique_genes_to_recommend]

##**Display Recommendations**

In [35]:
# Pair each candidate gene with its estimated rating.
recommendations_df = pd.DataFrame({
    'Gene': unique_genes_to_recommend,
    'Estimated_Rating': user_recommendations,
})

# Show the ten genes with the highest estimated rating, best first.
top_recommendations = recommendations_df.sort_values('Estimated_Rating', ascending=False).head(10)
print("Recommended Genes for User:", user_id)
print(top_recommendations)

Recommended Genes for User: User1
      Gene  Estimated_Rating
22   Gene2          3.633431
6    Gene5          3.580818
18  Gene32          3.321793
13  Gene33          3.239467
41  Gene15          3.200144
21  Gene24          3.150444
27  Gene43          3.135609
30  Gene13          3.124605
19  Gene45          3.121596
34   Gene7          3.089321
