In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from joblib import Parallel, delayed

# Load datasets
users_df = pd.read_excel("Users.xlsx")
places_df = pd.read_excel("Places.xlsx")

# Define category columns
category_columns = [
    "Historical Sites", "Beaches", "Adventure", "Nile Cruises",
    "Religious Tourism", "Desert Exploration", "Relaxation"
]

# Ensure category columns are numeric (unparseable or missing cells become 0)
places_df[category_columns] = places_df[category_columns].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Create 'combined_info' column for each place: the space-joined names of the
# categories whose indicator equals 1
places_df['combined_info'] = places_df[category_columns].apply(lambda row: ' '.join(row.index[row == 1]), axis=1)

# Merge place categories into users_df (left join keeps users whose place is
# not present in places_df; their merged columns come back as NaN)
users_df = users_df.merge(places_df[['Place name', 'combined_info'] + category_columns],
                          left_on='Preferred Places', right_on='Place name', how='left')

# Fill missing values per column type. The category indicators are numeric and
# are aggregated with 'max' below, so they must be filled with 0 rather than ""
# (a blanket fillna("") would mix strings and numbers in those columns).
# Every other column is filled with "" so it can be string-joined later.
fill_values = {col: (0 if col in category_columns else "") for col in users_df.columns}
users_df = users_df.fillna(value=fill_values)
users_df[category_columns] = users_df[category_columns].astype(int)

# Aggregate category values when users have multiple preferred places
users_df = users_df.groupby(['User ID', 'Age', 'Gender', 'Marital status', 'Children', 'Travel Tags']) \
    .agg({
        'Preferred Places': lambda x: ', '.join(x),
        'combined_info': lambda x: ' '.join(x),
        **{col: 'max' for col in category_columns}  # Take max value for category presence
    }).reset_index()

# Combine user features into a single text field for vectorization
users_df['combined_features'] = (
    users_df['Preferred Places'] + " " + users_df['Travel Tags'] + " " +
    users_df['Age'].astype(str) + " " + users_df['Marital status'] + " " +
    users_df['Children'] + " " + users_df['Gender'] + " " + users_df['combined_info']
)

# Hold out 20% of the users for evaluation. The split happens before the
# vectorizer is fitted so the vocabulary is learned from training rows only.
train_df, test_df = train_test_split(users_df, random_state=42, test_size=0.2)

# Text vectorization: fit on the training text, then reuse the fitted
# vocabulary for the test text. use_idf=False switches IDF re-weighting off.
# NOTE(review): the built-in English stop-word list appears to contain "no",
# which would drop a 'Children' value of "No" from the features - confirm.
tf = TfidfVectorizer(use_idf=False, stop_words='english')
tf_matrix_train = tf.fit_transform(train_df['combined_features'])
tf_matrix_test = tf.transform(test_df['combined_features'])

# Define feature weights.
# The insertion order of this dict is also the priority order used below when
# a token occurs in more than one source column (first match wins).
weights = {
    'Preferred Places': 6,
    'Travel Tags': 4,
    'Age': 1,
    'Marital status': 1,
    'Children': 1,
    'Gender': 1,
    'combined_info': 5
}

# Create weight vector aligned with the vectorizer's feature (token) order
feature_names = tf.get_feature_names_out()
weight_vector = np.ones(len(feature_names))

# Tokenise each source column with the vectorizer's own analyzer so that the
# column vocabularies use exactly the same lower-casing, punctuation handling
# and stop-word removal as the feature names they are compared against.
_analyze = tf.build_analyzer()


def _column_tokens(column):
    """Return the set of vectorizer tokens found in train_df[column]."""
    tokens = set()
    # dropna() guards against missing cells; astype(str) covers numeric columns.
    for text in train_df[column].dropna().astype(str):
        tokens.update(_analyze(text))
    return tokens


# Build every column vocabulary once, instead of once per feature.
_tokens_by_column = {column: _column_tokens(column) for column in weights}

# Assign each feature the weight of the first column (in priority order) whose
# vocabulary contains it. Exact set membership is used on purpose: a substring
# test would let short words match unrelated tokens (e.g. "el" inside
# "travel"), and whitespace-split words that keep a trailing comma would never
# match at all.
for i, feature in enumerate(feature_names):
    for column, weight in weights.items():
        if feature in _tokens_by_column[column]:
            weight_vector[i] = weight
            break

# Apply weights (column-wise scaling), then densify for the similarity step
weighted_tfidf_matrix_train = tf_matrix_train.multiply(weight_vector).toarray()
weighted_tfidf_matrix_test = tf_matrix_test.multiply(weight_vector).toarray()

# Compute similarity matrix for training set.
# A single call on the whole matrix returns the full (n_train, n_train)
# pairwise cosine-similarity array, so there is no need to dispatch one
# parallel job per row and stack the results.
cosine_sim_train = cosine_similarity(weighted_tfidf_matrix_train)

# Function to recommend places
def recommend_places(new_user_preferences, train_df, weighted_tfidf_matrix_train, top_n=5):
    """Recommend places liked by the training users most similar to a new user.

    Relies on the module-level ``tf`` (fitted vectorizer) and ``weight_vector``.

    Parameters
    ----------
    new_user_preferences : str
        Items separated by ", ". The first item is treated as the place the
        user already knows and is never recommended back.
    train_df : DataFrame
        Training users; its 'Preferred Places' column holds ", "-separated
        place names.
    weighted_tfidf_matrix_train : array
        Weighted feature matrix that the query is compared against.
    top_n : int, default 5
        Number of nearest users to consult and maximum number of places
        returned.

    Returns
    -------
    tuple
        (recommended place names ordered by how many of the nearest users list
        them, indices of the nearest users from most to least similar, their
        similarity scores).
    """
    tokens = new_user_preferences.split(", ")
    # str.split always yields at least one element, so tokens[0] exists.
    already_seen = {tokens[0]}

    # Vectorize and weight the query the same way as the training rows.
    query_vector = tf.transform([" ".join(tokens)]).multiply(weight_vector).toarray()
    sim_scores = cosine_similarity(query_vector, weighted_tfidf_matrix_train).flatten()

    # Highest-scoring users first.
    neighbour_indices = sim_scores.argsort()[-top_n:][::-1]

    n_train_rows = len(train_df)
    tally = {}
    for idx in neighbour_indices:
        if idx >= n_train_rows:
            continue  # index has no corresponding row in train_df
        for place in train_df.iloc[idx]['Preferred Places'].split(", "):
            if place in already_seen:
                continue
            tally[place] = tally.get(place, 0) + 1

    # Stable sort: places with equal counts keep first-seen order.
    ranked_places = sorted(tally, key=tally.get, reverse=True)

    return ranked_places[:top_n], neighbour_indices, sim_scores[neighbour_indices]

# Evaluate model
# y_true holds one flag per recommended place (1 = it is one of the test user's
# actual places); y_pred is all ones because every listed place was "predicted".
# Feeding these two lists to recall_score always yields 1.0 (there can be no
# false negatives), so recall is computed against the number of actual places
# instead.
# NOTE(review): the query passed to recommend_places is 'combined_features',
# which begins with the user's own 'Preferred Places' text, so the query
# contains the places being scored - consider building the query without them.
y_true = []
y_pred = []
n_relevant = 0  # total number of actual places over all test users

for _, row in test_df.iterrows():
    actual_places = set(row['Preferred Places'].split(", "))
    recommended_places, _, _ = recommend_places(row['combined_features'], train_df, weighted_tfidf_matrix_train)
    y_true.extend([1 if place in actual_places else 0 for place in recommended_places])
    y_pred.extend([1] * len(recommended_places))
    n_relevant += len(actual_places)

# Micro-averaged metrics over all test users.
n_hits = sum(y_true)
precision = n_hits / len(y_pred) if y_pred else 0   # hits / places recommended
recall = n_hits / n_relevant if n_relevant else 0   # hits / actual places
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Example usage: the first item is the place the user already knows, the
# remaining items describe the user.
new_user_preferences = "Dahab, Desert Exploration, Adventure, 25, Single, No, Female"
recommended_places, similar_users, similarity_scores = recommend_places(
    new_user_preferences,
    train_df,
    weighted_tfidf_matrix_train,
)

# Print each result next to its label.
for label, value in (
    ("Recommended Places:", recommended_places),
    ("Most Similar Users (Indices):", similar_users),
    ("Similarity Scores:", similarity_scores),
):
    print(label, value)


  users_df.fillna("", inplace=True)


Precision: 0.6863
Recall: 1.0000
F1-score: 0.8140
Recommended Places: ['North Coast', 'Alexandria', 'Ain El Sokhna', 'Taba', 'Marsa Matrouh']
Most Similar Users (Indices): [73 53 81 49 64]
Similarity Scores: [0.60279906 0.48298573 0.41040023 0.39483433 0.38372469]


In [None]:
import joblib

# Persist the objects that recommend_places depends on.
# Note: "tfidf_matrix_train.npy" stores the weighted, dense training matrix.
joblib.dump(value=tf, filename="tfidf_vectorizer.pkl")
np.save(file="weight_vector.npy", arr=weight_vector)
joblib.dump(value=train_df, filename="train_df.pkl")
np.save(file="tfidf_matrix_train.npy", arr=weighted_tfidf_matrix_train)
