In [77]:
# Install necessary libraries
!pip install scikit-learn pandas

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re
import matplotlib.pyplot as plt

# Load your dataset
# Assuming the dataset is in a CSV file named 'dt.csv'
data = pd.read_csv('/content/dt.csv')

# Preprocess the data
# Replace missing values with empty strings
data.fillna('', inplace=True)

# Handle missing values
# Print the number of missing values per column
missing_values = data.isnull().sum()
print("\nNumber of missing values per column:")
print(missing_values)

# Fill missing values: Numerical columns with mean, categorical with mode
for column in data.columns:
    if data[column].dtype == np.number:
        data[column].fillna(data[column].mean(), inplace=True)
    else:
        data[column].fillna(data[column].mode()[0], inplace=True)

# Print the DataFrame after filling missing values
print("\nDataFrame after filling missing values:")
print(data)

# Handle duplicate values
# Print the number of duplicate rows
duplicate_rows = data.duplicated().sum()
print("\nNumber of duplicate rows:")
print(duplicate_rows)

# Remove duplicate rows
data.drop_duplicates(inplace=True)

# Print the DataFrame after removing duplicate rows
print("\nDataFrame after removing duplicate rows:")
print(data)

# Text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, punctuation, and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Apply text cleaning to relevant columns
data['City'] = data['City'].apply(clean_text)
data['Place'] = data['Place'].apply(clean_text)
data['Place_desc'] = data['Place_desc'].apply(clean_text)

# Feature engineering: Combine relevant features into one text feature
data['combined_features'] =  data['Place'] + ' ' + data['Place_desc']

data.head()

new_df = data[[ 'City','Ratings','Distance','combined_features']]
new_df.head()


# TF-IDF Vectorization
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_features'])

# # Compute similarity matrix (cosine similarity)
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# # Function to get top N recommendations for a given tour
# def get_top_n_recommendations(tour_index, n=5):
#     # Get similarity scores of all tours with the given tour
#     sim_scores = list(enumerate(cosine_sim[tour_index]))

#     # Sort tours based on similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     # Get top N recommendations (excluding itself)
#     top_n = sim_scores[1:n+1]

#     return top_n

# # Example usage
# # Replace '2' with the index of the tour for which you want recommendations
# tour_index = 2
# top_recommendations = get_top_n_recommendations(tour_index)
# for i, (index, score) in enumerate(top_recommendations, 1):
#     print(f"{i}. Tour Index: {index}, Similarity Score: {score}")
#     print("City Name:", data.iloc[index]['City'])
#     print("Place:", data.iloc[index]['Place'])
#     print("Rating:", data.iloc[index]['Ratings'])
#     print("Distance:", data.iloc[index]['Distance'])
#     print("City Description:", data.iloc[index]['Place_desc'])
#     print()

# # Split data into training and test sets
# def train_test_split(data, test_size=0.2):
#     mask = np.random.rand(len(data)) < (1 - test_size)
#     train = data[mask]
#     test = data[~mask]
#     return train, test

# train_data, test_data = train_test_split(data)

# # Build recommendation system on training data
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_data['combined_features'])
# cosine_sim_train = linear_kernel(tfidf_matrix_train, tfidf_matrix_train)

# # Evaluate recommendation system on test data
# hit_count = 0
# total_recommendations = 0

# for index, user_row in test_data.iterrows():
#     user_tour_index = index  # Index of the tour for the current user
#     top_recommendations = get_top_n_recommendations(user_tour_index, n=5)
#     recommended_tours = [tour[0] for tour in top_recommendations]  # Extract tour indexes from recommendations
#     user_interactions = [user_row['City'], user_row['Place'], user_row['Place_desc']]  # User's actual interactions

#     # Check if any of the recommended tours overlap with user's interactions
#     for tour_index in recommended_tours:
#         tour_data = data.iloc[tour_index]
#         tour_features = [tour_data['City'], tour_data['Place'], tour_data['Place_desc']]
#         if any(feature in tour_features for feature in user_interactions):
#             hit_count += 1
#         total_recommendations += 1

# # Calculate hit rate
# hit_rate = hit_count / total_recommendations
# print("Hit Rate:", hit_rate)

# # Function to calculate hit rate
# def calculate_hit_rate(test_data):
#     hit_count = 0
#     total_recommendations = 0

#     for index, user_row in test_data.iterrows():
#         user_tour_index = index  # Index of the tour for the current user
#         top_recommendations = get_top_n_recommendations(user_tour_index, n=5)
#         recommended_tours = [tour[0] for tour in top_recommendations]  # Extract tour indexes from recommendations
#         user_interactions = [user_row['City'], user_row['Place'], user_row['Place_desc']]  # User's actual interactions

#         # Check if any of the recommended tours overlap with user's interactions
#         for tour_index in recommended_tours:
#             tour_data = data.iloc[tour_index]
#             tour_features = [tour_data['City'], tour_data['Place'], tour_data['Place_desc']]
#             if any(feature in tour_features for feature in user_interactions):
#                 hit_count += 1
#             total_recommendations += 1

#     # Calculate hit rate
#     hit_rate = hit_count / total_recommendations
#     return hit_rate

# # Create a list of different test sizes
# test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]

# # Calculate hit rates for different test sizes
# hit_rates = []
# for test_size in test_sizes:
#     train_data, test_data = train_test_split(data, test_size=test_size)
#     hit_rate = calculate_hit_rate(test_data)
#     hit_rates.append(hit_rate)

# # Plot hit rate graph
# plt.figure(figsize=(10, 6))
# plt.plot(test_sizes, hit_rates, marker='o', linestyle='-')
# plt.xlabel('Test Size')
# plt.ylabel('Hit Rate')
# plt.title('Hit Rate vs. Test Size')
# plt.grid(True)
# plt.show()



Number of missing values per column:
City          0
Place         0
Ratings       0
Distance      0
Place_desc    0
dtype: int64

DataFrame after filling missing values:
        City                                           Place Ratings  \
0     Manali         1. Capture the Sceneries of Old Manali      3.9   
1     Manali   2. Engage in the Adventures of Solang Valley      4.6   
2     Manali                            3. Jogini Waterfall      4.6   
3     Manali                              4. Hadimba Temple      4.4   
4     Manali                                5. Rohtang Pass      4.4   
...      ...                                             ...     ...   
2984  Poovar                         5. The Fishing Village      3.1   
2985  Poovar                                     6. Cruises      3.1   
2986  Poovar                               7. Kovalam Beach      3.1   
2987  Poovar                                   8. Boat Race      3.1   
2988  Poovar                        

  if data[column].dtype == np.number:


Unnamed: 0,City,Ratings,Distance,combined_features
0,manali,3.9,2 km from city center,capture the sceneries of old manali on the...
1,manali,4.6,8 km from city center,engage in the adventures of solang valley ...
2,manali,4.6,4 km from city center,jogini waterfall jogini waterfall is locat...
3,manali,4.4,1 km from city center,hadimba temple hadimba temple away from th...
4,manali,4.4,16 km from city center,rohtang pass rohtang pass is the stretch w...


In [54]:
new_df.iloc[0]['combined_features']

'  capture the sceneries of old manali   on the other side of the manalsu river is a part of manali time left behind with a sweet scent of an old world charm interspersed with guesthouses and an increasing presence of tourism and hints of the present that it brings along old manali is a tiny shift in the world and its rush one must experience while here '

In [78]:
# Converting to lower case
new_df['combined_features'] = new_df['combined_features'].apply(lambda x:x.lower())
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def stems(text):
    T = []

    for i in text.split():
        T.append(ps.stem(i))

    return " ".join(T)
new_df['combined_features'] = new_df['combined_features'].apply(stems)
new_df.iloc[0]['combined_features']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['combined_features'] = new_df['combined_features'].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['combined_features'] = new_df['combined_features'].apply(stems)


'captur the sceneri of old manali on the other side of the manalsu river is a part of manali time left behind with a sweet scent of an old world charm interspers with guesthous and an increas presenc of tourism and hint of the present that it bring along old manali is a tini shift in the world and it rush one must experi while here'

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [80]:
vector = cv.fit_transform(new_df['combined_features']).toarray()

In [81]:
vector[0]



array([0, 0, 0, ..., 0, 0, 0])

In [82]:
vector.shape

(2989, 5000)

In [83]:
len(cv.get_feature_names_out())

5000

In [84]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
similarity.shape

(2989, 2989)

In [85]:
new_df[new_df['City'] == 'manali'].index[0]


0

In [86]:
# def recommend(place):
#     index = new_df[new_df['City'] == place].index[0]
#     distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
#     for i in distances[1:6]:
#         print(new_df.iloc[i[0]].City)


def recommend(place):
    # Check if the city exists in the DataFrame
    if not new_df[new_df['City'] == place].empty:
        # Find the index of the given city
        index = new_df[new_df['City'] == place].index[0]

        # Compute similarities
        distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])

        # Print the top 5 recommendations (excluding the given place)
        print(f"Recommendations for '{place}':")
        count = 0
        for i in distances:
            if new_df.iloc[i[0]].City != place:
                print(new_df.iloc[i[0]].City)
                count += 1
            if count == 5:
                break
    else:
        print(f"'{place}' not found in the DataFrame")


In [87]:
recommend('srinagar')

Recommendations for 'srinagar':
mcleodganj
alleppey
nainital
dharamshala
jammu


In [92]:
import os
import pickle

# Specify the directory path
directory_path = '/content/artifacts/'

# Create the directory if it doesn't exist
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

# Save the DataFrame to a pickle file
pickle.dump(new_df, open(directory_path + 'place_list.pkl', 'wb'))

# Save the similarity matrix to a pickle file
pickle.dump(similarity, open(directory_path + 'similarity.pkl', 'wb'))
