In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [None]:

# Location of the reviews dataset, relative to the notebook's working directory
data_path = 'Womens Clothing E-Commerce Reviews.csv'

# Load the CSV into the raw DataFrame `df`
df = pd.read_csv(filepath_or_buffer=data_path)


In [None]:

# Columns needed downstream: the item identifier, the review text and rating,
# and the three category fields used to group similar items.
columns_to_extract = ['Clothing ID', 'Review Text', 'Rating', 'Division Name', 'Class Name', 'Department Name']

# A review is dropped when its text or any of its category fields is missing.
columns_to_dropna = ['Review Text', 'Division Name', 'Class Name', 'Department Name']

# Take an explicit copy so that `data` owns its values: later cells add new
# columns to it, and doing that on a slice of `df` triggers pandas'
# SettingWithCopyWarning.
data = df[columns_to_extract].dropna(subset=columns_to_dropna).copy()

data.head()


In [None]:

# Derive a binary sentiment label from the star rating:
# 1 (positive) for ratings of 4 or higher, 0 otherwise.
data['Sentiment'] = data['Rating'].ge(4).astype('int64')


In [None]:


# Sanity check: list the distinct rating values present in the filtered data
rating_values = data['Rating'].unique()
print("Unique values in 'Rating' column:", rating_values)


In [None]:

# Preprocess the text data
# Lazily built English stop-word set shared by every preprocess_text call
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Reading the corpus is comparatively slow, so do it on first use only
        # instead of once per review.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Clean a single review so it can be vectorised.

    Steps, in order:
      1. drop every character that is not an ASCII letter or whitespace
         (digits, punctuation and apostrophes are removed);
      2. lowercase the text;
      3. split on whitespace, discard NLTK English stop words, and re-join
         the remaining words with single spaces.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. None or NaN) is treated
        as an empty review.

    Returns
    -------
    str
        The cleaned, space-separated text; '' when nothing remains.

    Raises
    ------
    Whatever ``stopwords.words('english')`` raises when the stop-word corpus
    cannot be loaded (only possible on the first call).
    """
    # Missing values surface as None/float NaN; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    stop_words = _english_stop_words()
    kept_words = (word for word in letters_only.lower().split() if word not in stop_words)
    return ' '.join(kept_words)

# Clean every review and store the result alongside the original text
data['Processed_Text'] = data['Review Text'].map(preprocess_text)
data.head()


In [None]:

# Fit a TF-IDF vectorizer on the cleaned reviews and build the TF-IDF matrix;
# the vectorizer is also told to ignore English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
processed_reviews = data['Processed_Text']
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_reviews)


In [None]:
# Dimensions of the TF-IDF matrix
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

# Learned vocabulary: maps each term to its feature index
print("Vocabulary:", tfidf_vectorizer.vocabulary_, sep="\n")

# Peek at the first five rows of the TF-IDF matrix (change the slice as needed)
print("First Few Elements of TF-IDF Matrix:", tfidf_matrix[:5], sep="\n")

In [None]:

# Pairwise similarity of every review against every other review.
# Passing only the first argument makes linear_kernel compare the matrix with itself.
# NOTE(review): the result has one row and one column per review, so its size
# grows quadratically with the number of reviews — confirm it fits in memory.
cosine_sim = linear_kernel(tfidf_matrix)


In [None]:
# Show the top-left 5x5 corner of the similarity matrix (change the slice as needed)
print("Cosine Similarity Matrix:", cosine_sim[:5, :5], sep="\n")

In [None]:

# Report the smallest and largest Clothing ID present in the filtered data
clothing_ids = data['Clothing ID']
min_clothing_id, max_clothing_id = clothing_ids.min(), clothing_ids.max()
print(f"Valid Clothing ID range: {min_clothing_id} to {max_clothing_id}")


In [None]:
# Columns of the frame returned by get_recommendations (also used for empty results)
_RECOMMENDATION_COLUMNS = ['Clothing ID', 'Review Text', 'Rating', 'Sentiment', 'Combined_Score']


# Function to get recommendations based on similar clothing within the same division, class, and department
def get_recommendations(clothing_id, sentiment_threshold=0.5, min_avg_rating=4, top_n=5):
    """Recommend other items from the same division, class and department.

    Works on the module-level ``data`` DataFrame (one row per review) without
    modifying it.

    Parameters
    ----------
    clothing_id : int
        Clothing ID to find alternatives for. Its category is taken from the
        first row of ``data`` carrying that ID.
    sentiment_threshold : float, default 0.5
        Used twice: reviews need ``Sentiment >= sentiment_threshold`` to be
        considered, and the same value is the weight of ``Sentiment`` in the
        combined score (``Rating`` gets ``1 - sentiment_threshold``).
    min_avg_rating : float, default 4
        Minimum mean ``Rating`` of the candidate reviews; below it nothing is
        recommended.
    top_n : int, default 5
        Maximum number of distinct Clothing IDs to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows, one per distinct Clothing ID, ordered by
        descending ``Combined_Score``, with the columns 'Clothing ID',
        'Review Text', 'Rating', 'Sentiment' and 'Combined_Score'. Each row is
        the best-scoring review of that item (ties keep the order of ``data``).
        An empty frame with the same columns is returned, after printing a
        message, when the ID is unknown, no candidate exists, or the average
        rating is too low.
    """
    empty_result = pd.DataFrame(columns=_RECOMMENDATION_COLUMNS)

    # Look the item up once; an empty match means the ID is unknown.
    item_rows = data[data['Clothing ID'] == clothing_id]
    if item_rows.empty:
        print(f"Clothing ID {clothing_id} does not exist in the dataset.")
        return empty_result

    reference = item_rows.iloc[0]

    # Positive-sentiment reviews of *other* items in the same category
    candidates = data[
        (data['Division Name'] == reference['Division Name']) &
        (data['Class Name'] == reference['Class Name']) &
        (data['Department Name'] == reference['Department Name']) &
        (data['Sentiment'] >= sentiment_threshold) &
        (data['Clothing ID'] != clothing_id)
    ]

    if candidates.empty:
        print("No similar items found.")
        return empty_result

    if candidates['Rating'].mean() < min_avg_rating:
        print(f"The average rating for similar items is below {min_avg_rating}. No recommendations.")
        return empty_result

    # assign() builds a new frame, so no column is written onto a slice of `data`.
    scored = candidates.assign(
        Combined_Score=sentiment_threshold * candidates['Sentiment']
        + (1 - sentiment_threshold) * candidates['Rating']
    )

    # Many reviews share the same score; a stable sort keeps the result deterministic.
    ranked = scored.sort_values(by='Combined_Score', ascending=False, kind='mergesort')

    # De-duplicate BEFORE truncating: taking the top rows first can leave fewer
    # than top_n items when one item owns several of the best reviews.
    top_recommendations = ranked.drop_duplicates(subset=['Clothing ID']).head(top_n)

    return top_recommendations[_RECOMMENDATION_COLUMNS]


In [None]:

# Ask the user for a Clothing ID
raw_clothing_id = input(f"Enter a Clothing ID within the range {min_clothing_id} to {max_clothing_id}: ")

try:
    user_input_id = int(raw_clothing_id)
except ValueError:
    # Empty or non-numeric input: report it instead of stopping with a traceback.
    # Both names are still defined so that later code can rely on them.
    user_input_id = None
    recommendations = pd.DataFrame()
    print(f"Invalid Clothing ID {raw_clothing_id!r}: please enter a whole number.")
else:
    # Example: Get recommendations based on user input
    recommendations = get_recommendations(user_input_id)
    if not recommendations.empty:
        print(f"Recommendations for Similar Clothing to Product {user_input_id}:\n")
        print(recommendations)
    else:
        print(f"No similar clothing found for Product {user_input_id}.")
