In [48]:
import numpy as np
import pandas as pd
import os

# Load the datasets
coffeeAnalysis = pd.read_csv(r"C:\Users\Shafqat\Desktop\coffee\coffee_analysis.csv")
coffeeClean = pd.read_csv(r"C:\Users\Shafqat\Desktop\coffee\coffee_clean.csv")

# Check initial shapes and columns
print("Coffee Analysis Columns:", coffeeAnalysis.columns)
print("Coffee Clean Columns:", coffeeClean.columns)


Coffee Analysis Columns: Index(['name', 'roaster', 'roast', 'loc_country', 'origin_1', 'origin_2',
       '100g_USD', 'rating', 'review_date', 'desc_1', 'desc_2', 'desc_3'],
      dtype='object')
Coffee Clean Columns: Index(['slug', 'all_text', 'rating', 'roaster', 'name', 'location', 'origin',
       'roast', 'est_price', 'review_date', 'agtron', 'aroma', 'acid', 'body',
       'flavor', 'aftertaste', 'with_milk', 'desc_1', 'desc_2', 'desc_3'],
      dtype='object')


In [49]:
# Merge the datasets on 'name' column
coffeeAnalysis = coffeeAnalysis.merge(coffeeClean, on='name', how='inner')
print("Merged Coffee Analysis Columns:", coffeeAnalysis.columns)
print(coffeeAnalysis.head())  # Inspect merged data


Merged Coffee Analysis Columns: Index(['name', 'roaster_x', 'roast_x', 'loc_country', 'origin_1', 'origin_2',
       '100g_USD', 'rating_x', 'review_date_x', 'desc_1_x', 'desc_2_x',
       'desc_3_x', 'slug', 'all_text', 'rating_y', 'roaster_y', 'location',
       'origin', 'roast_y', 'est_price', 'review_date_y', 'agtron', 'aroma',
       'acid', 'body', 'flavor', 'aftertaste', 'with_milk', 'desc_1_y',
       'desc_2_y', 'desc_3_y'],
      dtype='object')
                      name                   roaster_x       roast_x  \
0       Ethiopia Suke Quto                 Roast House  Medium-Light   
1       Ethiopia Suke Quto                 Roast House  Medium-Light   
2       Ethiopia Suke Quto      Indaba Coffee Roasters  Medium-Light   
3       Ethiopia Suke Quto      Indaba Coffee Roasters  Medium-Light   
4  Ethiopia Kayon Mountain  Red Rooster Coffee Roaster         Light   

     loc_country              origin_1       origin_2  100g_USD  rating_x  \
0  United States             

In [50]:
# Required columns (check dynamically for their existence)
required_columns = [
    'name', 'roaster', 'roast', 'origin_1', 'origin_2', 'rating', 
    'desc_1', 'desc_2', 'desc_3', 'origin', 'aroma', 'acid', 'body', 
    'flavor', 'aftertaste', 'with_milk', 'tags'
]

# Filter for existing columns
existing_columns = [col for col in required_columns if col in coffeeAnalysis.columns]
print(f"Columns retained for processing: {existing_columns}")

# Keep only available columns
coffeeAnalysis = coffeeAnalysis[existing_columns]


Columns retained for processing: ['name', 'origin_1', 'origin_2', 'origin', 'aroma', 'acid', 'body', 'flavor', 'aftertaste', 'with_milk']


In [52]:
# Drop rows with missing values
coffeeAnalysis.dropna(inplace=True)

# Process description columns
for desc_col in ['desc_1', 'desc_2', 'desc_3']:
    if desc_col in coffeeAnalysis.columns:
        coffeeAnalysis[desc_col] = coffeeAnalysis[desc_col].apply(lambda x: x.split() if isinstance(x, str) else [])


In [53]:
# Step 5: Dynamically select columns for 'tags'
tag_columns = ['name', 'roaster', 'roast', 'origin', 'origin_1', 'origin_2', 
               'rating', 'desc_1', 'desc_2', 'desc_3', 'aroma', 'acid', 
               'body', 'flavor', 'aftertaste', 'with_milk']
existing_tag_columns = [col for col in tag_columns if col in coffeeAnalysis.columns]

# Create the 'tags' column by concatenating valid columns
coffeeAnalysis['tags'] = coffeeAnalysis.apply(
    lambda row: sum([row[col] for col in existing_tag_columns if isinstance(row[col], list)], []),
    axis=1
)

# Verify if 'tags' is populated
print("Sample 'tags' column after creation:")
print(coffeeAnalysis[['name', 'tags']].head())


Sample 'tags' column after creation:
                                           name tags
1708  Ethiopia Guji Natural Euphora Special Lot   []


In [54]:
# Step 6: Prepare a new DataFrame with selected columns
required_columns = ['name', 'roaster', 'origin', 'tags']
existing_columns = [col for col in required_columns if col in coffeeAnalysis.columns]

# Ensure 'tags' exists and is populated
if 'tags' not in coffeeAnalysis.columns or coffeeAnalysis['tags'].isnull().all():
    raise ValueError("The 'tags' column is either missing or empty. Check the tag creation step.")

# Create a copy of the DataFrame with the existing columns
new_df = coffeeAnalysis[existing_columns].copy()

# Safely join 'tags' lists into a single string and convert to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower() if isinstance(x, list) else "")

# Verify the content of new_df
print("Preview of new_df:")
print(new_df.head())


Preview of new_df:
                                           name  \
1708  Ethiopia Guji Natural Euphora Special Lot   

                                           origin tags  
1708  Guji Zone, Oromia Region, Southern Ethiopia       


In [55]:
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the Porter Stemmer
ps = PorterStemmer()

def stems(text):
    """Apply stemming to the input text."""
    return " ".join([ps.stem(word) for word in text.split()])

# Apply stemming to the 'tags' column
new_df['tags'] = new_df['tags'].apply(stems)

# Ensure no empty tags before applying CountVectorizer
new_df['tags'] = new_df['tags'].apply(lambda x: x if x.strip() != "" else "no description")

# Convert the 'tags' column to vectors using CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new_df['tags']).toarray()

# Calculate cosine similarity between the vectorized tags
similarity = cosine_similarity(vector)

# Recommendation function
def recommend(coffee_name):
    """Provide recommendations for a given coffee name."""
    try:
        index = new_df[new_df['name'] == coffee_name].index[0]
        distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
        
        recommendations = []
        for i in distances[1:6]:  # Top 5 recommendations
            recommendations.append(new_df.iloc[i[0]]['name'])
        
        return recommendations
    except IndexError:
        return [f"No recommendations found for coffee: {coffee_name}"]

# Example usage
coffee_to_recommend = 'Ethopia'  # Replace with an actual coffee name from the dataset
recommendations = recommend(coffee_to_recommend)

print(f"Recommendations for '{coffee_to_recommend}':")
for coffee in recommendations:
    print(coffee)


Recommendations for 'Ethopia':
No recommendations found for coffee: Ethopia


In [56]:
# Step 8: Save the DataFrame and similarity matrix using pickle
import pickle
import os

# Create the 'artifacts' directory if it doesn't exist
os.makedirs('artificates', exist_ok=True)

# Save the new_df DataFrame and similarity matrix
pickle.dump(new_df, open('artificates/coffeeAnalysis_list.pkl', 'wb'))
pickle.dump(similarity, open('artificates/similarity.pkl', 'wb'))

# Confirm the files are saved
print("Data and similarity matrix have been saved to 'artificates' directory.")


Data and similarity matrix have been saved to 'artificates' directory.


In [57]:
import pandas as pd
try:
    coffee_name = pd.read_pickle('artificates/coffeeAnalysis_list.pkl')
    print("File loaded successfully!")
except Exception as e:
    print(f"Error: {e}")


File loaded successfully!


In [58]:
import pickle

with open('artificates/coffeeAnalysis_list.pkl', 'rb') as file:
    coffee_name = pickle.load(file, encoding='latin1')
