In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import linear_kernel
import random


data = pd.read_csv('app/coffee_may2023_prep.csv')


# Concatenate multiple columns into a single feature column for item representation
coffee_item_features = ['variety', 'processing_method','color', 'country_of_origin', 'mill']
data['item_features'] = data[coffee_item_features].agg(' '.join, axis=1)


# Normalize 'rating' column to [0, 1] range for feature representation
scaler = MinMaxScaler()
normalized_columns = ['aftertaste','balance','flavor','aroma','body','acidity','moisture_percentage','quakers','overall','total_cup_points','altitude']
normalized_features = scaler.fit_transform(data[normalized_columns])
normalized_data = pd.DataFrame(normalized_features, columns=normalized_columns)


# Create a TF-IDF vectorizer to convert item_features into feature vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['item_features'])

# Combine the TF-IDF matrix with the normalized columns
data_vec = pd.concat([pd.DataFrame(tfidf_matrix.toarray()), normalized_data], axis=1)

# Calculate the cosine similarity between items
cosine_sim = linear_kernel(data_vec, data_vec)

# Function to get coffee recommendations based on country
def get_recommendations(key_value, cosine_similarities, data):
    key = 'country_of_origin'
    idx = random.choice(data.index[data[key] == key_value].tolist())
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:]  # Exclude the base coffee itself (most similar)
    coffee_indices = [i[0] for i in sim_scores]
    return data[coffee_item_features+normalized_columns].iloc[coffee_indices]

recommendations = get_recommendations('taiwan', cosine_sim, data)
recommendations

Unnamed: 0,variety,processing_method,color,country_of_origin,mill,aftertaste,balance,flavor,aroma,body,acidity,moisture_percentage,quakers,overall,total_cup_points,altitude
1,gesha,washed / wet,blue-green,taiwan,royal bean geisha estate,7.92,8.25,8.50,8.50,7.92,8.00,10.5,0,8.50,87.58,1200.0
3,gesha,washed / wet,green,costa rica,la montana tarrazu mill,8.17,8.08,8.17,8.08,8.17,8.25,11.8,0,8.25,87.17,1900.0
2,java,semi washed,yellowish,laos,oklao coffee processing plant,8.08,8.17,8.42,8.33,7.92,8.17,10.4,0,8.33,87.42,1300.0
4,red bourbon,"honey,mossto",yellow-green,colombia,finca santuario,8.08,7.92,8.33,8.33,7.92,8.25,11.6,2,8.25,87.08,1975.0
5,gesha,washed / wet,green,guatemala,dinámica café,8.25,8.17,8.33,8.33,7.83,7.83,10.7,0,8.25,87.00,1668.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,catuai and mundo novo,natural / dry,greenish,brazil,dry mill,7.17,7.17,7.17,7.25,7.17,7.08,11.6,1,7.17,80.17,1200.0
204,catimor,washed / wet,green,laos,dry mill,7.08,7.08,7.17,7.25,7.08,7.00,11.6,9,7.00,79.67,1300.0
203,shg,natural / dry,green,nicaragua,beneficio atlantic sébaco,6.75,7.17,7.08,7.33,7.42,7.17,10.4,2,7.08,80.00,1200.0
206,mundo novo,semi washed,green,brazil,beneficio humedo/seco,6.67,6.67,7.08,7.25,6.83,6.83,11.3,0,6.67,78.00,975.0


In [15]:
data['altitude'].astype(int)


0      1815
1      1200
2      1300
3      1900
4      1975
       ... 
202     950
203    1200
204    1300
205    1200
206     975
Name: altitude, Length: 207, dtype: int32

In [32]:
get_recommendations('taiwan', cosine_sim, data)


Unnamed: 0,variety,processing_method,color,country_of_origin,mill,aftertaste,balance,flavor,aroma,body,acidity,moisture_percentage,quakers,overall,total_cup_points,altitude
1,gesha,washed / wet,blue-green,taiwan,royal bean geisha estate,7.92,8.25,8.50,8.50,7.92,8.00,10.5,0,8.50,87.58,1200.0
7,sl34+gesha,natural / dry,yellow-green,taiwan,七彩琉璃咖啡莊園,8.17,8.08,8.25,8.25,7.92,8.00,10.0,0,8.08,86.75,1200.0
3,gesha,washed / wet,green,costa rica,la montana tarrazu mill,8.17,8.08,8.17,8.08,8.17,8.25,11.8,0,8.25,87.17,1900.0
8,sl34,washed / wet,greenish,taiwan,亮軒咖啡莊園,8.25,8.00,8.08,8.08,7.92,8.08,10.8,0,8.25,86.67,1250.0
2,java,semi washed,yellowish,laos,oklao coffee processing plant,8.08,8.17,8.42,8.33,7.92,8.17,10.4,0,8.33,87.42,1300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,"castillo,caturra,bourbon",washed / wet,greenish,colombia,finca mallorca,7.17,7.17,7.25,7.25,7.00,7.33,11.0,3,7.25,80.42,1600.0
203,shg,natural / dry,green,nicaragua,beneficio atlantic sébaco,6.75,7.17,7.08,7.33,7.42,7.17,10.4,2,7.08,80.00,1200.0
204,catimor,washed / wet,green,laos,dry mill,7.08,7.08,7.17,7.25,7.08,7.00,11.6,9,7.00,79.67,1300.0
205,maragogype,natural / dry,bluish-green,el salvador,"optimum coffee, san salvador, el salvador",6.75,7.00,6.75,6.50,7.08,7.17,11.0,12,6.83,78.08,1200.0
