# Recommender

# Data Load Test

In [12]:
import pandas as pd
data = pd.read_csv('app/coffee_may2023_prep.csv')
data.head(10)

Unnamed: 0,lot_number,producer,mill,country_of_origin,variety,processing_method,aroma,flavor,aftertaste,acidity,body,balance,moisture_percentage,quakers,color,altitude
0,cqu2022015,diego samuel bermudez,finca el paraiso,colombia,castillo,double anaerobic washed,8.58,8.5,8.42,8.58,8.25,8.42,11.8,0,green,1815.0
1,"the 2022 pacific rim coffee summit,t037",曾福森,royal bean geisha estate,taiwan,gesha,washed / wet,8.5,8.5,7.92,8.0,7.92,8.25,10.5,0,blue-green,1200.0
2,"the 2022 pacific rim coffee summit,la01",wu tao chi,oklao coffee processing plant,laos,java,semi washed,8.33,8.42,8.08,8.17,7.92,8.17,10.4,0,yellowish,1300.0
3,cqu2022017,santa maria de dota,la montana tarrazu mill,costa rica,gesha,washed / wet,8.08,8.17,8.17,8.25,8.17,8.08,11.8,0,green,1900.0
4,cqu2023002,camilo merizalde,finca santuario,colombia,red bourbon,"honey,mossto",8.33,8.33,8.08,8.25,7.92,7.92,11.6,2,yellow-green,1975.0
5,"the 2022 pacific rim coffee summit,gt02",emilio antonio medina garcia,dinámica café,guatemala,gesha,washed / wet,8.33,8.33,8.25,7.83,7.83,8.17,10.7,0,green,1668.0
6,"the 2022 pacific rim coffee summit,t034",黃保錫,野牡丹咖啡,taiwan,gesha,washed / wet,8.33,8.17,8.08,8.0,7.83,8.25,9.1,0,green,1250.0
7,"the 2022 pacific rim coffee summit,t050",莊家榮,七彩琉璃咖啡莊園,taiwan,sl34+gesha,natural / dry,8.25,8.25,8.17,8.0,7.92,8.08,10.0,0,yellow-green,1200.0
8,"the 2022 pacific rim coffee summit,t018",鍾旭亮,亮軒咖啡莊園,taiwan,sl34,washed / wet,8.08,8.08,8.25,8.08,7.92,8.0,10.8,0,greenish,1250.0
9,cn 4127230034/4189230113,dorman (t) limited,gourmet coffee mill,"tanzania, united republic of",bourbon,washed / wet,8.08,8.17,8.08,8.17,8.0,8.0,11.0,0,greenish,1550.0


# Recommender

- This recommender is based in [Cosine Similarity](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html#sklearn.metrics.pairwise.cosine_similarity): used in information retrieval and item vectorized attributes. It calculates the similarity between two vectors.
- The vectors are created from coffee's string features and the normalized values [0 , 1] of its numeric features.
- string features: 'variety', 'processing_method','color', 'country_of_origin', 'mill'
- numeric features: 'aftertaste','balance','flavor','aroma','body','acidity','moisture_percentage','quakers','overall','total_cup_points','altitude'
- a search used by key (column name) and its value, helps to retrieve a base coffee ID to get the recommendation.

In [13]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import linear_kernel
import random

# Load preprocessed data
data = pd.read_csv('app/coffee_may2023_prep.csv')

def get_coffee_recommendation(key_value='costa rica',key_id='country_of_origin', top_n=10):
        # Concatenate multiple columns into a single feature column for item representation
        coffee_item_features = ['variety', 'processing_method','color', 'country_of_origin', 'mill']
        data['item_features'] = data[coffee_item_features].agg(' '.join, axis=1)

        # Normalize 'rating' column to [0, 1] range for feature representation
        scaler = MinMaxScaler()
        normalized_columns = ['aftertaste','balance','flavor','aroma','body','acidity','moisture_percentage','quakers','altitude']
        normalized_features = scaler.fit_transform(data[normalized_columns])
        normalized_data = pd.DataFrame(normalized_features, columns=normalized_columns)

        # Create a TF-IDF vectorizer to convert item_features into feature vectors
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(data['item_features'])

        # Combine the TF-IDF matrix with the normalized columns
        data_vec = pd.concat([pd.DataFrame(tfidf_matrix.toarray()), normalized_data], axis=1)

        # Calculate the cosine similarity between items
        cosine_sim = linear_kernel(data_vec, data_vec)

        # Function to get coffee recommendations based on country
        def get_recommendations(key_value,key_id, cosine_similarities, data):
            idx = random.choice(data.index[data[key_id] == key_value].tolist())
            sim_scores = list(enumerate(cosine_similarities[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[1:]  # Exclude the base coffee itself (most similar)
            coffee_indices = [i[0] for i in sim_scores]
            return data[coffee_item_features+normalized_columns].iloc[coffee_indices]

        # get recommendations using a key and value
        recommendations = get_recommendations(key_value,key_id, cosine_sim, data)
        # return top_n values
        return recommendations[0:top_n]

# Test

## by Country

In [14]:
get_coffee_recommendation(key_value='costa rica',key_id='country_of_origin', top_n=10)

Unnamed: 0,variety,processing_method,color,country_of_origin,mill,aftertaste,balance,flavor,aroma,body,acidity,moisture_percentage,quakers,altitude
0,castillo,double anaerobic washed,green,colombia,finca el paraiso,8.42,8.42,8.5,8.58,8.25,8.58,11.8,0,1815.0
1,gesha,washed / wet,blue-green,taiwan,royal bean geisha estate,7.92,8.25,8.5,8.5,7.92,8.0,10.5,0,1200.0
4,red bourbon,"honey,mossto",yellow-green,colombia,finca santuario,8.08,7.92,8.33,8.33,7.92,8.25,11.6,2,1975.0
2,java,semi washed,yellowish,laos,oklao coffee processing plant,8.08,8.17,8.42,8.33,7.92,8.17,10.4,0,1300.0
5,gesha,washed / wet,green,guatemala,dinámica café,8.25,8.17,8.33,8.33,7.83,7.83,10.7,0,1668.0
68,gesha,washed / wet,green,costa rica,la montana tarrazu mill,7.67,7.75,7.83,7.75,7.75,7.83,11.7,1,1900.0
12,gesha,washed / wet,green,taiwan,古峰咖啡莊園,8.08,8.0,8.0,8.08,8.0,8.08,11.9,0,1250.0
14,caturra,washed / wet,green,colombia,la gaitania,8.08,8.0,8.0,8.08,8.08,7.92,10.6,0,1850.0
11,gesha,natural / dry,brownish,guatemala,cafetoland,8.0,8.17,8.0,8.08,8.25,7.75,11.5,0,2000.0
7,sl34+gesha,natural / dry,yellow-green,taiwan,七彩琉璃咖啡莊園,8.17,8.08,8.25,8.25,7.92,8.0,10.0,0,1200.0


## by Color

In [15]:
get_coffee_recommendation(key_value='greenish',key_id='color', top_n=10)

Unnamed: 0,variety,processing_method,color,country_of_origin,mill,aftertaste,balance,flavor,aroma,body,acidity,moisture_percentage,quakers,altitude
1,gesha,washed / wet,blue-green,taiwan,royal bean geisha estate,7.92,8.25,8.5,8.5,7.92,8.0,10.5,0,1200.0
7,sl34+gesha,natural / dry,yellow-green,taiwan,七彩琉璃咖啡莊園,8.17,8.08,8.25,8.25,7.92,8.0,10.0,0,1200.0
3,gesha,washed / wet,green,costa rica,la montana tarrazu mill,8.17,8.08,8.17,8.08,8.17,8.25,11.8,0,1900.0
10,ethiopian heirlooms,natural / dry,greenish,ethiopia,moledo社　委託精選場,8.0,7.92,8.25,8.08,7.92,8.08,11.8,1,2000.0
8,sl34,washed / wet,greenish,taiwan,亮軒咖啡莊園,8.25,8.0,8.08,8.08,7.92,8.08,10.8,0,1250.0
4,red bourbon,"honey,mossto",yellow-green,colombia,finca santuario,8.08,7.92,8.33,8.33,7.92,8.25,11.6,2,1975.0
2,java,semi washed,yellowish,laos,oklao coffee processing plant,8.08,8.17,8.42,8.33,7.92,8.17,10.4,0,1300.0
13,gesha,natural / dry,yellow-green,ethiopia,dry mill or hulling facility,8.0,8.0,8.17,7.67,8.0,8.33,11.6,3,1950.0
134,typica,natural / dry,greenish,taiwan,蜜多莊園,7.5,7.42,7.75,7.75,7.5,7.75,9.7,0,250.0
9,bourbon,washed / wet,greenish,"tanzania, united republic of",gourmet coffee mill,8.08,8.0,8.17,8.08,8.0,8.17,11.0,0,1550.0


## by Processing Method

In [16]:
get_coffee_recommendation(key_value='natural / dry',key_id='processing_method', top_n=10)

Unnamed: 0,variety,processing_method,color,country_of_origin,mill,aftertaste,balance,flavor,aroma,body,acidity,moisture_percentage,quakers,altitude
195,bourbon,natural / dry,bluish-green,el salvador,"optimum coffee, san salvador, el salvador",7.33,7.0,7.33,7.42,7.08,7.5,10.5,8,1400.0
0,castillo,double anaerobic washed,green,colombia,finca el paraiso,8.42,8.42,8.5,8.58,8.25,8.58,11.8,0,1815.0
205,maragogype,natural / dry,bluish-green,el salvador,"optimum coffee, san salvador, el salvador",6.75,7.0,6.75,6.5,7.08,7.17,11.0,12,1200.0
165,bourbon,washed / wet,green,el salvador,agua caliente,7.42,7.67,7.5,7.5,7.42,7.42,12.0,7,1200.0
13,gesha,natural / dry,yellow-green,ethiopia,dry mill or hulling facility,8.0,8.0,8.17,7.67,8.0,8.33,11.6,3,1950.0
4,red bourbon,"honey,mossto",yellow-green,colombia,finca santuario,8.08,7.92,8.33,8.33,7.92,8.25,11.6,2,1975.0
34,ethiopian heirlooms,natural / dry,yellow-green,ethiopia,dry mill,8.0,7.75,8.08,8.0,7.67,8.0,12.3,3,2250.0
60,bourbon,natural / dry,yellow-green,guatemala,eco coffee,7.67,7.75,8.0,7.83,7.75,7.83,10.9,3,4895.0
3,gesha,washed / wet,green,costa rica,la montana tarrazu mill,8.17,8.08,8.17,8.08,8.17,8.25,11.8,0,1900.0
76,ethiopian heirlooms,natural / dry,yellow-green,ethiopia,washing station,7.58,7.5,7.75,7.83,8.0,7.92,11.0,5,2361.0
