In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import linear_kernel
import random



data = pd.read_csv('app/coffee_may2023_prep.csv')


# Concatenate multiple columns into a single feature column for item representation
coffee_item_features = ['variety', 'processing_method','color', 'country_of_origin', 'mill']
data['item_features'] = data[coffee_item_features].agg(' '.join, axis=1)


# Normalize 'rating' column to [0, 1] range for feature representation
scaler = MinMaxScaler()
normalized_columns = ['aftertaste','balance','flavor','aroma','body','acidity','moisture_percentage','quakers','overall','total_cup_points','altitude']
normalized_features = scaler.fit_transform(data[normalized_columns])
normalized_data = pd.DataFrame(normalized_features, columns=normalized_columns)


# Create a TF-IDF vectorizer to convert item_features into feature vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['item_features'])

# Combine the TF-IDF matrix with the normalized columns
data_vec = pd.concat([pd.DataFrame(tfidf_matrix.toarray()), normalized_data], axis=1)

# Calculate the cosine similarity between items
cosine_sim = linear_kernel(data_vec, data_vec)

# Function to get coffee recommendations based on country
def get_recommendations(key_value, cosine_similarities, data):
    key = 'country_of_origin'
    idx = random.choice(data.index[data[key] == key_value].tolist())
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:]  # Exclude the movie itself (most similar)
    coffee_indices = [i[0] for i in sim_scores]
    return data[coffee_item_features+normalized_columns].iloc[coffee_indices]

recommendations = get_recommendations('taiwan', cosine_sim, data)
recommendations

Unnamed: 0,variety,processing_method,color,country_of_origin,mill,aftertaste,balance,flavor,aroma,body,acidity,moisture_percentage,quakers,overall,total_cup_points,altitude
1,gesha,washed / wet,blue-green,taiwan,royal bean geisha estate,7.92,8.25,8.50,8.50,7.92,8.00,10.5,0,8.50,87.58,1200.0
3,gesha,washed / wet,green,costa rica,la montana tarrazu mill,8.17,8.08,8.17,8.08,8.17,8.25,11.8,0,8.25,87.17,1900.0
2,java,semi washed,yellowish,laos,oklao coffee processing plant,8.08,8.17,8.42,8.33,7.92,8.17,10.4,0,8.33,87.42,1300.0
4,red bourbon,"honey,mossto",yellow-green,colombia,finca santuario,8.08,7.92,8.33,8.33,7.92,8.25,11.6,2,8.25,87.08,1975.0
5,gesha,washed / wet,green,guatemala,dinámica café,8.25,8.17,8.33,8.33,7.83,7.83,10.7,0,8.25,87.00,1668.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,mundo novo,natural / dry,green,brazil,dry mill,6.92,7.17,7.17,7.17,7.42,7.17,11.4,0,7.08,80.08,950.0
204,catimor,washed / wet,green,laos,dry mill,7.08,7.08,7.17,7.25,7.08,7.00,11.6,9,7.00,79.67,1300.0
203,shg,natural / dry,green,nicaragua,beneficio atlantic sébaco,6.75,7.17,7.08,7.33,7.42,7.17,10.4,2,7.08,80.00,1200.0
206,mundo novo,semi washed,green,brazil,beneficio humedo/seco,6.67,6.67,7.08,7.25,6.83,6.83,11.3,0,6.67,78.00,975.0


In [15]:
data['altitude'].astype(int)


0      1815
1      1200
2      1300
3      1900
4      1975
       ... 
202     950
203    1200
204    1300
205    1200
206     975
Name: altitude, Length: 207, dtype: int32

In [105]:
normalized_features = scaler.fit_transform(data[normalized_columns])
normalized_data = pd.DataFrame(normalized_features, columns=normalized_columns)


In [95]:
sim_scoress = list(enumerate(cosine_sim[2]))
sim_scoress = sorted(sim_scoress, key=lambda x: x[1], reverse=True)
sim_scoress
coffee_indices = [i[0] for i in sim_scoress]
coffee_indices

[0,
 2,
 1,
 3,
 4,
 5,
 6,
 8,
 9,
 7,
 12,
 11,
 10,
 13,
 14,
 16,
 20,
 17,
 15,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 32,
 30,
 34,
 27,
 28,
 37,
 29,
 31,
 26,
 33,
 35,
 39,
 36,
 47,
 41,
 38,
 43,
 40,
 48,
 42,
 46,
 53,
 44,
 51,
 45,
 52,
 60,
 49,
 56,
 55,
 59,
 63,
 50,
 57,
 54,
 58,
 65,
 85,
 67,
 68,
 61,
 70,
 69,
 119,
 66,
 71,
 64,
 62,
 78,
 73,
 81,
 72,
 76,
 92,
 79,
 82,
 77,
 75,
 80,
 74,
 96,
 91,
 83,
 86,
 84,
 102,
 87,
 90,
 101,
 94,
 100,
 97,
 89,
 99,
 104,
 95,
 88,
 98,
 105,
 115,
 117,
 93,
 103,
 109,
 114,
 124,
 112,
 116,
 106,
 107,
 113,
 127,
 108,
 136,
 122,
 120,
 111,
 118,
 128,
 126,
 110,
 129,
 135,
 133,
 131,
 139,
 132,
 123,
 121,
 130,
 137,
 146,
 145,
 134,
 141,
 142,
 150,
 138,
 148,
 143,
 147,
 144,
 149,
 140,
 159,
 151,
 158,
 156,
 152,
 166,
 155,
 161,
 160,
 163,
 165,
 153,
 157,
 154,
 168,
 164,
 167,
 162,
 125,
 170,
 171,
 178,
 177,
 169,
 190,
 176,
 179,
 175,
 172,
 180,
 181,
 182,
 183,
 185,
 173,


In [32]:
get_recommendations('taiwan', cosine_sim, data)


Unnamed: 0,variety,processing_method,color,country_of_origin,mill,aftertaste,balance,flavor,aroma,body,acidity,moisture_percentage,quakers,overall,total_cup_points,altitude
1,gesha,washed / wet,blue-green,taiwan,royal bean geisha estate,7.92,8.25,8.50,8.50,7.92,8.00,10.5,0,8.50,87.58,1200.0
7,sl34+gesha,natural / dry,yellow-green,taiwan,七彩琉璃咖啡莊園,8.17,8.08,8.25,8.25,7.92,8.00,10.0,0,8.08,86.75,1200.0
3,gesha,washed / wet,green,costa rica,la montana tarrazu mill,8.17,8.08,8.17,8.08,8.17,8.25,11.8,0,8.25,87.17,1900.0
8,sl34,washed / wet,greenish,taiwan,亮軒咖啡莊園,8.25,8.00,8.08,8.08,7.92,8.08,10.8,0,8.25,86.67,1250.0
2,java,semi washed,yellowish,laos,oklao coffee processing plant,8.08,8.17,8.42,8.33,7.92,8.17,10.4,0,8.33,87.42,1300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,"castillo,caturra,bourbon",washed / wet,greenish,colombia,finca mallorca,7.17,7.17,7.25,7.25,7.00,7.33,11.0,3,7.25,80.42,1600.0
203,shg,natural / dry,green,nicaragua,beneficio atlantic sébaco,6.75,7.17,7.08,7.33,7.42,7.17,10.4,2,7.08,80.00,1200.0
204,catimor,washed / wet,green,laos,dry mill,7.08,7.08,7.17,7.25,7.08,7.00,11.6,9,7.00,79.67,1300.0
205,maragogype,natural / dry,bluish-green,el salvador,"optimum coffee, san salvador, el salvador",6.75,7.00,6.75,6.50,7.08,7.17,11.0,12,6.83,78.08,1200.0


In [43]:
#+ ' ' + data['processing_method'] + ' ' + data['country_of_origin']+' ' + data['mill']
#data['variety'].isna()

#data[data['mill'].isna()]
#data.tail(3)

import random
listtt = data.index[data['country_of_origin'] == 'costa rica'].tolist()
random_item = random.choice(listtt)
random_item


176

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import linear_kernel


data = pd.read_csv('app/coffee_may2023_prep.csv')


# Sample data: movies dataset with 'title', 'description', 'genres', 'actors', and 'rating' columns
data = pd.DataFrame({
    'title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'description': [
        'This is a thrilling action movie with explosions and car chases.',
        'A heartwarming comedy about friends on a road trip.',
        'A drama about love, loss, and redemption.',
        'An animated adventure with talking animals.'
    ],
    'genres': [
        'Action',
        'Comedy',
        'Drama',
        'Animation'
    ],
    'actors': [
        'Actor A, Actor B',
        'Actor C, Actor D',
        'Actor A, Actor E',
        'Actor F, Actor G'
    ],
    'rating': [4.2, 3.8, 4.5, 4.0]
})

# Concatenate multiple columns into a single feature column for item representation
data['item_features'] = data['description'] + ' ' + data['genres'] + ' ' + data['actors']

# Normalize 'rating' column to [0, 1] range for feature representation
scaler = MinMaxScaler()
data['rating_normalized'] = scaler.fit_transform(data['rating'].values.reshape(-1, 1))

# Create a TF-IDF vectorizer to convert item_features into feature vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['item_features'])

# Combine the TF-IDF matrix with the normalized 'rating' column
tfidf_matrix_with_rating = pd.concat([pd.DataFrame(tfidf_matrix.toarray()), data['rating_normalized']], axis=1)

# Calculate the cosine similarity between items
cosine_sim = linear_kernel(tfidf_matrix_with_rating, tfidf_matrix_with_rating)

# Function to get movie recommendations based on movie title
def get_recommendations(title, cosine_similarities, data):
    idx = data.index[data['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:]  # Exclude the movie itself (most similar)
    movie_indices = [i[0] for i in sim_scores]
    return data['title'].iloc[movie_indices]

# Example: Get recommendations for 'Movie A'
recommendations = get_recommendations('Movie D', cosine_sim, data)
print(recommendations)
data

2    Movie C
0    Movie A
1    Movie B
Name: title, dtype: object


Unnamed: 0,title,description,genres,actors,rating,item_features,rating_normalized
0,Movie A,This is a thrilling action movie with explosio...,Action,"Actor A, Actor B",4.2,This is a thrilling action movie with explosio...,0.571429
1,Movie B,A heartwarming comedy about friends on a road ...,Comedy,"Actor C, Actor D",3.8,A heartwarming comedy about friends on a road ...,0.0
2,Movie C,"A drama about love, loss, and redemption.",Drama,"Actor A, Actor E",4.5,"A drama about love, loss, and redemption. Dram...",1.0
3,Movie D,An animated adventure with talking animals.,Animation,"Actor F, Actor G",4.0,An animated adventure with talking animals. An...,0.285714
