### Load the data

In [42]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [46]:
print("Train Data:")
print(train.head())

print("\nTest Data:")
print(test.head())

Train Data:
      id      cuisine                                        ingredients
0  10259        greek  romaine lettuce,black olives,grape tomatoes,ga...
1  25693  southern_us  plain flour,ground pepper,salt,tomatoes,ground...
2  20130     filipino  eggs,pepper,salt,mayonaise,cooking oil,green c...
3  22213       indian                     water,vegetable oil,wheat,salt
4  13162       indian  black pepper,shallots,cornflour,cayenne pepper...

Test Data:
      id                                        ingredients
0  18009  baking powder,eggs,all-purpose flour,raisins,m...
1  28583  sugar,egg yolks,corn starch,cream of tartar,ba...
2  41580  sausage links,fennel bulb,fronds,olive oil,cub...
3  29752  meat cuts,file powder,smoked sausage,okra,shri...
4  35687  ground black pepper,salt,sausage casings,leeks...


In [47]:
print("\nTrain Data Columns:", train.columns)
print("Test Data Columns:", test.columns)


Train Data Columns: Index(['id', 'cuisine', 'ingredients'], dtype='object')
Test Data Columns: Index(['id', 'ingredients'], dtype='object')


### Preprocess the Data
#### Data Cleaning: Handle missing values.
#### Feature Extraction: Convert ingredients into a numerical format.

In [48]:
print("Missing values in train data:\n", train.isnull().sum())
print("\nMissing values in test data:\n", test.isnull().sum())

Missing values in train data:
 id             0
cuisine        0
ingredients    0
dtype: int64

Missing values in test data:
 id             0
ingredients    0
dtype: int64


In [50]:
# Fill missing values if any

train.fillna('', inplace=True)
test.fillna('', inplace=True)

In [52]:
train['all_ingredients'] = train['ingredients'].apply(lambda x: ' '.join(x.split(',')))
test['all_ingredients'] = test['ingredients'].apply(lambda x: ' '.join(x.split(',')))

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert ingredients into TF-IDF features
tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(train['all_ingredients'])
X_test_tfidf = tfidf.transform(test['all_ingredients'])

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(39774, 2970)
(9944, 2970)


In [55]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# combined_ingredients = pd.concat([train_df['ingredients'], test_df['ingredients']])
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(combined_ingredients)

# tfidf_train = tfidf_matrix[:len(train_df)]
# tfidf_test = tfidf_matrix[len(train_df):]

# if 'dietary_restrictions' in train_df.columns:
#     train_df = pd.get_dummies(train_df, columns=['dietary_restrictions'])
# if 'dietary_restrictions' in test_df.columns:
#     test_df = pd.get_dummies(test_df, columns=['dietary_restrictions'])

## Implement the Recommendation System


#### Collaborative Filtering

In [56]:
print(train.head())


      id      cuisine                                        ingredients  \
0  10259        greek  romaine lettuce,black olives,grape tomatoes,ga...   
1  25693  southern_us  plain flour,ground pepper,salt,tomatoes,ground...   
2  20130     filipino  eggs,pepper,salt,mayonaise,cooking oil,green c...   
3  22213       indian                     water,vegetable oil,wheat,salt   
4  13162       indian  black pepper,shallots,cornflour,cayenne pepper...   

                                     all_ingredients  
0  romaine lettuce black olives grape tomatoes ga...  
1  plain flour ground pepper salt tomatoes ground...  
2  eggs pepper salt mayonaise cooking oil green c...  
3                     water vegetable oil wheat salt  
4  black pepper shallots cornflour cayenne pepper...  


#### User-based collaborative filtering.

In [59]:
np.random.seed(0)
user_ratings = pd.DataFrame(np.random.randint(1, 6, size=(100, len(train))), columns=train['id'])

def collaborative_filtering(user_id, user_ratings, n_recommendations=5):
    user_similarity = cosine_similarity(user_ratings)
    similar_users = user_similarity[user_id].argsort()[:-n_recommendations-1:-1]
    recommendations = user_ratings.iloc[similar_users].mean().sort_values(ascending=False)
    return recommendations.index.values[:n_recommendations]

print(collaborative_filtering(0, user_ratings))

[47823 16886 20604 41720  6436]


#### Content-Based Filtering

In [58]:
def content_based_recommendations(recipe_id, X_tfidf, n_recommendations=5):
    cosine_similarities = cosine_similarity(X_tfidf[recipe_id], X_tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-n_recommendations-1:-1]
    return related_docs_indices

print(content_based_recommendations(0, X_train_tfidf))

[    0 34089 16419 37955 32784]


### Hybrid Recommendation System

In [60]:
def hybrid_recommendations(user_id, recipe_id, X_tfidf, user_ratings, n_recommendations=5):
    content_recommendations = content_based_recommendations(recipe_id, X_tfidf, n_recommendations)
    collab_recommendations = collaborative_filtering(user_id, user_ratings, n_recommendations)
    
    hybrid_scores = np.mean([content_recommendations, collab_recommendations], axis=0)
    hybrid_recommendations = np.argsort(hybrid_scores)[:n_recommendations]
    return hybrid_recommendations

print(hybrid_recommendations(0, 0, X_train_tfidf, user_ratings))

[2 4 0 1 3]


#### Evaluate the Performance

In [61]:
y_true = np.random.randint(0, 2, size=100)
y_pred = np.random.randint(0, 2, size=100)

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.52
Precision: 0.5454545454545454
Recall: 0.5660377358490566
F1 Score: 0.5555555555555556


## Validation

#### Train-Test Split

In [63]:
train_indices, test_indices = train_test_split(range(X_train_tfidf.shape[0]), test_size=0.2, random_state=42)

X_train_split = X_train_tfidf[train_indices]
X_test_split = X_train_tfidf[test_indices]
y_train_split = train['cuisine'].iloc[train_indices]
y_test_split = train['cuisine'].iloc[test_indices]

print(X_train_split.shape, X_test_split.shape, y_train_split.shape, y_test_split.shape)

(31819, 2970) (7955, 2970) (31819,) (7955,)


### Cross-Validation

#### k-fold cross-validation

In [65]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X_train_tfidf):
    X_train_kfold, X_val_kfold = X_train_tfidf[train_index], X_train_tfidf[val_index]
    y_train_kfold, y_val_kfold = train['cuisine'].iloc[train_index], train['cuisine'].iloc[val_index]
    
print(X_train_kfold.shape, X_val_kfold.shape, y_train_kfold.shape, y_val_kfold.shape)

(31820, 2970) (7954, 2970) (31820,) (7954,)


## Handle Additional Challenges


#### Cold-Start Problem

In [66]:
def recommend_for_new_user(X_tfidf, n_recommendations=5):

    mean_tfidf = np.mean(X_tfidf, axis=0)
    popular_indices = mean_tfidf.argsort()[0, -n_recommendations:].tolist()[0]
    return popular_indices

print(recommend_for_new_user(X_train_tfidf))

[1030, 1189, 1837, 2321, 1970]


#### For New Recipes

In [67]:
def recommend_new_recipe(recipe_ingredients, tfidf, X_tfidf, n_recommendations=5):
   
    new_recipe_tfidf = tfidf.transform([recipe_ingredients])
    
    cosine_similarities = cosine_similarity(new_recipe_tfidf, X_tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-n_recommendations-1:-1]
    return related_docs_indices

# Example: Recommendations for a new recipe
new_recipe_ingredients = "tomato garlic onion basil"
print(recommend_new_recipe(new_recipe_ingredients, tfidf, X_train_tfidf))

[17997  8620  6957 18087 33862]
