In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer 
df = pd.read_csv('resources.csv')

In [18]:
df.head()

Unnamed: 0,r_id,name,url,tags,vote_average,vote_count
0,1,Headspace,https://www.headspace.com/,"[mindfulness, meditation, sleep, stress, anxie...",4,3045
1,2,Samaritans Website,https://www.samaritans.org/,"[information, suicide, suicidal, kill, harm, die]",5,4357
2,3,NHS Suicide Help,https://www.nhs.uk/conditions/suicide/,"[information, suicide, suicidal, death, harm]",5,123
3,4,NHS Self Harm Help,https://www.nhs.uk/conditions/self-harm/,"[self-harm, hurt, selfloathing, past-trauma",4,53
4,5,Samaritans Support,https://www.samaritans.org/how-we-can-help/con...,"[helpline, phoneline, chat, suicide, suicidal,...",3,1094


In [19]:


#ranking system


C = df['vote_average'].mean()
C

3.409090909090909

In [20]:
m = df['vote_count'].quantile(0.6)
m

3752.3999999999996

In [21]:
qualified = df.copy().loc[df['vote_count'] >= m]
qualified.shape

(9, 6)

In [22]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [23]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
qualified['score'] = qualified.apply(weighted_rating, axis=1)

In [24]:
df[df['tags'].str.contains("suicide")]

Unnamed: 0,r_id,name,url,tags,vote_average,vote_count
1,2,Samaritans Website,https://www.samaritans.org/,"[information, suicide, suicidal, kill, harm, die]",5,4357
2,3,NHS Suicide Help,https://www.nhs.uk/conditions/suicide/,"[information, suicide, suicidal, death, harm]",5,123
4,5,Samaritans Support,https://www.samaritans.org/how-we-can-help/con...,"[helpline, phoneline, chat, suicide, suicidal,...",3,1094


In [25]:
#Sort movies based on score calculated above
qualified = qualified.sort_values('score', ascending=False)

#Print the top 15 movies
qualified[['name', 'vote_count', 'vote_average', 'score']].head(4)

Unnamed: 0,name,vote_count,vote_average,score
1,Samaritans Website,4357,5,4.263851
10,The Mix: Support,5474,4,3.759676
21,Grief: Support for Young People,5424,4,3.758366
15,Kooth,5095,4,3.749381


In [26]:
df = df[['name','tags']]
df.head()

Unnamed: 0,name,tags
0,Headspace,"[mindfulness, meditation, sleep, stress, anxie..."
1,Samaritans Website,"[information, suicide, suicidal, kill, harm, die]"
2,NHS Suicide Help,"[information, suicide, suicidal, death, harm]"
3,NHS Self Harm Help,"[self-harm, hurt, selfloathing, past-trauma"
4,Samaritans Support,"[helpline, phoneline, chat, suicide, suicidal,..."


In [27]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['tags'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
print(cosine_sim)

[[1.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.30151134
  0.23570226 0.         0.38490018 0.        ]
 [0.         1.         0.73029674 0.16666667 0.6172134  0.
  0.         0.         0.16666667 0.         0.         0.
  0.16666667 0.16666667 0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.73029674 1.         0.18257419 0.50709255 0.2
  0.2        0.18257419 0.18257419 0.         0.         0.
  0.18257419 0.18257419 0.         0.         0.         0.
  0.         0.         0.         0.16903085]
 [0.         0.16666667 0.18257419 1.         0.15430335 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.6172134  0.50709255 0.15430335 1.         0.
  0.         0.15430335

In [28]:
indices = pd.Series(df['name'])

In [29]:
def recommend(name, cosine_sim = cosine_sim):
    recommended_rec = []
    idx = indices[indices == name].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10_indices = list(score_series.iloc[1:11].index)
    
    for i in top_10_indices:
        recommended_rec.append(list(df['name'])[i])
        
    return recommended_rec

In [30]:
recommend('Headspace')

['Smiling Mind',
 'Wysa',
 'Stoic',
 'Young Minds: Bullying',
 'Samaritans Website',
 'NHS Suicide Help',
 'NHS Self Harm Help',
 'Samaritans Support',
 'Young Minds: Grief and Loss',
 'NHS: Bereavement and Young People']