In [1]:
import numpy as np
import pandas as pd

In [2]:
df_jokes = pd.read_csv('JokeText.csv')
df_ratings1 = pd.read_csv('UserRatings1.csv')

In [3]:
df_jokes.head()

Unnamed: 0,JokeId,JokeText
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,3,Q. What's the difference between a man and a t...
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...


In [4]:
df_ratings1.head()

Unnamed: 0,JokeId,User1,User2,User3,User4,User5,User6,User7,User8,User9,...,User36701,User36702,User36703,User36704,User36705,User36706,User36707,User36708,User36709,User36710
0,0,5.1,-8.79,-3.5,7.14,-8.79,9.22,-4.03,3.11,-3.64,...,,,,,,,,,2.91,
1,1,4.9,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,-3.35,...,,,,-5.63,,-6.07,,-1.6,-4.56,
2,2,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,-6.46,...,,,,,,4.08,,,8.98,
3,3,-4.17,-4.61,-0.1,0.05,8.98,9.27,-6.99,0.49,-3.4,...,,,,,,,,,,
4,4,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,1.26,...,2.28,-0.49,5.1,-0.29,-3.54,-1.36,7.48,-5.78,0.73,2.62


It's more convenient to replace this with a transposed version.

In [5]:
df_ratings1 = df_ratings1.transpose()[1:]
df_ratings1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
User1,5.1,4.9,1.75,-4.17,5.15,1.75,4.76,3.3,-2.57,-1.41,...,5.34,-4.61,3.59,7.18,0.92,6.31,-4.95,-0.19,3.25,4.37
User2,-8.79,-0.87,1.99,-4.61,5.39,-0.78,1.6,1.07,-8.69,-4.66,...,3.59,1.21,2.86,-0.05,-1.75,-1.02,-0.97,4.13,-1.84,2.96
User3,-3.5,-2.91,-2.18,-0.1,7.52,1.26,-5.39,1.5,-8.4,4.37,...,1.84,-4.03,-1.41,1.65,-3.79,3.98,-6.46,-6.89,-2.33,-7.38
User4,7.14,-3.88,-3.06,0.05,6.26,6.65,-7.52,7.28,-5.15,-7.14,...,-4.47,6.36,4.71,-5.19,6.26,3.93,-2.57,1.07,2.33,-0.34
User5,-8.79,-0.58,-0.58,8.98,7.67,8.25,4.08,2.52,-9.66,2.48,...,-0.29,9.37,8.3,9.13,-3.45,9.13,9.17,9.17,9.08,8.98


Some of the ratings are negative, ranging from -10 to 10. To avoid negative and positive ratings cancelling, I add 10 to the whole dataframe here.

In [6]:
df_ratings1 = df_ratings1.add(10, axis=0)

We also need an average rating feature.

In [7]:
df_ratings1['average_rating'] = df_ratings1.mean(axis=1)

Note I dropped a row that would correspond to the transposed JokeID, since the JokeID feature duplicated thee index, this was simply convenient.

# Recommender Outline
For some usere
1. Pick neighborhood (at least pick users who have rated the same jokes)
2. Calculate similarity between input user and every other neighbor in neighborhood (Pearson's)
3. Prediict based on the neighborhood

In [8]:
users = [col for col in df_ratings1.columns if col != 'average_rating']

In [10]:
def find_common_ratings(df, rated, N):
    '''
    Finds users that have rated the jokes in the list rated
    Parameters: df, a dataframe containing user ratings
                rated, a list of integers representing jokes the user has rated
                N, size of the neighborhood (ideally calculate similarity first, but this saves on computation)
    Returns: subset, a dataframe containing as rows users who have rated the jokes
            in "rated"
    '''
    subset = df.copy()
    for i in rated:
        subset = subset.loc[subset[i].notna()]
    return subset[:N]

rated = [0,1,3]
subset = find_common_ratings(df_ratings1, rated, 10000)
subset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,average_rating
User1,15.1,14.9,11.75,5.83,15.15,11.75,14.76,13.3,7.43,8.59,...,5.39,13.59,17.18,10.92,16.31,5.05,9.81,13.25,14.37,11.747
User2,1.21,9.13,11.99,5.39,15.39,9.22,11.6,11.07,1.31,5.34,...,11.21,12.86,9.95,8.25,8.98,9.03,14.13,8.16,12.96,9.4166
User3,6.5,7.09,7.82,9.9,17.52,11.26,4.61,11.5,1.6,14.37,...,5.97,8.59,11.65,6.21,13.98,3.54,3.11,7.67,2.62,9.2565
User4,17.14,6.12,6.94,10.05,16.26,16.65,2.48,17.28,4.85,2.86,...,16.36,14.71,4.81,16.26,13.93,7.43,11.07,12.33,9.66,10.7567
User5,1.21,9.42,9.42,18.98,17.67,18.25,14.08,12.52,0.34,12.48,...,19.37,18.3,19.13,6.55,19.13,19.17,19.17,19.08,18.98,12.942


In [11]:
corr = subset[users].T.corr() # Pandas ignores NaNs by default

Next, calculate predictions.

In [21]:
def single_prediction(similarity_vector, rating_deviations, user_idx, joke):
    rating_dev_vector = rating_deviations[joke]

    # Sum over only other users, not input user
    similarity_vector = np.delete(similarity_vector, user_idx)
    rating_dev_vector = np.delete(rating_dev_vector, user_idx)

    numerator = np.dot(similarity_vector, rating_dev_vector) # weighted sum over other users of similaritiies and differences between rating and user mean
    denominator = np.sum(similarity_vector) # normaalizing term, sum of similarities between user and all other useers in neighborhood
    
    prediction = numerator/denominator
    
    return prediction

def predict_jokes(similarity_matrix, ratings_matrix, user, jokes):
    user_idx = int(user.split('r')[1])
    user_mean = ratings_matrix['average_rating'][user]
    similarity_vector = corr[user].values
    
    rating_deviations = ratings_matrix.sub(ratings_matrix['average_rating'], axis=0)[users].T.values

    predictions = []
    
    for joke in jokes:
        predictions.append(single_prediction(similarity_vector, rating_deviations, user_idx, joke))

    predictions = np.asarray(predictions) + user_mean - 10 # Remove 10 which was added initially
    return predictions
    

predict_jokes(corr, subset, 'User99', [1,2,3, 20])

array([5.95853705, 5.80349167, 2.71969647, 6.66842115])

In [25]:
df_jokes.iloc[[2,1,3]]

Unnamed: 0,JokeId,JokeText
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
1,1,This couple had an excellent relationship goin...
3,3,Q. What's the difference between a man and a t...


In [33]:
a = [4,2,10]
b = [1,2,3]

c = list(zip(a,b))

sort = sorted(c, key=lambda x:x[0])

list(zip(*sort))[1]

(2, 1, 3)

In [None]:
def give_recommendations(user, ratings_matrix, joke_matrix, rated_jokes, n_neighbors):
    users = [col for col in ratings_matrix.columns if col != 'average_rating']
    
    neighbors = find_common_ratings(ratings_matrix, rated_jokes, n_neighbors)
    
    similarity_matrix = neighbors[users].T.corr()
    
    predictions = predict_jokes(similarity_matrix, ratings_matrix, user, rated_jokes)
    
    zip_jokes_ratings = list(zip(predictions, rated_jokes))
    
    
    return predictions