In [2]:
from src.core import DataLoader
import pandas as pd
import numpy as np
from src.preprocessing import PreprocessingPipeline, PunctuationNode, StopWordsNode, LemmatizationNode

In [3]:
dl = DataLoader('data/jokes.csv', 'data/rating.csv')

In [5]:
dl.jokes

Unnamed: 0,joke_id,joke_text
0,1,Q. What's O. J. Simpson's web address? A. Slas...
1,2,How many feminists does it take to screw in a ...
2,3,Q. Did you hear about the dyslexic devil worsh...
3,4,They asked the Japanese visitor if they have e...
4,5,Q: What did the blind person say when given so...
...,...,...
134,135,"A blonde, brunette, and a red head are all lin..."
135,136,America: 8:00 - Welcome to work! 12:00 - Lunch...
136,137,It was the day of the big sale. Rumors of the ...
137,138,"Recently a teacher, a garbage collector, and a..."


In [6]:
jokes = dl.jokes['joke_text'].tolist()

In [7]:
from src.embeddings import SentenceBert, BagOfWords, TFIDF

In [8]:
bert = SentenceBert()

In [9]:
embedded = bert.to_vec(jokes)

In [10]:
embedded.shape

(139, 384)

In [11]:
from sklearn.cluster import KMeans,SpectralClustering

kmeans = SpectralClustering(n_clusters=10)
kmeans.fit(embedded)
clusters = kmeans.labels_

In [12]:
import numpy as np

In [13]:
clusters

array([0, 4, 7, 6, 7, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7,
       8, 7, 7, 9, 7, 9, 0, 5, 0, 0, 7, 9, 7, 0, 0, 0, 0, 7, 0, 0, 0, 7,
       0, 0, 0, 5, 0, 9, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 3, 0, 0,
       0, 0, 6, 0, 6, 0, 0, 0, 4, 9, 0, 7, 8, 5, 0, 7, 0, 0, 0, 0, 7, 0,
       0, 0, 7, 0, 0, 8, 7, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,
       7, 0, 1, 0, 9, 0, 0, 7, 0, 0, 7, 0, 0, 0, 7, 0, 0, 7, 0, 2, 8, 7,
       0, 9, 0, 0, 0, 7, 0], dtype=int32)

In [14]:
dl.train_data.sort_values('joke_id')

Unnamed: 0,id,user_id,joke_id,Rating
1041818,5_1,5,1,-0.031
421502,2_1,2,1,-9.688
91178,446_1,446,1,-4.031
372845,315_1,315,1,-4.781
1086188,116_1,116,1,4.344
...,...,...,...,...
550742,23445_139,23445,139,2.062
947551,26310_139,26310,139,4.500
87757,5587_139,5587,139,9.750
1012733,2760_139,2760,139,5.812


In [15]:
jokes[42]

'One Sunday morning William burst into the living room and said, "Dad! Mom! I have some great news for you! I am getting married to the most beautiful girl in town. She lives a block away and her name is Susan." After dinner, William\'s dad took him aside. "Son, I have to talk with you. Your mother and I have been married 30 years. She\'s a wonderful wife but she has never offered much excitement in the bedroom, so I used to fool around with women a lot. Susan is actually your half-sister, and I\'m afraid you can\'t marry her." William was heart-broken. After eight months he eventually started dating girls again. A year later he came home and very proudly announced, "Dianne said yes! We\'re getting married in June." Again his father insisted on another private conversation and broke the sad news. "Dianne is your half-sister too, William. I\'m awfully sorry about this." William was furious! He finally decided to go to his mother with the news. "Dad has done so much harm.. I guess I\'m n

In [16]:
jokes[54]

'Two rednecks were seated at the end of a bar when a young lady seated a few stools up began to choke on a piece of hamburger. She was turning blue and obviously in serious respiratory distress. One said to the other, "That gal there is having a bad time!" The other agreed and said, "Think we should go help?" "You bet," said the first, and with that, he ran over and said, "Can you breathe?" She shook her head no. He said, "Can you speak?" She again shook her head no. With that, he pulled up her skirt and licked her on the butt. She was so shocked, she coughed up the obstruction and began to breathe--with great relief. The redneck walked back to his friend and said, "Funny how that hind lick maneuver always works."'

In [17]:
jokes[23]

'Out in the backwoods of some midwestern state, little Johnny arrives at school an hour late. Teacher: "Why are you so late, John?" Johny: "My big brother got shot in the ass." (The teacher corrects his speech.) Teacher: "Rectum." Johnny: "Wrecked him!? Hell, It damn near killed him!"'

In [18]:
jokes[26]

'A Jewish young man was seeing a psychiatrist for an eating and sleeping disorder. "I am so obsessed with my mother...As soon as I go to sleep, I start dreaming, and everyone in my dream turns into my mother. I wake up in such a state, and all I can do is go downstairs and eat a piece of toast." The psychiatrist replies, "What, just one piece of toast, for a big boy like you?"'

In [19]:
df = pd.merge(dl.train_data, dl.jokes, on="joke_id")

In [20]:
dff = df.loc[:, ["Rating", "joke_text", "joke_id", ]]

In [21]:
dff

Unnamed: 0,Rating,joke_text,joke_id
0,2.750,"Judy was having trouble with her computer, so ...",110
1,1.719,"Judy was having trouble with her computer, so ...",110
2,5.031,"Judy was having trouble with her computer, so ...",110
3,3.031,"Judy was having trouble with her computer, so ...",110
4,3.094,"Judy was having trouble with her computer, so ...",110
...,...,...,...
873642,-9.500,Q: Whats the difference between greeting a que...,90
873643,1.812,Q: Whats the difference between greeting a que...,90
873644,2.062,Q: Whats the difference between greeting a que...,90
873645,-0.344,Q: Whats the difference between greeting a que...,90


In [23]:
categorized = df
categorized['category'] = categorized.apply(lambda x: clusters[x['joke_id'] - 1], axis=1)
categorized['len'] = categorized.apply(lambda x: len(x['joke_text']), axis=1)

In [24]:
d = categorized[categorized['user_id'] == 1]

In [25]:
d.groupby(['user_id', 'category']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,joke_id,Rating,len
user_id,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,54.4,2.6824,476.68
1,4,4.5,-9.156,78.0
1,6,33.0,1.0,312.0
1,7,56.142857,3.388429,541.857143
1,8,79.0,9.812,784.0
1,9,67.5,8.781,443.5


In [26]:
d.groupby(['user_id', 'category']).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,joke_id,Rating,len
user_id,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,35.95715,6.65363,323.71576
1,4,3.535534,0.176777,4.242641
1,6,41.012193,11.003996,301.227489
1,7,42.369126,3.876982,320.46811
1,8,,,
1,9,58.689863,0.0,137.885822


In [27]:
rating = dl.train_data.groupby("joke_id").mean().loc[:, ["Rating"]]

In [28]:
rating

Unnamed: 0_level_0,Rating
joke_id,Unnamed: 1_level_1
1,-2.173608
2,-1.960930
3,-0.664389
4,-0.574157
5,-1.413184
...,...
135,2.401095
136,0.201342
137,1.842146
138,3.043138


In [29]:
categorized.groupby(['user_id', 'category']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,joke_id,Rating,len
user_id,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,54.400000,2.682400,476.680000
1,4,4.500000,-9.156000,78.000000
1,6,33.000000,1.000000,312.000000
1,7,56.142857,3.388429,541.857143
1,8,79.000000,9.812000,784.000000
...,...,...,...,...
40863,4,4.500000,-7.140500,78.000000
40863,6,51.500000,3.906250,277.250000
40863,7,65.500000,4.847100,443.500000
40863,8,82.666667,-2.875000,201.333333


In [30]:
categorized.groupby('category').std()

Unnamed: 0_level_0,user_id,joke_id,Rating,len
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,11803.456381,41.746552,5.131907,332.078047
1,12375.209499,0.0,5.869162,0.0
2,12549.608484,0.0,6.064632,0.0
3,12704.590254,0.0,5.356489,0.0
4,11853.484858,16.850393,5.454697,10.806002
5,12516.893936,21.441206,5.631019,14.621569
6,11918.571524,36.522655,5.380516,184.419164
7,11760.025473,48.036429,5.258915,269.962205
8,11901.44219,24.029536,4.97307,231.490671
9,11857.793271,41.766913,5.027087,224.848895


In [31]:
for i in range(len(jokes)):
    print(jokes[i], clusters[i])

Q. What's O. J. Simpson's web address? A. Slash, slash, backslash, slash, slash, escape. 0
How many feminists does it take to screw in a light bulb? That's not funny. 4
Q. Did you hear about the dyslexic devil worshiper? A. He sold his soul to Santa. 7
They asked the Japanese visitor if they have elections in his country. "Every morning," he answers. 6
Q: What did the blind person say when given some matzah? A: Who the hell wrote this? 7
Q. What is orange and sounds like a parrot? A. A carrot. 0
How many men does it take to screw in a light bulb? One. Men will screw anything. 4
A dog walks into Western Union and asks the clerk to send a telegram. He fills out a form on which he writes down the telegram he wishes to send: "Bow wow wow, bow wow wow." The clerk says, "You can add another 'Bow wow' for the same price." The dog responded, "Now wouldn't that sound a little silly?" 0
Q: If a person who speaks three languages is called "trilingual," and a person who speaks two languages is cal

In [32]:
from src.preprocessing import PreprocessingPipeline, PunctuationNode, StopWordsNode, LemmatizationNode

In [33]:
pipeline = PreprocessingPipeline()
pipeline.add(PunctuationNode())
pipeline.add(StopWordsNode('english'))
pipeline.add(LemmatizationNode())

In [34]:
users = {}

for record in categorized.groupby(['user_id', 'category']).mean().iterrows():
    r = record[1]
    
    if record[0][0] not in users:
        users[record[0][0]] = np.ones(10) * -11
    
    users[record[0][0]][record[0][1]] = r['Rating']


In [35]:
preference_array = np.array(list(users.values()))

In [37]:
test = dl.test_data

In [38]:
test['category'] = test.apply(lambda x: clusters[x['joke_id'] - 1], axis=1)

In [39]:
pop = dl.train_data.groupby('joke_id').mean()

In [43]:
test

Unnamed: 0,id,user_id,joke_id,Rating,category
995679,19883_56,19883,56,3.000,0
83758,17001_5,17001,5,5.312,7
912487,15834_2,15834,2,-0.188,4
770113,40224_103,40224,103,7.562,0
636339,1786_25,1786,25,-0.188,7
...,...,...,...,...,...
240204,17508_122,17508,122,1.156,0
391967,3447_2,3447,2,-0.812,4
721679,8349_97,8349,97,5.938,0
352505,33633_116,33633,116,8.406,0


In [44]:
import math

err = 0
err_pop = 0
for r in test[:100].iterrows():
    joke_id = r[1]['joke_id']
    user_id = r[1]['user_id']

    sm = 0
    wg = 0

    for record in categorized[categorized['joke_id'] == joke_id].iterrows():
        profile = users[record[1]['user_id']]

        sim = math.sqrt(np.sum(np.power(profile - users[user_id], 2)))

        sm += sim * ((record[1]['Rating'] + 10) / 20)
        wg += sim

    pred = ((sm / wg) * 20) - 10
    pred_pop = pop.loc[joke_id]['Rating']
    
    err += abs(pred - r[1]['Rating'])
    err_pop += abs(pred_pop - r[1]['Rating'])

In [45]:
err / 100

4.266428191001645

In [46]:
err_pop / 100

4.209007716932173