In [15]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from scipy import spatial
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/max/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

here we generate the dataframe, which keyword named as title and GT recommendation named as content.

We are using here just the NYT articles, as buy using articles from just one publisher we hope to get a better more consistant matching and less confusion, which is important, when building the pipeline

In [5]:
df = pd.read_csv('../comparison_dataset/articles1.csv')
df = df[df['publication'] == 'New York Times']
df = df.drop(columns = ['id', 'publication', 'author', 'date', 'year', 'month', 'url', 'Unnamed: 0'])
df

Unnamed: 0,title,content
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood..."
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t..."
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ..."
...,...,...
7798,U.N. Relief Official Calls Crisis in Aleppo th...,The top aid official at the United Nations gav...
7799,Federal Judge Curbs Enforcement of North Carol...,A federal judge on Friday curbed the enforceme...
7800,Mexicans Accuse President of ‘Historic Error’ ...,MEXICO CITY — If President Enrique Peña Nie...
7801,"U.S. Presses for Truce in Syria, With Its Larg...","HANGZHOU, China — The image of a Syrian ..."


now we try to find embedding methods, which in an ideal matching, or we could also just try to minimize distance between the embedded title and content

In [10]:
def embedding_by_doc_2_vec(df):
  data = df['content']
  tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
  # df['tagged_data'] = tagged_data
  max_epochs = 5
  vec_size = 20
  alpha = 0.025

  model = Doc2Vec(size=vec_size,
                  alpha=alpha, 
                  min_alpha=0.00025,
                  min_count=1,
                  dm =1)
    
  model.build_vocab(tagged_data)

  for epoch in range(max_epochs):
      print('iteration {0}'.format(epoch))
      model.train(tagged_data,
                  total_examples=model.corpus_count,
                  epochs=model.iter)
      # decrease the learning rate
      model.alpha -= 0.0002
      # fix the learning rate, no decay
      model.min_alpha = model.alpha
  print('here')
  print(model.docvecs['1'])
  return model

In [11]:
model_d2v = embedding_by_doc_2_vec(df)

iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
here
[-1.2421117   4.9880443  -5.4423003   5.246497    4.4367924   2.7066674
 -1.5880928   0.10194322  1.2945052  -0.476859    2.9589288   2.1537604
 -4.810111    0.6981496   2.4136384  -1.8792477   1.5653974   3.1918175
  1.7164583   1.3136255 ]


In [12]:
model_d2v.save("doc2vec.model")

In [14]:
def embedd_d2v(model, doc):
    return model.infer_vector(doc.split())

df['embed_title_d2v'] = df['title'].apply(lambda x: embedd_d2v(model_d2v, x))
df['embed_content_d2v'] = df['content'].apply(lambda x: embedd_d2v(model_d2v, x))
df

Unnamed: 0,title,content,embed_title,embed_title_d2v,embed_content_d2v
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,"[-0.0093585085, -0.021507362, 0.0061488478, 0....","[-0.0093585085, -0.021507362, 0.0061488478, 0....","[-0.8141588, 1.7652576, -1.0776699, 3.24824, 1..."
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...","[-0.032501455, 0.14153877, -0.15343542, 0.2488...","[-0.0073677874, 0.048639294, -0.09777665, 0.11...","[-0.16269964, 4.9958744, -4.323945, 4.6282096,..."
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...","[-0.045466833, 0.14027055, -0.27003285, 0.3213...","[-0.06263837, 0.004463125, -0.25444648, 0.2261...","[0.73779887, 2.3255024, -1.1746546, 2.1986437,..."
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...","[-0.03360937, 0.093950205, 0.000188482, 0.1365...","[0.023187965, 0.09655971, -0.14669798, 0.19396...","[1.4059936, -0.7528463, -0.4544508, 1.3375974,..."
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...","[-0.048443943, -0.0075524626, -0.01917088, 0.1...","[-0.041952282, -0.00042028353, -0.011524385, 0...","[1.358609, 0.8953577, -1.1506597, 0.68220687, ..."
...,...,...,...,...,...
7798,U.N. Relief Official Calls Crisis in Aleppo th...,The top aid official at the United Nations gav...,"[-0.0040729754, 0.06346059, -0.11238866, 0.093...","[-0.08016987, 0.07987044, -0.11769748, 0.16607...","[-0.60183525, 2.2357724, -1.6118, 1.7577279, 1..."
7799,Federal Judge Curbs Enforcement of North Carol...,A federal judge on Friday curbed the enforceme...,"[-0.015068225, -0.013702527, -0.022383027, -0....","[-0.0032212203, 0.007402448, -0.0582778, 0.024...","[-1.5910295, 1.7444589, -1.6425011, 3.146459, ..."
7800,Mexicans Accuse President of ‘Historic Error’ ...,MEXICO CITY — If President Enrique Peña Nie...,"[-0.0024897717, -0.002366119, -0.032554604, 0....","[-0.035053734, 0.12764448, -0.17784066, 0.1667...","[0.96068466, 1.5211244, -1.9774988, 2.3074777,..."
7801,"U.S. Presses for Truce in Syria, With Its Larg...","HANGZHOU, China — The image of a Syrian ...","[-0.029645665, 0.19971707, -0.17143646, 0.2518...","[-0.034036286, 0.03349768, -0.14917196, 0.1764...","[-0.0807275, 2.1167324, -0.7399275, 2.7920866,..."


In [17]:
def cosine_similarity(df):
    return 1 - spatial.distance.cosine(df[0], df[1])

In [18]:
df['d2v_similarity'] = df[['embed_title_d2v', 'embed_content_d2v']].apply(cosine_similarity, axis = 1)
df['d2v_similarity'].mean()

0.37736899936229895