In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize

In [2]:
df=pd.read_csv("tweets.csv",index_col=0)
df=df.drop(columns=["id","retweet","link"])
df

Unnamed: 0,date,text,author
0,Oct 7,Here is my statement.pic.twitter.com/WAZiGoQqMQ,DonaldTrump
1,Oct 10,Is this really America? Terrible!pic.twitter.c...,DonaldTrump
2,Oct 8,The media and establishment want me out of the...,DonaldTrump
3,Oct 8,Certainly has been an interesting 24 hours!,DonaldTrump
4,Oct 10,Debate polls look great - thank you!\n#MAGA #A...,DonaldTrump
...,...,...,...
17211,12 May 2009,"""My persona will never be that of a wallflower...",DonaldTrump
17212,8 May 2009,New Blog Post: Celebrity Apprentice Finale and...,DonaldTrump
17213,8 May 2009,Donald Trump reads Top Ten Financial Tips on L...,DonaldTrump
17214,4 May 2009,Donald Trump will be appearing on The View tom...,DonaldTrump


In [3]:
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))

def traitement(phrase):
    phrase=phrase.lower()
    texte2=(str(phrase).replace(',',' ')).lower()
    texte2=texte2.replace("'",' ')
    texte2=texte2.replace("-",' ')
    texte2=texte2.replace(".",' ')
    texte2=texte2.replace("*",' ')
    texte2=texte2.replace("\\",' ')
    texte2=texte2.replace("\n",' ')
    texte2=texte2.replace("/",' ')
    texte2=texte2.replace("!",' ')
    texte2=texte2.replace("?",' ')
    texte2=texte2.replace("\"",' ')
    texte2=texte2.replace("'",' ')
    texte2=texte2.replace("\#",' ')
    texte2=texte2.split()
    texte2=[token for token in texte2 if len(token) and token.lower() not in stopwords]
    return ' '.join(texte2)


In [4]:
df['textmodif']=df['text'].map(traitement)
df

Unnamed: 0,date,text,author,textmodif
0,Oct 7,Here is my statement.pic.twitter.com/WAZiGoQqMQ,DonaldTrump,statement pic twitter com wazigoqqmq
1,Oct 10,Is this really America? Terrible!pic.twitter.c...,DonaldTrump,really america terrible pic twitter com wiwc61...
2,Oct 8,The media and establishment want me out of the...,DonaldTrump,media establishment want race badly never drop...
3,Oct 8,Certainly has been an interesting 24 hours!,DonaldTrump,certainly interesting 24 hours
4,Oct 10,Debate polls look great - thank you!\n#MAGA #A...,DonaldTrump,debate polls look great thank #maga #americafi...
...,...,...,...,...
17211,12 May 2009,"""My persona will never be that of a wallflower...",DonaldTrump,persona never wallflower i’d rather build wall...
17212,8 May 2009,New Blog Post: Celebrity Apprentice Finale and...,DonaldTrump,new blog post: celebrity apprentice finale les...
17213,8 May 2009,Donald Trump reads Top Ten Financial Tips on L...,DonaldTrump,donald trump reads top ten financial tips late...
17214,4 May 2009,Donald Trump will be appearing on The View tom...,DonaldTrump,donald trump appearing view tomorrow morning d...


In [5]:
s = 200
alpha = 0.2


t_data = [TaggedDocument(words = word_tokenize(_d.lower()),tags = [str(i)]) for i,_d in enumerate(df['textmodif'])]
model = Doc2Vec(t_data,
                       vector_size = s,
                      alpha = alpha,
                      min_alpha = 0.0002,
                      min_count = 3,
                      dm = 1)
model.save('tweetmodel.model')

In [6]:
model=Doc2Vec.load('tweetmodel.model')
model.wv.get_vector('maga')

array([ 0.02989734, -0.17371747, -0.40268847,  0.24184512,  0.63705283,
       -1.4572862 , -0.6955053 , -0.44727808, -0.3739715 , -0.81122893,
        0.58594024, -0.8812702 , -0.3424612 ,  0.521235  ,  0.1273385 ,
       -0.28626457, -0.3297589 , -0.8762249 ,  0.21804239, -0.22836114,
        0.21468113, -0.37584338,  0.8460227 , -0.09754482,  0.24576154,
       -0.4796889 ,  0.25968164,  0.99666977,  0.00808955,  0.39554584,
       -0.06230652, -0.08484923,  0.04447736,  0.6485679 ,  0.36047202,
       -0.40060484, -0.0997917 ,  0.66346884, -0.46366298,  0.01487875,
       -0.13512166,  0.04205681,  0.23008803,  0.35610843,  0.24118853,
       -0.02216247, -0.40040267,  0.6510028 , -0.11912849, -0.5034711 ,
       -0.18093348, -1.1791862 ,  0.29843336, -0.01681757,  0.12606908,
       -0.2944003 ,  0.40580758,  0.45082483, -0.20505989,  0.5268697 ,
       -0.298173  , -0.9570813 , -1.1849259 ,  0.3926795 , -0.00435319,
       -0.33998868,  0.09002861,  0.8623706 ,  0.03376999, -0.07

In [7]:
len(model.wv.get_vector('one'))

200

In [8]:
def norme(vec):
    return np.sqrt(np.sum(vec*vec))
def prediction(vec1,vec2):
    val=np.sum((vec1*vec2))
    val/=(norme(vec1)*norme(vec2))
    if val>1:
        return np.arccos(1)#because of the structure of floats it can appen that the result is almost 1 but superior
    else:
        return np.arccos(val)
        
def taille(vec1,vec2):
    return abs(norme(vec1)-norme(vec2))

In [9]:
def vectorization(word):
    T=word.split(' ')
    vector=sum([vec(i) for i in T])
    return vector

def vec(mot):
    try:
        x=model.wv.get_vector(mot)
        return x
    except:
        return np.array([0]*s,dtype=np.float32)

def similarity(word1,word2):
    w1,w2=traitement(word1),traitement(word2)
    v1,v2=vectorization(w1),vectorization(w2)
    #print(v1,v2)
    return prediction(v1,v2)

In [10]:
similarity("France s",df['text'][10000])

1.6123903

In [11]:
def get_20_best(phrase):
    res={tweet:similarity(phrase,tweet) for tweet in df['text']}
    res=sorted(res, key= lambda A: res[A])
    return res[:20]

In [12]:
get_20_best("Build a wall !!!")

  """
  


['"@Retrogirl01: @AP Mr. Trump is right! BUILD THE WALL! @realDonaldTrump"',
 'We will stop heroin and other drugs from coming into New Hampshire from our open southern border. We will build a WALL and have security.',
 "Obama says a WALL at our southern border won't enhance our security (wrong) and yet he now wants to build a much bigger wall (fence) at W.H.",
 'Jeb Bush just talked about my border proposal to build a "fence." It\'s not a fence, Jeb, it\'s a WALL, and there\'s a BIG difference!',
 'Hillary will never reform Wall Street. She is owned by Wall Street!',
 '.@AnnCoulter has been amazing. We will win and establish strong borders, we will build a WALL and Mexico will pay. We will be great again!',
 'Mexico will pay for the wall!',
 'When will the Democrats, and Hillary in particular, say "we must build a wall, a great wall, and Mexico is going to pay for it?" Never!',
 "BREAKING - Border security rally in Phoenix, AZ at 2PM MST has been moved to @PhoenixConvCtr! Build a wall

In [14]:

dico={i:model.wv.get_vector(i) for i in model.wv.vocab}
dico

{'statement': array([ 3.20252359e-01, -2.51103491e-01,  2.29658648e-01, -1.05296886e+00,
         5.27975619e-01,  2.84988374e-01,  1.29523516e+00, -7.73346424e-01,
        -9.71662402e-02,  3.55340213e-01, -4.05815363e-01, -1.27917260e-01,
        -9.04865388e-04,  9.66802418e-01,  9.00478363e-01, -6.20631874e-01,
         8.39243382e-02, -1.40543938e-01, -2.88261503e-01, -7.51494057e-03,
        -3.92684102e-01,  6.04748964e-01,  1.33083120e-01, -1.95456550e-01,
        -6.32018819e-02, -7.85924315e-01, -1.29516408e-01, -8.78411055e-01,
         3.60575676e-01, -4.77904409e-01,  1.08067596e+00,  6.17702067e-01,
        -6.76711798e-01,  5.01320124e-01,  5.66176355e-01,  5.15600443e-01,
        -2.19182044e-01, -2.26469919e-01,  1.28516123e-01, -5.27665854e-01,
        -1.66790605e-01, -6.11651957e-01,  6.02435410e-01,  3.52260888e-01,
         2.63126582e-01,  1.89821076e+00, -2.75409669e-01, -2.24907368e-01,
         8.41750026e-01,  4.49362189e-01,  2.82219887e-01, -4.60865289e-01,

In [16]:
def vectorization2(word):
    T=word.split(' ')
    vector=sum([vec2(i) for i in T])
    return vector

def vec2(mot):
    try:
        x=dico[mot]
        return x
    except:
        return np.array([0]*s,dtype=np.float32)

def similarity2(word1,word2):
    w1,w2=traitement(word1),traitement(word2)
    v1,v2=vectorization2(w1),vectorization(w2)
    #print(v1,v2)
    return prediction(v1,v2)

In [17]:
def get_20_best2(phrase):
    res={tweet:similarity2(phrase,tweet) for tweet in df['text']}
    res=sorted(res, key= lambda A: res[A])
    return res[:20]

In [18]:
get_20_best2("Build a wall !!!")

  """
  


['"@Retrogirl01: @AP Mr. Trump is right! BUILD THE WALL! @realDonaldTrump"',
 'We will stop heroin and other drugs from coming into New Hampshire from our open southern border. We will build a WALL and have security.',
 "Obama says a WALL at our southern border won't enhance our security (wrong) and yet he now wants to build a much bigger wall (fence) at W.H.",
 'Jeb Bush just talked about my border proposal to build a "fence." It\'s not a fence, Jeb, it\'s a WALL, and there\'s a BIG difference!',
 'Hillary will never reform Wall Street. She is owned by Wall Street!',
 '.@AnnCoulter has been amazing. We will win and establish strong borders, we will build a WALL and Mexico will pay. We will be great again!',
 'Mexico will pay for the wall!',
 'When will the Democrats, and Hillary in particular, say "we must build a wall, a great wall, and Mexico is going to pay for it?" Never!',
 "BREAKING - Border security rally in Phoenix, AZ at 2PM MST has been moved to @PhoenixConvCtr! Build a wall