#### Imports

In [None]:
%pip install stanza

import pandas as pd
import stanza, pickle
import statistics
from collections import Counter
import string

! pip install textstat
import textstat

#### Read data files

In [None]:
# Load the train/test splits of both languages; row 0 of each CSV is the header.
eng_train = pd.read_csv("eng/train/train.csv", header=0)
eng_test = pd.read_csv("eng/test/test.csv", header=0)

tur_train = pd.read_csv("tur/train/train.csv", header=0)
tur_test = pd.read_csv("tur/test/test.csv", header=0)

# Peek at the first five rows of the English training split.
print(eng_train.head(5))

Column "Time" should have been "Author"! Also let's change the name for "Publication Date" to "Time" as well:

In [None]:
# Column name error fix
# The same two headers are wrong in every split, so one mapping is applied to all.
column_fixes = {"Time": "Author", "Publication Date": "Time"}

eng_train1 = eng_train.rename(columns=column_fixes)
eng_test1 = eng_test.rename(columns=column_fixes)

tur_train1 = tur_train.rename(columns=column_fixes)
tur_test1 = tur_test.rename(columns=column_fixes)

eng_train1.head() # fixed!

In [None]:
tur_train1.head() # fixed!

In [None]:
print("Shape of English article dataset:", eng_train1.shape)

In [None]:
print("Shape of Turkish article dataset:", tur_train1.shape)

#### Basic Statistics

In [None]:
eng_train1.describe()

In [None]:
tur_train1.describe()

#### Own Statistics

In [None]:
def mean_str_data(df, col_name):
    """Return the mean length of the values in ``df[col_name]``, formatted as a string.

    Every value must support ``len()``; an empty column raises ZeroDivisionError.
    """
    total_chars = 0
    n_values = 0
    for value in df[col_name]:
        total_chars += len(value)
        n_values += 1
    return str(total_chars / n_values)


def max_str_data(df, col_name):
    """Return the length of the longest value in ``df[col_name]``, formatted as a string.

    Every value must support ``len()``; an empty column raises ValueError.
    """
    longest = max(len(value) for value in df[col_name])
    return str(longest)


def min_str_data(df, col_name):
    """Return the length of the shortest value in ``df[col_name]``, formatted as a string.

    Every value must support ``len()``; an empty column raises ValueError.
    """
    shortest = min(len(value) for value in df[col_name])
    return str(shortest)


print("Mean length of English titles: " + mean_str_data(eng_train1, "Title"))
print("Mean length of Turkish titles: " + mean_str_data(tur_train1, "Title"))
print("------------------------------------")
print("Length of longest English title: " + max_str_data(eng_train1, "Title"))
print("Length of longest Turkish title: " + max_str_data(tur_train1, "Title"))
print("------------------------------------")
print("Length of shortest English title: " + min_str_data(eng_train1, "Title"))
print("Length of shortest Turkish title: " + min_str_data(tur_train1, "Title"))

#### Finding missing (NaN) values

In [None]:
eng_train1.isnull().sum() 

All missing values are in column "Author".

In [None]:
tur_train1.isnull().sum() 

Almost all missing values are in the "Author" column. A couple of articles (the "Text" column) are missing from the dataset as well.

In [None]:
# Remove the "Author" column from every split.
eng_train_df = eng_train1.drop(columns=["Author"])
eng_test_df = eng_test1.drop(columns=["Author"])

tur_train_df1 = tur_train1.drop(columns=["Author"])
tur_test_df1 = tur_test1.drop(columns=["Author"])

For both datasets, the "Author" column is missing more than 50% of its values, so it is best to drop this variable altogether.

In [None]:
eng_train_df.isnull().sum() 

In [None]:
eng_test_df.isnull().sum() 

Rows with empty articles in Turkish dataset will also be deleted - from my perspective, they would be useless for linguistic analysis.

In [None]:
tur_train_df = tur_train_df1.dropna()
tur_test_df = tur_test_df1.dropna()

In [None]:
tur_train_df.isnull().sum()

In [None]:
tur_test_df.isnull().sum()

#### Basic Statistics with Stanza

In [None]:
# Prepare the nlp pipeline
stanza.download('en')
stanza.download('tr')

eng_nlp = stanza.Pipeline('en')
tur_nlp = stanza.Pipeline('tr')

#### Extract token frequencies info

In [None]:
def token_freqs_to_pickle(df, nlp_pipeline, frequency_file):
    """Count token frequencies over all articles and pickle the resulting Counter.

    Args:
        df: DataFrame with a "Text" column holding one article per row.
        nlp_pipeline: Callable that processes an article and returns an object
            whose ``sentences`` each expose ``tokens`` with a ``text`` attribute.
        frequency_file: Path of the pickle file to write.
    """
    # One counter for the whole corpus. It has to live outside the article
    # loop: re-creating it per article would leave only the last article's
    # counts in the saved file.
    token_frequencies = Counter()

    # Iterate through all articles
    for article in df["Text"].values.tolist():
        # Process the article with the pipeline
        processed_article = nlp_pipeline(article)

        # Add the tokens of every sentence to the counter
        for sentence in processed_article.sentences:
            token_frequencies.update(token.text for token in sentence.tokens)

    # Save the token frequencies as a pickle file; `with` closes the handle
    with open(frequency_file, "wb") as outfile:
        pickle.dump(token_frequencies, outfile)

In [None]:
en_path = 'en_tokenfrequencies.pkl'
tr_path = 'tr_tokenfrequencies.pkl'

token_freqs_to_pickle(eng_train_df, eng_nlp, en_path)
token_freqs_to_pickle(tur_train_df, tur_nlp, tr_path)

#### Get processed articles

In [None]:
def articles_to_pickle(df, nlp_pipeline, frequency_file, keep_all=False):
    """Process articles from ``df["Text"]`` with the pipeline and pickle the output.

    Args:
        df: DataFrame with a "Text" column holding one article per row.
        nlp_pipeline: Callable that turns an article text into a processed document.
        frequency_file: Path of the pickle file to write. The parameter name is
            historical; the file holds processed documents, not frequencies.
        keep_all: If False (default), the pickle holds a single document: the
            processed *last* article. If True, it holds a list with one
            processed document per article, in row order.

    Raises:
        ValueError: If ``df`` contains no articles.
    """
    articles = df["Text"].values.tolist()
    if not articles:
        raise ValueError("df contains no articles to process")

    if keep_all:
        payload = [nlp_pipeline(article) for article in articles]
    else:
        # Only the last document ends up in the file, so the other articles
        # are not run through the (expensive) pipeline at all.
        payload = nlp_pipeline(articles[-1])

    # Save the processed output as a pickle file; `with` closes the handle
    with open(frequency_file, "wb") as outfile:
        pickle.dump(payload, outfile)

In [None]:
en_path2 = 'en_processedarticles.pkl'
tr_path2 = 'tr_processedarticles.pkl'

articles_to_pickle(eng_train_df, eng_nlp, en_path2)
articles_to_pickle(tur_train_df, tur_nlp, tr_path2)

#### Linguistic Units / Quality of the Processed Articles

In [None]:
# Load the processed sample documents from disk. `with` closes each file handle.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary code.
with open("en_processedarticles.pkl", "rb") as infile:
    en_nlp_output = pickle.load(infile)
#print(en_nlp_output)
with open("tr_processedarticles.pkl", "rb") as infile:
    tr_nlp_output = pickle.load(infile)

In [None]:
# Only check first 20 sentences
for i, sentence in enumerate(en_nlp_output.sentences[:20]):
    print(str(i), sentence.text)
    # To check lemmatization, base words are excluded
    changed_words = [word for word in sentence.words if word.text != word.lemma]
    for word in changed_words:
        print(word.id, word.text, word.lemma)
    print()

Lemmatization is almost entirely correct for this particular sample (there is one recurring error when lemmatizing words ending in -ions, e.g. "characterizations", "perceptions" and "reparations"). However, lemmatization of punctuation seems unnecessary and should be handled.

In [None]:
# Only check first 20 sentences
for i, sentence in enumerate(tr_nlp_output.sentences[:20]):
    print(str(i), sentence.text)
    # Words whose lemma equals the surface form are not shown
    changed_words = [word for word in sentence.words if word.text != word.lemma]
    for word in changed_words:
        print(word.id, word.text, word.lemma)
    print()

Turkish lemmatization seems to be more error-prone than English lemmatization. Moreover, the errors have varying causes.

#### Check POS-tags

In [None]:
def check_POS_tags(nlp_output):
    """Print the 50 most frequent (lemma, POS tag) pairs of a processed document.

    ``nlp_output`` must expose ``sentences`` whose ``words`` carry ``lemma``
    and ``pos`` attributes.
    """
    token_pos_frequencies = Counter(
        (word.lemma, word.pos)
        for sentence in nlp_output.sentences
        for word in sentence.words
    )
    print(token_pos_frequencies.most_common(50))


check_POS_tags(en_nlp_output)

In [None]:
check_POS_tags(tr_nlp_output)

For both datasets, the most frequent tokens are punctuation marks! It is best to remove them, as they offer limited linguistic information.

#### Check the effect of stopwords

In [None]:
en_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", 
                "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
                "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", 
                "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", 
                "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", 
                "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", 
                "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
tr_stopwords = ["acaba","ama","aslında","az","bazı","belki","biri","birkaç","birşey","biz","bu","çok","çünkü","da","daha","de","defa","diye","eğer","en","gibi","hem",
                "hep","hepsi","her","hiç","için","ile","ise","kez","ki","kim","mı","mu","mü","nasıl","ne","neden","nerde","nerede","nereye","niçin","niye","o","sanki",
                "şey","siz","şu","tüm","ve","veya","ya","yani"]


def calculate_token_frequencies(nlp_output,stopwords,ignore_stopwords=False):
    """Count the token surface forms of a processed document.

    Args:
        nlp_output: Object whose ``sentences`` expose ``tokens`` with a ``text`` attribute.
        stopwords: Collection of tokens to leave out when ``ignore_stopwords`` is set.
            The comparison is exact, so it is case-sensitive.
        ignore_stopwords: Skip tokens found in ``stopwords`` when True.

    Returns:
        Counter mapping token text to its number of occurrences.
    """
    token_frequencies = Counter()
    for sentence in nlp_output.sentences:
        for token in sentence.tokens:
            if ignore_stopwords and token.text in stopwords:
                continue
            token_frequencies[token.text] += 1
    return token_frequencies

en_token_frequencies = calculate_token_frequencies(en_nlp_output,en_stopwords,ignore_stopwords=True)
print(en_token_frequencies.most_common(20))

In [None]:
tr_token_frequencies = calculate_token_frequencies(tr_nlp_output,tr_stopwords,ignore_stopwords=True)
print(tr_token_frequencies.most_common(20))

We have more linguistic info regarding the data. But the effect of keeping the punctuation and uppercased words is evident!

#### Normalization

In [None]:
# Load the corpus-level token counters from disk. `with` closes each file handle.
with open("en_tokenfrequencies.pkl", "rb") as infile:
    en_frequencies_dataset = pickle.load(infile)
with open("tr_tokenfrequencies.pkl", "rb") as infile:
    tr_frequencies_dataset = pickle.load(infile)

def print_normalized_freqs(frequencies_dataset, stopwords):
    """Print the 100 largest relative frequencies of non-stopword, non-punctuation tokens.

    Args:
        frequencies_dataset: Mapping of token -> absolute count (e.g. a Counter).
        stopwords: Collection of tokens to exclude (exact, case-sensitive match).
    """
    # Normalize by the total number of tokens (including those filtered out
    # below). Dividing a token's count by that same count always gives 1.0.
    total_tokens = sum(frequencies_dataset.values())

    normalized_frequencies = Counter()
    for token, freq in frequencies_dataset.items():
        # Remove stopwords and punctuation. A token is punctuation when all of
        # its characters are punctuation marks; this also catches
        # multi-character tokens such as "..." that a substring test misses.
        is_punctuation = all(char in string.punctuation for char in token)
        if token not in stopwords and not is_punctuation:
            normalized_frequencies[token] = freq / total_tokens

    print(normalized_frequencies.most_common(100))

print_normalized_freqs(en_frequencies_dataset, en_stopwords)

In [None]:
print_normalized_freqs(tr_frequencies_dataset, tr_stopwords)

It appears that some stopwords might have been skipped due to being uppercased. Hence it might be beneficial to lowercase the texts.

#### Dependency Parsing

In [None]:
def print_dep_pars(content):
    """Print the dependency relations of the first two sentences of a processed document.

    One line is printed per word: id, text, head id, head text and relation.
    """
    for sentence in content.sentences[:2]:
        print("id", "token", "head id", "head token", "dependency relation")
        for word in sentence.words:
            # word.head is only an id: 0 marks the root, any other value is
            # the 1-based position of the head word within the sentence.
            head_token = "root" if word.head == 0 else sentence.words[word.head - 1].text
            # Output the dependency relation
            print(word.id, word.text, word.head, head_token, word.deprel)

print_dep_pars(en_nlp_output)

In [None]:
print_dep_pars(tr_nlp_output)

#### Extracting stylistic features

In [None]:
def proc_articles(df, language):
    """Process every article in ``df["Text"]`` with a stanza pipeline.

    Args:
        df: Table with a "Text" column holding one article per row.
        language: Language code handed to ``stanza.Pipeline``.

    Returns:
        List with one processed document per article, in row order.
    """
    nlp = stanza.Pipeline(language, processors='tokenize,pos,lemma')
    return [nlp.process(article) for article in df["Text"]]

In [None]:
eng_proc_arts = proc_articles(eng_train_df, 'en')

In [None]:
tur_proc_arts = proc_articles(tur_train_df, 'tr')

In [None]:
import en_core_web_sm

def proc_articles_spacy(df):
    """Process every article in ``df["Text"]`` with the ``en_core_web_sm`` spaCy model.

    Returns:
        List with one processed document per article, in row order.
    """
    nlp = en_core_web_sm.load()
    return [nlp(article) for article in df["Text"]]

In [None]:
en_spacy_arts = proc_articles_spacy(eng_train_df)

In [None]:
import spacy

def tr_proc_articles_spacy(df):
    """Process every article in ``df["Text"]`` with the ``tr_core_news_trf`` spaCy model.

    Returns:
        List with one processed document per article, in row order.
    """
    nlp = spacy.load("tr_core_news_trf")
    return [nlp(article) for article in df["Text"]]

In [None]:
# Turkish articles must go through the Turkish helper; proc_articles_spacy
# loads the English model.
tr_spacy_arts = tr_proc_articles_spacy(tur_train_df)

In [None]:
import lftk

# en_features_list= ['t_word','t_stopword','t_uword','t_sent','a_syll_ps','a_word_ps','a_stopword_ps','fkre','fkgl','rt_fast','rt_average','rt_slow']

# Per-article feature lists, filled in the order of the processed articles.
ttr = []
word_count=[]
sent_count = []
avg_sentence_len = []
fkre = []

for article in eng_proc_arts:
    # Token count per sentence; everything below derives from these tokens.
    sentence_lengths = [len(sentence.tokens) for sentence in article.sentences]
    distinct_tokens = {token.text for sentence in article.sentences for token in sentence.tokens}
    num_tokens = sum(sentence_lengths)

    # Type-token ratio: distinct token texts divided by all tokens
    ttr.append(len(distinct_tokens) / float(num_tokens))

    # Number of words (tokens, punctuation included) in the text
    word_count.append(num_tokens)

    # Number of sentences
    sent_count.append(len(article.sentences))

    # Average sentence length in tokens
    avg_sentence_len.append(statistics.mean(sentence_lengths))


# The readability score is extracted from the spaCy documents with LFTK.
for spacy_doc in en_spacy_arts:
    extractor = lftk.Extractor(docs = spacy_doc)
    extractor.customize(stop_words=True, punctuations=False, round_decimal=3)
    fkre.extend(extractor.extract(features = ['fkre']).values())


# Add the information to the data frame
eng_train_df["Type-Token Ratio"] = ttr
eng_train_df["Word Count"] = word_count
eng_train_df["Sentence Count"] = sent_count
eng_train_df["Avg Sentence Length"] = avg_sentence_len
eng_train_df["Flesch-Kincaid Reading Ease"] = fkre

eng_train_df.to_csv("en_stylistic_features.csv")

In [None]:
import lftk

# Per-article feature lists, filled in the order of the processed articles.
ttr = []
avg_sentence_len = []
avg_num_words = []
word_count = []
sent_count = []

for article in tur_proc_arts:
    # Token count per sentence; everything below derives from these tokens.
    sentence_lengths = [len(sentence.tokens) for sentence in article.sentences]
    distinct_tokens = {token.text for sentence in article.sentences for token in sentence.tokens}
    num_tokens = sum(sentence_lengths)

    # Type-token ratio: distinct token texts divided by all tokens
    ttr.append(len(distinct_tokens) / float(num_tokens))

    # Number of words (tokens, punctuation included) in the text
    word_count.append(num_tokens)

    # Number of sentences
    sent_count.append(len(article.sentences))

    # Average sentence length in tokens
    avg_sentence_len.append(statistics.mean(sentence_lengths))


# Average words per sentence is extracted from the spaCy documents with LFTK.
for spacy_doc in tr_spacy_arts:
    extractor = lftk.Extractor(docs = spacy_doc)
    extractor.customize(stop_words=True, punctuations=False, round_decimal=3)
    avg_num_words.extend(extractor.extract(features = ['a_word_ps']).values())

# Add the information to the data frame
tur_train_df["Type-Token Ratio"] = ttr
tur_train_df["Word Count"] = word_count
tur_train_df["Sentence Count"] = sent_count
tur_train_df["Avg Sentence Length"] = avg_sentence_len
tur_train_df["Avg Num of Words Per Sent"] = avg_num_words

tur_train_df.to_csv("tr_stylistic_features.csv")

Readability check is not working: will be fixed before final submission!

#### Plotting the data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

# We transform the time stamps into a categorical value
# NOTE(review): only values starting with "0" become "am". If "Time" holds
# 24-hour clock strings, 10:xx and 11:xx are labelled "pm" -- confirm the format.
time = ["am" if t.startswith("0") else "pm" for t in eng_train_df["Time"] ]
# Show how many articles fall in each category instead of dumping one label per article.
print(Counter(time))
eng_train_df["Time Category"] = time


# `data` is passed by keyword: some seaborn versions reject it as a positional argument.
sb.lmplot(data=eng_train_df, x="Avg Sentence Length", y="Type-Token Ratio", hue="Country", col="Time Category", fit_reg = False )
plt.show()

More countries appear to have published English articles in the later hours.

### Tf-Idf

In [None]:
# Lemmatization pipelines for the tf-idf tokenizers, one per language.
en_nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma')
# The Turkish pipeline needs the Turkish models ('tr'), not the English ones.
tr_nlp = stanza.Pipeline('tr', processors='tokenize,pos,lemma')

def eng_preprocess(article):
    """Turn an English article into a list of lowercased content lemmas.

    The article is processed with the module-level ``en_nlp`` pipeline. Words
    without a lemma, stopwords (``en_stopwords``) and punctuation are dropped.

    Args:
        article: Raw article text.

    Returns:
        List of lemma strings in document order.
    """
    stopwords = set(en_stopwords)  # set membership instead of scanning the list per lemma
    processed_article = en_nlp.process(article)
    all_lemmas = []
    for s in processed_article.sentences:
        # Skip sentences that consist only of whitespace
        if not s.text.strip():
            continue
        for word in s.words:
            if word.lemma is None:
                continue
            lemma = word.lemma.lower()
            # A lemma is punctuation when all of its characters are punctuation
            # marks. A substring test against string.punctuation would let
            # tokens such as "..." through.
            if lemma in stopwords or all(char in string.punctuation for char in lemma):
                continue
            all_lemmas.append(lemma)
    return all_lemmas

def tur_preprocess(article):
    """Turn a Turkish article into a list of lowercased content lemmas.

    The article is processed with the module-level ``tr_nlp`` pipeline. Words
    without a lemma, stopwords (``tr_stopwords``) and punctuation are dropped.

    Args:
        article: Raw article text.

    Returns:
        List of lemma strings in document order.
    """
    stopwords = set(tr_stopwords)  # set membership instead of scanning the list per lemma
    processed_article = tr_nlp.process(article)
    all_lemmas = []
    for s in processed_article.sentences:
        # Skip sentences that consist only of whitespace
        if not s.text.strip():
            continue
        for word in s.words:
            if word.lemma is None:
                continue
            # str.lower() is not Turkish-aware: "I" would become "i" and "İ"
            # would become "i" plus a combining dot. Map both letters by hand
            # first so lemmas can match the lowercase stopword list.
            lemma = word.lemma.replace("İ", "i").replace("I", "ı").lower()
            # A lemma is punctuation when all of its characters are punctuation
            # marks. A substring test against string.punctuation would let
            # tokens such as "..." through.
            if lemma in stopwords or all(char in string.punctuation for char in lemma):
                continue
            all_lemmas.append(lemma)
    return all_lemmas

# Read data files
# NOTE: this cell rebinds eng_train_df / eng_test_df / tur_train_df / tur_test_df,
# replacing the frames that earlier cells built under the same names. The raw
# CSVs are read again here, so the column renames and added feature columns
# from earlier cells are not present on these frames.
# keep_default_na=False is passed so that empty cells are not parsed as NaN;
# the length filter below then removes rows whose "Text" is empty.
eng_train_csv = "eng/train/train.csv"
eng_train_df = pd.read_csv(eng_train_csv, keep_default_na=False, header=0)

eng_test_csv = "eng/test/test.csv"
eng_test_df = pd.read_csv(eng_test_csv, keep_default_na=False, header=0)

tur_train_csv = "tur/train/train.csv"
tur_train_df = pd.read_csv(tur_train_csv, keep_default_na=False, header=0)

tur_test_csv = "tur/test/test.csv"
tur_test_df = pd.read_csv(tur_test_csv, keep_default_na=False, header=0)

# Filter out empty articles
# Only the *_filter / *_articles variables are filtered; the *_df frames
# themselves still contain the rows with empty "Text".
eng_train_filter = eng_train_df[eng_train_df["Text"].str.len() >0 ]
eng_train_articles = eng_train_filter["Text"]

eng_test_filter = eng_test_df[eng_test_df["Text"].str.len() >0 ]
eng_test_articles = eng_test_filter["Text"]

tur_train_filter = tur_train_df[tur_train_df["Text"].str.len() >0 ]
tur_train_articles = tur_train_filter["Text"]

tur_test_filter = tur_test_df[tur_test_df["Text"].str.len() >0 ]
tur_test_articles = tur_test_filter["Text"]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# You can play around with the ngram range
eng_vectorizer = TfidfVectorizer(use_idf=True, tokenizer=eng_preprocess)
eng_train_tf_idf = eng_vectorizer.fit_transform(eng_train_articles)
eng_train_all_terms = eng_vectorizer.get_feature_names_out()

# The test set is only transformed. Re-fitting on it would replace the
# vocabulary and idf weights learned from the training data, so train and test
# matrices would no longer share the same columns.
eng_test_tf_idf = eng_vectorizer.transform(eng_test_articles)
# Same vocabulary as the training terms; kept under its own name for later use.
eng_test_all_terms = eng_vectorizer.get_feature_names_out()

In [None]:
tur_vectorizer = TfidfVectorizer(use_idf=True, tokenizer=tur_preprocess)
tur_train_tf_idf = tur_vectorizer.fit_transform(tur_train_articles)
# The Turkish terms come from the Turkish vectorizer, not the English one.
tur_train_all_terms = tur_vectorizer.get_feature_names_out()

# Only transform the test set with the vocabulary fitted on the training data,
# and store the result under its own name so the English test matrix
# (eng_test_tf_idf) is not overwritten.
tur_test_tf_idf = tur_vectorizer.transform(tur_test_articles)
tur_test_all_terms = tur_vectorizer.get_feature_names_out()

### Word Vectors

##### Preprocessing

In [None]:
import re

# Named Entity Recognition is not needed for lemmatization and stopword
# removal, so it is disabled for speed.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

def cleaning(doc):
    """Lemmatize a spaCy Doc and drop its stopwords.

    Args:
        doc: Iterable of tokens exposing ``is_stop`` and ``lemma_`` (a spaCy Doc).

    Returns:
        The remaining lemmas joined by single spaces, or None when fewer than
        three lemmas are left.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    # Word2Vec uses context words to learn the vector representation of a
    # target word; a text of only one or two words adds very little to the
    # training, so it is reported as None.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)
    
# Lazily strip URLs, bracketed text, HTML tags, words containing digits and
# punctuation from every article, then lowercase it.
brief_cleaning = (re.sub(r'https?://\S+|www\.\S+|\[.*?\]|<.*?>+|\w*\d\w*|[{}]'.format(re.escape(string.punctuation)), ' ', str(row)).lower() for row in eng_train_df['Text'])

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

# cleaning() returns None for texts with fewer than three remaining lemmas.
# Those rows are dropped here, otherwise calling .split() on the column fails.
df_clean = pd.DataFrame({'clean': txt}).dropna()
df_clean.shape

In [None]:
import nltk
import zeyrek
nltk.download('stopwords')

from nltk.corpus import stopwords
# Turkish stopword list shipped with NLTK (needs the 'stopwords' download above)
stop_words = stopwords.words('turkish')

# zeyrek morphological analyzer (used by stemm_text below)
analyzer = zeyrek.MorphAnalyzer()

# NOTE(review): this is an *English* Snowball stemmer, while the text cleaned
# in this section is Turkish -- confirm whether it should be used at all.
stemmer = nltk.SnowballStemmer("english")

# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.

    The input is converted with ``str()`` first, so non-string values are accepted.
    Patterns are raw strings so that regex escapes such as ``\\[`` or ``\\d`` are
    not treated as (invalid) Python string escapes.
    '''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def stemm_text(text):
    """Replace every space-separated word of ``text`` by its lemma.

    Uses the module-level zeyrek ``analyzer``. Its ``lemmatize`` result for one
    word is a list of ``(surface, [lemma, ...])`` tuples, not a string, so the
    first lemma of the first analysis is picked. Words without an analysis
    (including empty strings produced by repeated spaces) are kept unchanged.
    """
    lemmas = []
    for word in text.split(' '):
        analyses = analyzer.lemmatize(word) if word else []
        if analyses and analyses[0][1]:
            lemmas.append(analyses[0][1][0])
        else:
            lemmas.append(word)
    return ' '.join(lemmas)

def preprocess_data(text):
    """Clean a Turkish text, remove stopwords and lemmatize the remaining words.

    Relies on ``clean_text`` and the module-level ``stop_words`` list.
    """
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Reduce words to their base form with the Turkish analyzer. An English
    # Snowball stemmer does not fit Turkish morphology.
    text = stemm_text(text)

    return text

In [None]:
tur_train_df['Text_Clean'] = tur_train_df['Text'].apply(preprocess_data)
tur_train_df.head()

In [None]:
tr_clean_df = tur_train_df['Text_Clean']

In [None]:
from gensim.models.phrases import Phrases, Phraser

# Tokenize the cleaned articles. Missing entries (None/NaN) are skipped because
# they cannot be split.
sent = [row.split() for row in df_clean['clean'].dropna()]
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [None]:
bigram = Phraser(phrases)
sentences = bigram[sent]

In [None]:
from collections import defaultdict  # For word frequency

word_freq = defaultdict(int)
# The loop names are chosen so that `sent` (the tokenized corpus that the
# bigram cell reads) is not rebound to a single sentence, which would break
# re-running that cell.
for sentence_tokens in sentences:
    for token in sentence_tokens:
        word_freq[token] += 1
len(word_freq)

In [None]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

In [None]:
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() # Count the number of cores in a computer

# Leave one core free, but never request fewer than one worker
# (cores - 1 is 0 on a single-core machine).
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=max(1, cores - 1))

In [None]:
w2v_model.build_vocab(sentences, progress_per=10000)

In [None]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
# NOTE(review): the model above is built with `vector_size=`, the gensim 4
# parameter name. init_sims() is reported as deprecated there -- confirm
# whether this call is still needed, and that overwriting the raw vectors
# (replace=True) is intended.
w2v_model.init_sims(replace=True)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
def tsnescatterplot(model, word, list_names):
    """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
    its list of most similar words, and a list of words.

    Args:
        model: Trained model exposing its vectors as ``model.wv``.
        word: Query word (drawn in red). Must be in the model's vocabulary.
        list_names: Additional words to plot (drawn in green). Each must be in
            the vocabulary.

    The words returned by ``model.wv.most_similar([word])`` are drawn in blue.
    Nothing is returned; the figure is drawn on a new matplotlib figure.
    """
    word_labels = [word]
    color_list  = ['red']

    # Vectors are collected in a list and stacked once. The vector size comes
    # from the model itself, so it is not tied to 300 dimensions.
    vectors = [model.wv[word]]

    # gets list of most similar words and adds the vector of each one
    for similar_word, _score in model.wv.most_similar([word]):
        vectors.append(model.wv[similar_word])
        word_labels.append(similar_word)
        color_list.append('blue')

    # adds the vector for each of the words from list_names
    for wrd in list_names:
        vectors.append(model.wv[wrd])
        word_labels.append(wrd)
        color_list.append('green')

    arrays = np.asarray(vectors, dtype='f')
    n_samples, n_features = arrays.shape

    # Reduces the dimensionality to (at most) 15 dimensions with PCA.
    # PCA cannot produce more components than there are samples or features.
    n_components = min(15, n_samples, n_features)
    reduc = PCA(n_components=n_components).fit_transform(arrays)

    # Finds t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)

    # t-SNE needs the perplexity to be smaller than the number of samples.
    perplexity = min(10, max(1, n_samples - 1))
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # Basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                 }
                    )

    # Adds annotations one by one with a loop
    for line in range(0, df.shape[0]):
         p1.text(df["x"][line],
                 df['y'][line],
                 '  ' + df["words"][line].title(),
                 horizontalalignment='left',
                 verticalalignment='bottom', size='medium',
                 color=df['color'][line],
                 weight='normal'
                ).set_size(15)


    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    plt.title('t-SNE visualization for {}'.format(word.title()))

In [None]:
import numpy as np

tsnescatterplot(w2v_model, 'zionism', [i[0] for i in w2v_model.wv.most_similar(negative=["zionism"])])