# Importing data and libraries

In [None]:
# Download the Twitter15 and Twitter16 CSV datasets with gdown; each argument
# below is the file ID of one dataset.
# NOTE(review): presumably these produce t15_text_n2v.csv and t16_text_n2v.csv,
# which are read further down — confirm which ID maps to which file.
!gdown 1SaSq8kwvNmxq2HoQBenhXC3ejM8BU70d
!gdown 1uGv2afj67P9BGEMwFPyv_IopjMzaqMuG

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import gensim
# Fetch the NLTK 'stopwords' corpus so that stopwords.words() can load it.
nltk.download('stopwords')
# English stop-word list; remove_stopwords() below filters tweet words against it.
stop_words = stopwords.words('english')

# Text Preprocessing for Word2Vec



```
Columns in the downloaded datasets (loaded below as d1 for Twitter15 and d2 for Twitter16):
tweet_id -> id of the source tweet
text -> text content of the source tweet
label -> label of the source tweet (True/False)
n2v -> node2vec embedding of the source tweet with p=1,q=1 (not important for this part)
```



In [None]:
# Load the Twitter15 (d1) and Twitter16 (d2) CSV files, in that order.
d1, d2 = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('t15_text_n2v.csv', 't16_text_n2v.csv')
)

# Raw tweet text of each dataset. It is preprocessed further down into the
# list-of-token-lists form that Word2Vec consumes.
content1, content2 = d1['text'], d2['text']

def remove_punctuations(data):
    """Return ``data`` with every character that is neither a word character
    (letter, digit, underscore) nor whitespace removed."""
    return re.sub(r'[^\w\s]', r'', data)

def remove_url(data):
    """Remove the standalone ``URL`` placeholder token from a tweet.

    Only whole tokens are removed, so upper-case words that merely contain
    the letters "URL" (e.g. "CURL") are left intact. Several placeholders
    fused together (e.g. "URLURL", which can appear once the punctuation
    between them has been stripped) are removed as one token.

    Args:
        data: Tweet text as a string.

    Returns:
        The text with placeholder tokens deleted. Surrounding spaces are kept;
        collapsing them is left to ``remove_double_spaces``.
    """
    # \b anchors keep the match from eating the inside of a longer word.
    url_tag = re.compile(r'\b(?:URL)+\b')
    return url_tag.sub('', data)

def remove_double_spaces(data):
    """Collapse every run of consecutive space characters into one space.

    Only the space character is affected; tabs and newlines are untouched,
    and a leading or trailing run is reduced to a single space, not stripped.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', data)

def get_tokens(data):
    """Split a preprocessed tweet into a list of word tokens.

    Splits on any run of whitespace, so an empty (or all-whitespace) tweet
    yields ``[]`` rather than ``['']`` and repeated spaces never produce
    empty-string tokens. Such tokens would otherwise be counted as a word
    when the Word2Vec vocabulary is built.

    Args:
        data: Tweet text as a string.

    Returns:
        List of non-empty tokens in their original order.
    """
    return data.split()

def remove_stopwords(data, stopword_list=None):
    """Drop stop words from a tweet and re-join the rest with single spaces.

    Matching is case-insensitive: the stop-word list is lowercase, so a
    case-sensitive comparison would let capitalised stop words such as
    "The" or "I" slip through. Words that are kept retain their original
    casing.

    Args:
        data: Tweet text as a string.
        stopword_list: Optional iterable of stop words. Defaults to the
            module-level ``stop_words`` list.

    Returns:
        The remaining words joined by single spaces (empty string if none).
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Build the lookup set once per call instead of scanning a list per word.
    blocked = {entry.lower() for entry in stopword_list}
    return ' '.join(word for word in data.split() if word.lower() not in blocked)

In [None]:
# d1 and d2 were already loaded from the CSV files in the earlier cell, so the
# frames are reused here instead of being read from disk a second time.
# Re-running this cell resets content1/content2 to the raw tweet text, which
# the preprocessing cell below overwrites in place.
content1 = d1['text']
content2 = d2['text']

In [None]:
# Clean both corpora with the same steps, in this order: strip punctuation,
# drop the URL placeholder, collapse repeated spaces, remove stop words, and
# finally split each tweet into a list of tokens.
content1 = (
    content1
    .apply(remove_punctuations)
    .apply(remove_url)
    .apply(remove_double_spaces)
    .apply(remove_stopwords)
    .apply(get_tokens)
)

content2 = (
    content2
    .apply(remove_punctuations)
    .apply(remove_url)
    .apply(remove_double_spaces)
    .apply(remove_stopwords)
    .apply(get_tokens)
)

# Use Word2Vec model from Gensim for word embeddings

In [None]:
# Default alpha=0.025, epochs=5, vector_size=100
def _fit_skipgram(sentences):
    """Train a Word2Vec model on ``sentences`` (an iterable of token lists).

    Uses window=10, min_count=4, sg=1 and workers=1; every other
    hyperparameter is left at the gensim default.
    """
    w2v = gensim.models.Word2Vec(window=10, min_count=4, sg=1, workers=1)
    w2v.build_vocab(sentences)
    w2v.train(sentences, total_examples=w2v.corpus_count, epochs=w2v.epochs)
    return w2v


# One independent model per dataset: Twitter15 first, then Twitter16.
model1 = _fit_skipgram(content1)
model2 = _fit_skipgram(content2)

In [None]:
# Number of rows for each embedding matrix: one per vocabulary word plus one
# extra, because row 0 of the matrices built below is left as zeros.
vocab1_size, vocab2_size = (
    len(trained.wv.key_to_index) + 1 for trained in (model1, model2)
)
(vocab1_size, vocab2_size)

# Create an Embedding Matrix for each dataset and save them

In [None]:
def _embedding_matrix(keyed_vectors, n_rows):
    """Build a dense embedding lookup table from trained word vectors.

    Args:
        keyed_vectors: The ``.wv`` attribute of a trained Word2Vec model.
        n_rows: Number of rows to allocate; must be at least the vocabulary
            size plus one.

    Returns:
        A float array of shape ``(n_rows, keyed_vectors.vector_size)``. Row
        ``idx + 1`` holds the vector of the word whose gensim index is
        ``idx``; row 0 stays all zeros.
    """
    # Take the width from the model rather than hard-coding 100, so the
    # matrix stays correct if vector_size is ever changed at training time.
    matrix = np.zeros((n_rows, keyed_vectors.vector_size))
    # Use gensim's own index for the row instead of a manual counter, which
    # ties each row to the model's word index explicitly.
    for word, idx in keyed_vectors.key_to_index.items():
        matrix[idx + 1] = keyed_vectors[word]
    return matrix


e1 = _embedding_matrix(model1.wv, vocab1_size)
e2 = _embedding_matrix(model2.wv, vocab2_size)

In [None]:
# Write each embedding matrix to its own .npy file (Twitter15, then Twitter16).
for out_path, emb_matrix in (
    ('t15_w2v_emb_matrix.npy', e1),
    ('t16_w2v_emb_matrix.npy', e2),
):
    np.save(out_path, emb_matrix)