In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_read_xml as pdx
import xml.etree.ElementTree as ET
from lxml import objectify 
import re

import gensim
from nltk import ngrams

from gensim.test.utils import datapath
from gensim import utils
import gensim.models

from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE 
import plotly


In [None]:
#Some offending characters had to be removed manually
df = pdx.read_xml('data/Sabanews_utf_8.xml', ['SabanewsData', 'Sabanews'],
                 root_is_rows=False)

In [33]:
df.head(5)

Unnamed: 0,ID,URL,Headline,Dateline,Text
0,SBN_ARB_0000001,http://www.sabanews.net/ar/news200024.htm,الكونجرس الأمريكي يطالب المجتمع الدولي دعم الي...,08/ديسمبر/2009,08/ديسمبر/2009] واشنطن ـ سبأنت: طالب الكونجرس ...
1,SBN_ARB_0000002,http://www.sabanews.net/ar/news200066.htm,مناقشة إستراتيجية التواصل الخاصة بالأجندة الوط...,08/ديسمبر/2009,08/ديسمبر/2009] صنعاء ـ سبأنت: جرى اليوم بوزار...
2,SBN_ARB_0000003,http://www.sabanews.net/ar/news200073.htm,نائب وزير التخطيط يلتقي بعثة الوكالة اليابانية...,08/ديسمبر/2009,08/ديسمبر/2009] صنعاء ـ سبأنت: أشاد نائب وزير ...
3,SBN_ARB_0000004,http://www.sabanews.net/ar/news200090.htm,الناطق الرسمي: الرؤية الحكيمة للدولة في التعام...,08/ديسمبر/2009,08/ديسمبر/2009] صنعاء – سبأنت : أكد الناطق الر...
4,SBN_ARB_0000005,http://www.sabanews.net/ar/news200092.htm,نوكيا تطرح خدمة الخرائط والملاحة في الأسواق,08/ديسمبر/2009,08/ديسمبر/2009] القاهرة ـ سبأنت: عبداللطيف الك...


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92149 entries, 0 to 92148
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        92149 non-null  object
 1   URL       92149 non-null  object
 2   Headline  92149 non-null  object
 3   Dateline  92149 non-null  object
 4   Text      92149 non-null  object
dtypes: object(5)
memory usage: 3.5+ MB


In [35]:
def clean_text(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']  
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
        
    text = text.strip()
    
    return text

In [39]:
df['clean_text'] = df['Text'].apply(lambda x: clean_text(x))

In [66]:
#removes date from main body of text
df['new_clean_text'] = df['clean_text'].apply(lambda x: x[x.find(']')+1:])

In [83]:
df['new_clean_text']

0         واشنطن  سبانت: طالب الكونجرس الامريكي الاداره...
1         صنعاء  سبانت: جري اليوم بوزاره التخطيط والتعا...
2         صنعاء  سبانت: اشاد نائب وزير التخطيط والتعاون...
3         صنعاء – سبانت : اكد الناطق الرسمي باسم الحكوم...
4         القاهره  سبانت: عبداللطيف الكوماني عرضت شركه ...
                               ...                        
92144     نيامي  سبانت: وقعت حكومه النيجر ومجموعه اريفا...
92145     نيودلهي  سبانت: ادي الزعيم القومي الهندوسي نا...
92146     القاهره  سبانت: اغلقت صناديق الاقتراع في الان...
92147     لندن  سبانت: اغلقت الاسهم الاوروبيه خلال تعام...
92148     المحويت  سبانت: احتفلت كليه المجتمع بمحافظه ا...
Name: new_clean_text, Length: 92149, dtype: object

In [81]:
#make sure model downloaded from https://github.com/bakrianoo/aravec is in local directory
t_model = gensim.models.Word2Vec.load('data/full_grams_cbow_100_wiki.mdl')

In [85]:
class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        #corpus_path = datapath('data/f')
        for line in df['new_clean_text']:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

In [87]:
sentences = MyCorpus()

In [88]:
model  = gensim.models.Word2Vec(sentences)

In [93]:
pairs = [
    #('يهود', 'إسرائيل'),   # a minivan is a kind of car
    #('إيران', 'تهديد'),   # still a wheeled vehicle
    ('حرب', 'قمح')  # ok, no wheels, but still a vehicle
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, model.wv.similarity(w1, w2)))

'حرب'	'قمح'	0.08


In [97]:
for index, word in enumerate(model.wv.index_to_key):
    if index == 100:
        break
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

word #0/82165 is في
word #1/82165 is من
word #2/82165 is علي
word #3/82165 is الي
word #4/82165 is ان
word #5/82165 is التي
word #6/82165 is سبا
word #7/82165 is سبانت
word #8/82165 is اليوم
word #9/82165 is عن
word #10/82165 is مع
word #11/82165 is الذي
word #12/82165 is خلال
word #13/82165 is رئيس
word #14/82165 is اليمن
word #15/82165 is بين
word #16/82165 is العام
word #17/82165 is صنعاء
word #18/82165 is هذه
word #19/82165 is ما
word #20/82165 is هذا
word #21/82165 is عام
word #22/82165 is بعد
word #23/82165 is وفي
word #24/82165 is اليمنيه
word #25/82165 is محمد
word #26/82165 is وزير
word #27/82165 is العامه
word #28/82165 is كما
word #29/82165 is الدكتور
word #30/82165 is الوطني
word #31/82165 is عدد
word #32/82165 is مجلس
word #33/82165 is وقال
word #34/82165 is المحافظه
word #35/82165 is عدن
word #36/82165 is اهميه
word #37/82165 is العمل
word #38/82165 is اللجنه
word #39/82165 is قبل
word #40/82165 is الرئيس
word #41/82165 is عبد
word #42/82165 is المتحده
word #43/82165 is ا

In [105]:
print(model.wv.most_similar(positive=['فرس'], topn=10))

KeyError: "Key 'فرس' not present"

In [107]:
def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

In [108]:
#this took close to 45 minutes on full power
x_vals, y_vals, labels = reduce_dimensions(model)

In [109]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')

In [110]:
def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

82165

In [119]:
plot_function(x_vals[100:125], y_vals[100:125], labels[100:125])