In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_read_xml as pdx 
import re
import sys
from src import helper

import gensim
from gensim.test.utils import datapath
from gensim import utils
import gensim.models

from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go

from nltk import ngrams

from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import src.stop_words




helper functions loaded successfully!


In [2]:
%%time
#Some offending characters had to be removed manually
df = pdx.read_xml('data/Sabanews_utf_8.xml', ['SabanewsData', 'Sabanews'],
                 root_is_rows=False)

CPU times: user 7.44 s, sys: 949 ms, total: 8.39 s
Wall time: 9.05 s


In [3]:
df = helper.date_cleaner(df)

In [4]:
%%time
df['Text'] = df['Text'].apply(lambda x: helper.clean_text(x))

CPU times: user 11.7 s, sys: 317 ms, total: 12 s
Wall time: 13.2 s


In [5]:
df = helper.prep_df_text(df)

In [None]:
df

In [None]:
%%time
#df = helper.lda_vectorizer(df, 'clean_text', 6, 1000)

In [None]:
docs = df['clean_text']

tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=1000,
                                    stop_words=src.stop_words.ar_stop_words)

tf = tf_vectorizer.fit_transform(docs)
tf_feature_names = tf_vectorizer.get_feature_names() # theses are the words in our bag of words

In [None]:
%%time
lda = LatentDirichletAllocation(n_components=6,
                                    max_iter=5,
                                    learning_method='online',
                                    random_state=0,
                                    n_jobs=-1)
lda.fit(tf)

In [None]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

In [None]:
num_top_words = 20
display_topics(lda, tf_feature_names, num_top_words)

In [54]:
df['totalwords'] = [len(x.split()) for x in df['clean_text'].tolist()]

In [55]:
df['totalwords'].mean()

171.15499896906098

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
ax.hist(df['totalwords'], bins = 50)
plt.axvline(x=df['totalwords'].mean(), linestyle='--', color='red')
plt.xlabel('Number of words per article')
plt.ylabel('Frequency of Articles in Yemen Corpus')
plt.title('Histogram of Article Length (in words) for Yemeni Articles')
ax.legend(['mean article length', 'frequency values'])
plt.savefig('images/histogram.png')

In [6]:
%%time 
#make sure model downloaded from https://github.com/bakrianoo/aravec is in local directory
t_model = gensim.models.Word2Vec.load('data/full_grams_cbow_100_wiki.mdl')

CPU times: user 18.8 s, sys: 1.28 s, total: 20.1 s
Wall time: 23 s


In [7]:
year_dict = helper.text_by_year(df)

In [8]:
%%time
model_dict = helper.token_and_model(year_dict)

CPU times: user 4min 20s, sys: 4.53 s, total: 4min 25s
Wall time: 2min 23s


In [9]:
model_dict.items()

dict_items([('model_df_2011', <gensim.models.word2vec.Word2Vec object at 0x7fb19169f4c0>), ('model_df_2009', <gensim.models.word2vec.Word2Vec object at 0x7fb18f1f7190>), ('model_df_2010', <gensim.models.word2vec.Word2Vec object at 0x7fb168b48e50>), ('model_df_2013', <gensim.models.word2vec.Word2Vec object at 0x7fb1689296a0>), ('model_df_2014', <gensim.models.word2vec.Word2Vec object at 0x7fb168b48d90>), ('model_df_2012', <gensim.models.word2vec.Word2Vec object at 0x7fb168313c70>)])

In [10]:
final_df = pd.read_csv('final_df.csv')

In [None]:
%%time
#final_result = helper.reduce_dimensions(model_dict)

In [None]:
#final_result.to_csv('final_df.csv')

In [None]:
results = set()
df['clean_text'].str.lower().str.split().apply(results.update)
print(len(results))

In [None]:
word = 'ملك'
n = 10
year_a = 2009
year_b = 2014

In [None]:
for m in model_dict.items():
    if year_a == int(m[0][-4:]):
        model_a = m[1]
    if year_b == int(m[0][-4:]):
        model_b = m[1]
    else:
        None 

In [None]:
#generates list of similar words based on input word for each year(a or b) model
sim_list_a = model_a.wv.most_similar(positive=[word], topn=n)
sim_list_b = model_b.wv.most_similar(positive=[word], topn=n)

#holders for values to be graphed
label_list_a = []
x_list_a = []
y_list_a = []

label_list_b = []
x_list_b = []
y_list_b = []

In [None]:

for i, v in enumerate(sim_list_a):
    curr_label = sim_list_a[i][0]
    label_list_a.append(curr_label)
    x_list_a.append(final_df.loc[final_df['labels'] == curr_label, 'x_vals'].iloc[0])
    y_list_a.append(final_df.loc[final_df['labels'] == curr_label, 'y_vals'].iloc[0])
    
for i, v in enumerate(sim_list_b):
    curr_label = sim_list_b[i][0]
    label_list_b.append(curr_label)
    x_list_b.append(final_df.loc[final_df['labels'] == curr_label, 'x_vals'].iloc[0])
    y_list_b.append(final_df.loc[final_df['labels'] == curr_label, 'y_vals'].iloc[0])

In [None]:
fig = make_subplots(rows=1, cols=2)
    
trace_a = go.Scatter(x=x_list_a, y=y_list_a, mode='text', text=label_list_a, 
                     name=f"{word}, {year_a}")
trace_b = go.Scatter(x=x_list_b, y=y_list_b, mode='text', text=label_list_b, 
                     name=f"{word}, {year_b}")
 
fig.add_trace((trace_a), row=1, col=1)
fig.add_trace((trace_b), row=1, col=2)

fig.update_layout(height=600, width=800, 
    title_text=f"Word Embedding Comparison: {n} nearest neighbors for {word}, between {year_a} and {year_b}")
                
fig.show()

In [52]:
word = 'جريمه'
n = 10
year_a = 2009
year_b = 2014

In [53]:
helper.plot_n_closest(final_df, model_dict, word, 10, 2009, 2014).write_image('images/muqaqimah_2009_2014.png')

AttributeError: 'NoneType' object has no attribute 'write_image'

In [None]:
def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    indices = list(range(len(labels)))
    #selected_indices = random.sample(indices, 25)
    for i in indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))


In [None]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')

In [None]:
plot_with_plotly(x_vals[300:325], y_vals[300:325], labels[300:325])