# ML 101

## Text Mining

In this project, we will explore a simplified version of the DPWC model to draw a graph of similar personalities.

The data will be consumed from Twitter, and for each search, we create a DPW profile with the most relevant words.

Afterwards, we apply matrix factorization to estimate the "real" value of the words with zero weight.

Finally, we use a thresholding method to decide which edges are added to the graph.

In [None]:
!pip install pyvis gensim python-twitter git+git://github.com/mariolpantunes/nmf@main#egg=nmf git+git://github.com/mariolpantunes/uts@main#egg=uts --upgrade

In [None]:
import pprint
import twitter
import numpy as np

from IPython.core.display import display, HTML
from pyvis.network import Network

import uts.thresholding as thres

import nltk

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import RegexpTokenizer

from nmf.nmf import nmf_mu

import math

# Shared pretty-printer used to dump intermediate results.
pp = pprint.PrettyPrinter(indent=2)

# Fetch the NLTK data packages up front; 'stopwords' and 'rslp' back the
# stop-word lists and the RSLP stemmer imported above.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4', 'rslp'):
    nltk.download(resource)

In [None]:
# Split raw text into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Portuguese and English stop words, plus 'https' (presumably what is left
# of links once the text is split on non-word characters).
stop_words = {'https'}
for language in ('portuguese', 'english'):
    stop_words.update(stopwords.words(language))

stemmer = RSLPStemmer()


# Profiles keyed by search term; filled in when the searches are run.
db = {}


def cosine_similarity(a,b):
    """Return the cosine of the angle between the vectors *a* and *b*.

    The result is ``dot(a, b) / (|a| * |b|)``. When either vector has zero
    norm the ratio is undefined (0/0); in that case 0.0 is returned instead
    of NaN, so that an empty profile counts as "not similar" and does not
    propagate NaN into later computations.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def generate_ngrams(tokens, n=2):
    """Return every run of *n* consecutive tokens, joined by a single space.

    The result is empty when *tokens* holds fewer than *n* items.
    """
    # One copy of the sequence per position inside the n-gram, each shifted
    # one step further than the previous one.
    shifted = [tokens[offset:] for offset in range(n)]
    # zip stops at the shortest copy, so only complete n-grams are produced.
    return list(map(" ".join, zip(*shifted)))


def tokenize(text):
    """Split *text* into stemmed, lower-cased tokens.

    The module-level ``tokenizer`` splits the text; a token is kept only if
    it is purely alphabetic, longer than three characters and not a stop
    word. Kept tokens are lower-cased and passed through the module-level
    ``stemmer``.

    The stop-word test is done on the lower-cased token, so capitalised
    stop words are filtered out as well.
    """
    tokens = []
    for w in tokenizer.tokenize(text):
        lowered = w.lower()
        if lowered in stop_words or not w.isalpha() or len(w) <= 3:
            continue
        tokens.append(stemmer.stem(lowered))
    return tokens


def get_term_frequency(corpus, p=0.2):
    """Return the most frequent repeated terms of *corpus*.

    Args:
        corpus: iterable of hashable terms (e.g. a list of tokens).
        p: fraction of the repeated terms to keep; the number kept is
            ``int(len(repeated) * p)``, truncated towards zero. A negative
            value keeps nothing.

    Returns:
        A list of ``(term, count)`` tuples sorted by descending count.
        Terms seen only once are discarded before the cut is applied; terms
        with equal counts stay in first-seen order.
    """
    from collections import Counter

    # most_common() sorts by descending count and is stable for ties.
    ranked = [(term, count)
              for term, count in Counter(corpus).most_common()
              if count > 1]

    # Clamp at zero: a negative bound would slice from the end of the list
    # instead of returning an empty result.
    limit = max(0, int(len(ranked) * p))
    return ranked[:limit]

In [None]:
import os

# Credentials are read from the environment so that no secret lives in the
# notebook. A missing variable raises KeyError before any request is made.
# Any credentials that were previously committed should be revoked.
api = twitter.Api(consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
                  consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'],
                  access_token_key=os.environ['TWITTER_ACCESS_TOKEN_KEY'],
                  access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])

# Search terms; each one becomes a profile (and later a node of the graph).
terms = ["António Guterres", "Aníbal Cavaco Silva",
    "Mário Soares",
    "Pedro Passos Coelho",
    "José Manuel Durão Barroso",
    "José Sócrates",
    "Pedro Santana Lopes"]

for t in terms:
    # db acts as a cache: re-running the cell does not repeat a search.
    if t in db:
        continue
    # NOTE(review): the search endpoint may cap the number of results below
    # the requested count=300 -- confirm against the python-twitter docs.
    results = api.GetSearch(term=t, count=300, lang='pt')
    corpus = []
    for r in results:
        corpus.extend(tokenize(r.text.lower()))
    tf = get_term_frequency(corpus)
    db[t] = tf

# Pretty-print the dict itself; formatting it into a string first would
# leave the printer with nothing to lay out.
pp.pprint(db)

In [None]:
# create vector matrix
# Vocabulary: every word that appears in at least one profile.
vocab = set()
for profile in db.values():
    vocab.update(word for word, _ in profile)

# Give each word one fixed column so that a word is stored in the same
# column for every profile; sorting makes the layout reproducible.
column_of = {word: idx for idx, word in enumerate(sorted(vocab))}

# One row per profile (in db order), one column per vocabulary word; words
# absent from a profile keep a weight of zero.
X = np.zeros((len(db), len(vocab)))
for r, profile in enumerate(db.values()):
    for word, count in profile:
        X[r, column_of[word]] = count

rows, cols = X.shape
# Factorization rank: half the number of profiles, rounded up.
k = int(math.ceil(rows/2.0))

# nmf_mu's 4-tuple is unpacked here as (reconstruction, W, H, cost).
Xr, W, H, cost = nmf_mu(X, k=k, seed=42)

# Retry with other seeds and keep the run with the lowest cost.
seeds = [3, 5, 7, 11, 13]
for s in seeds:
    Xt, Wt, Ht, costt = nmf_mu(X, k=k, seed=s)
    if costt < cost:
        # Replace the whole result so that Xr, W, H and cost always come
        # from the same run.
        Xr, W, H, cost = Xt, Wt, Ht, costt

# compute the distance matrix (graph)
# D[i][j] holds the cosine similarity between rows i and j of Xr.
D = np.identity(len(db))
n_rows = len(Xr)
for i in range(n_rows):
    # The measure is symmetric, so each pair is computed once (j >= i) and
    # the value is mirrored into both cells.
    for j in range(i, n_rows):
        similarity = cosine_similarity(Xr[i], Xr[j])
        D[i][j] = similarity
        D[j][i] = similarity

# Threshold: the 75th percentile of all entries of D (diagonal included).
flat_distance = D.ravel()
# Alternative kept for reference: t = thres.isodata(flat_distance)
t = np.percentile(flat_distance, 75)

# Zero out every entry below the threshold; what remains non-zero is what
# the graph will show as an edge.
below_threshold = D < t
D[below_threshold] = 0

In [None]:
# NOTE(review): recent pyvis releases may need Network(notebook=True) for
# show() to work inside a notebook -- confirm against the installed version.
net = Network()

# Create the nodes: one per profile, numbered in db order and labelled with
# the search term.
for i, k in enumerate(db):
    net.add_node(i, label=k)

# Create the edges. D is symmetric, so each unordered pair is visited once
# (j > i); this avoids self-loops and adding the same edge twice.
size = len(D)
for i in range(size - 1):
    for j in range(i + 1, size):
        if D[i][j] > 0:
            net.add_edge(i, j, weight=D[i][j])


net.show('network.html')
display(HTML('network.html'))