In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys, os, django

sys.path.append('/home/max/software/django-tmv/tmv_mcc-apsis/BasicBrowser')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BasicBrowser.settings")
django.setup()

from tmv_app.models import *

django.db.connection.close()

In [6]:
# Get the topic-term scores associated with the run_id from our database
run_id = 2927 
tts = pd.DataFrame(TopicTerm.objects.filter(run_id=run_id).values('topic_id','term_id','score')).sort_values('topic_id')
tts.head()

Unnamed: 0,topic_id,term_id,score
79780,213122,5107,0.013223
51626,213122,140,0.001642
47101,213122,7513,0.002288
65386,213122,55,0.038242
25044,213122,4026,0.013613


In [17]:
# Put them into a matrix, filling empty cells with 0
H = (tts
     .pivot(index="topic_id",columns="term_id")
     .fillna(0).values
    )
H.shape

(70, 6409)

In [18]:
# Get the vocabulary
vocab = pd.DataFrame(Term.objects.filter(id__in=tts['term_id'].unique()).values('title','id'))
vocab

Unnamed: 0,title,id
0,account,1
1,activ,2
2,addit,3
3,affect,4
4,also,5
...,...,...
6404,sandyrel,2396893
6405,particulatematterassoci,2420430
6406,particulatematterbound,2420431
6407,particulatematterparticulatematt,2420432


In [20]:
# Load the data we want to transform into the old model space
df = pd.read_csv('data/included_docs.csv')

# Get the docs with more than 10 words, and strip out copyright boilerplate
df['n_words'] = df['content'].str.findall(r'(\w+)').str.len()
docs = df[df['n_words']>10]
docs['content'] = [x.split("Copyright (C)")[0] for x in docs['content']]
docs['content'] = [re.split("\([C-c]\) [1-2][0-9]{3} Elsevier", x)[0] for x in docs['content']]
docs['content'] = [x.split("Published by Elsevier")[0] for x in docs['content']]
docs['content'] = [x.split("Copyright. (C)")[0] for x in docs['content']]
docs['content'] = [re.split("\. \(C\) [1-2][0-9]{3} ", x)[0] for x in docs['content']]
docs['content'] = [re.split("\. \(C\) Copyright", x)[0] for x in docs['content']]

ids = docs['id']
abstracts = docs['content']

In [21]:
from nltk.stem import SnowballStemmer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string

def tokenize(text):
    transtable = {ord(c): None for c in string.punctuation + string.digits}
    tokens = nltk.word_tokenize(text.translate(transtable))
    tokens = [i for i in tokens if len(i) > 2 and len(i) < 100]
    return tokens

class snowball_stemmer(object):
    def __init__(self):
        self.stemmer = SnowballStemmer("english")
    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in tokenize(doc)]

tokenizer = snowball_stemmer()
stoplist = set(nltk.corpus.stopwords.words("english"))

# Vectorize the data into a document-term matrix tfidf
vectorizer = TfidfVectorizer(
    max_df=stat.max_df,
    min_df=stat.min_freq,
    ngram_range=(stat.ngram,stat.ngram),
    tokenizer=tokenizer,
    stop_words=stoplist
)
tfidf = vectorizer.fit_transform(abstracts).todense()
vocab_list = vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


In [22]:
# Initialise an empty matrix with the vocab in the original topic model
X = np.matrix(np.zeros((len(docs),H.shape[1])))

# Fill this matrix with our data from the new documents
vocab_translate = {row['title']: i for i, row in vocab.iterrows()}
for i, word in enumerate(vocab_list):
    try:
        term_idx = vocab_translate[word]
    except KeyError as e:
        error = f"couldn't find {e}"
    X[:,term_idx] = tfidf[:,i]
    
X.shape

(22054, 6409)

In [23]:
from sklearn.decomposition import NMF

# Initialise an NMF classifier with the topic-terms from the run before 
clf = NMF(
    n_components=ttm.shape[0],
    init="custom",
    max_iter=5
)
clf.components_ = H
clf.n_components_ = H.shape[0]

# Transform the new data into the topic model space
W = clf.transform(
    X
)



In [43]:
# Save the nonzero elements in a long csv

dtm = pd.DataFrame(W)
dtm.columns = tts.topic_id.unique()
dtm.index = ids
dtm = (dtm
       .reset_index()
       .rename(columns={"id":"doc_id"})
       .melt(id_vars="doc_id", var_name="topic_id",value_name="score")
       .query("score>0")
      )
dtm.to_csv('data/doc_topic_scores.csv',index=False)
dtm

Unnamed: 0,doc_id,topic_id,score
0,4963912,213122,0.000206
1,4729030,213122,0.000412
3,678134,213122,0.010642
6,3388074,213122,0.002163
7,670073,213122,0.019230
...,...,...,...
1543769,3347158,213191,0.001954
1543772,4959920,213191,0.020337
1543775,5062214,213191,0.018152
1543777,568238,213191,0.000350
