In [7]:
import glob
import gzip
import json
import os
import shutil

import numpy as np
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Abstract preprocessing 
# from nltk.stem import PorterStemmer 
# from nltk.tokenize import word_tokenize 
# from nltk.stem import WordNetLemmatizer 
# import gensim

# nltk.download('wordnet')

def save_and_get_authors_df(dataframe, outdir):
    """Flatten the per-paper author lists into one table and save it as CSV.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have an ``authors`` column whose cells are iterables of author
        records (dicts carrying an ``ids`` list).
    outdir : str
        Prefix that ``'authors.csv'`` is appended to verbatim, so it has to
        end with a path separator in order to name a directory.

    Returns
    -------
    pandas.DataFrame
        One row per author record that has at least one id and no missing
        field. ``ids`` holds the first id of each author as an integer
        column (the original chained ``.iloc[:]`` assignment left the column
        with ``object`` dtype).
    """
    authors = []
    for paper_authors in dataframe.authors:
        authors.extend(paper_authors)

    if not authors:
        # Without any record there is no 'ids' column to clean; emit an
        # empty, correctly typed table instead of failing on dropna().
        author_df = pd.DataFrame({'name': pd.Series(dtype=object),
                                  'ids': pd.Series(dtype='int64')})
        author_df.to_csv(outdir + 'authors.csv')
        return author_df

    author_df = pd.DataFrame(authors).dropna(subset=['ids'])

    # Keep only the first id of every author; an empty id list becomes NaN
    # here and is removed by the dropna() that follows.
    author_df = author_df.assign(ids=author_df['ids'].str[0]).dropna()

    # Replace the whole column so the integer dtype actually sticks.
    author_df = author_df.assign(ids=author_df['ids'].astype(int))

    author_df.to_csv(outdir + 'authors.csv')
    return author_df

def _title_is_language(title, lang):
    """Return True when langdetect identifies ``title`` as language ``lang``."""
    if not isinstance(title, str) or not title.strip():
        return False
    try:
        return detect(title) == lang
    except LangDetectException:
        # langdetect raises when the text has no usable features
        # (e.g. only digits or punctuation); treat that as "not `lang`".
        return False


def get_train_df(dir_path, limit=-1, lang='en', max_lines=102, max_rows=20):
    """Load papers from gzipped JSON-lines corpus files into a DataFrame.

    Parameters
    ----------
    dir_path : str
        Prefix that the glob pattern ``s2-corpus-*.gz`` is appended to
        verbatim, so it has to end with a path separator to name a directory.
    limit : int or None
        Number of files (in sorted order) to read. ``-1`` or ``None`` reads
        every file that was found.
    lang : str
        Language code the detected title language must equal for a row to
        be kept.
    max_lines : int or None
        Maximum number of lines read from each file; ``None`` reads whole
        files. The default keeps the cap this function always applied.
    max_rows : int or None
        Maximum number of rows (after the abstract filter) that are passed on
        to language detection; ``None`` keeps all rows. The default keeps the
        cap this function always applied.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows that have a non-empty abstract and a
        title detected as ``lang``. Empty when nothing could be read.
    """
    train_files = sorted(glob.glob(dir_path + "s2-corpus-*.gz"))
    if limit is None or limit == -1:
        # "no limit" means every file, including the last one
        selected_files = train_files
    else:
        selected_files = train_files[:limit]
    print("Found {} files. Reading {}.".format(len(train_files), len(selected_files)))

    # Load the raw records of all selected files
    lines = []
    for filepath in selected_files:
        print("Reading {}".format(filepath))
        with gzip.open(filepath, 'rb') as f_in:
            for cnt, line in enumerate(f_in):
                if max_lines is not None and cnt >= max_lines:
                    break
                try:
                    lines.append(json.loads(line))
                except ValueError:
                    # malformed JSON or undecodable bytes: skip just this line
                    continue
    print('read in {}. entities'.format(len(lines)))

    # Create dataframe
    print('Creating training DataFrame')
    train_df = pd.DataFrame(lines)
    if train_df.empty:
        # No records means no columns to filter on; hand back the empty frame.
        print('Complete!')
        return train_df

    # remove any entities without abstracts (missing or empty string)
    print('Removing null abstracts')
    abstracts = train_df['paperAbstract']
    train_df = train_df[abstracts.notna() & (abstracts != '')]

    if max_rows is not None:
        train_df = train_df.head(max_rows)

    # remove any that aren't of language lang
    print('Only keeping {} language titles'.format(lang))
    # An index-aligned boolean Series is required here: with a plain list an
    # empty mask would be read as "select no columns" instead of "no rows".
    keep = pd.Series(
        [_title_is_language(title, lang) for title in train_df['title']],
        index=train_df.index,
        dtype=bool,
    )
    train_df = train_df[keep].copy()

    print('Complete!')

    return train_df

In [8]:
# Load the papers from the first corpus file found under data/papers/.
df = get_train_df(dir_path='data/papers/', limit=1)

Found 41 files. Reading 1.
Reading data/papers/s2-corpus-002.gz
<gzip _io.BufferedReader name='data/papers/s2-corpus-002.gz' 0x11c293b90>
read in 102. entities
Creating training DataFrame
Removing null abstracts
Only keeping en language titles
Complete!


In [9]:
# Mark rows without entities as missing (NaN). Every row yields exactly one
# value, so the new column always has the frame's length and rows that do
# have entities keep them (a filtering comprehension would produce a shorter
# list and fail with a length mismatch as soon as one row has entities).
df["entities"] = [np.nan if isinstance(d, list) and not d else d for d in df["entities"]]

In [10]:
# Display the journal name recorded for each remaining paper.
df['journalName']

1                                                      
3                                                      
5                                                      
7                                                      
8                                                      
9                                                      
10              Prikladnaia biokhimiia i mikrobiologiia
13                                                     
14                                                     
15                                                     
19        2017 Global Internet of Things Summit (GIoTS)
22                              Problemy endokrinologii
23                                     The AAPS Journal
28                                                     
31    The Journal of pharmacology and experimental t...
Name: journalName, dtype: object

In [11]:
# Narrow the frame to the five columns listed here. The explicit copy makes
# `df` an independent frame, so assigning to its columns afterwards does not
# trigger pandas' SettingWithCopyWarning.
df = df[['entities', 'id', 'authors', 'paperAbstract', 'title']].copy()

In [12]:
# Give every row the same placeholder label list (a fresh list per row).
df['entities'] = [['b', 'a'] for _ in range(len(df))]

In [13]:
def save_fast_text_format(df, filename):
    """Write one fastText training example per row of ``df`` to ``filename``.

    Each output line consists of the row's labels, every one prefixed with
    ``__label__`` and separated by single spaces, followed by the abstract.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``entities`` column (an iterable of labels per row; ``None``
        or NaN means "no labels") and a ``paperAbstract`` column. The frame
        is only read, never modified, so calling the function twice on the
        same frame writes the same file twice.
    filename : str
        Destination path; an existing file is overwritten. The file is
        written as UTF-8 regardless of the platform default.
    """
    def _is_missing(value):
        # NaN is the only float that differs from itself
        return value is None or (isinstance(value, float) and value != value)

    def _label_tokens(labels):
        if _is_missing(labels):
            return []
        if isinstance(labels, str):
            # a bare string is one label, not a sequence of characters
            labels = [labels]
        tokens = []
        for label in labels:
            # Labels are whitespace-delimited tokens in the output line, so
            # whitespace inside a label is replaced by underscores.
            name = "_".join(str(label).split())
            if name:
                tokens.append("__label__{}".format(name))
        return tokens

    with open(filename, 'w', encoding='utf-8') as f:
        for labels, abstract in zip(df['entities'], df['paperAbstract']):
            # One example per line: line breaks inside the text become spaces.
            text = '' if _is_missing(abstract) else " ".join(str(abstract).splitlines())
            f.write("%s\n" % " ".join(_label_tokens(labels) + [text]))

In [16]:
# Export the current frame as fastText training data.
save_fast_text_format(df, filename='fast_text.txt')

In [17]:
# d.to_csv('save2.csv', index=False, columns=['ft'], header=False, mode='a')
#df.to_csv(r'c:\data\pandas.txt', header=None, index=None, sep=' ', mode='a')


In [19]:
# Persist the current frame (index included) as a CSV snapshot.
df.to_csv(path_or_buf='sample_data.csv')

In [21]:
# Show the notebook's working directory without shelling out, so the cell
# also works on systems that have no `pwd` executable.
os.getcwd()

/Users/naziahsiddique/Documents/assessment/2019/Summer


In [220]:
# Write every element of `y` to 'your_file.txt', one element per line.
# NOTE(review): `y` is not assigned in any of the visible cells - confirm
# where it comes from before relying on this cell.
with open('your_file.txt', 'w') as f:
    f.writelines("%s\n" % item for item in y)

In [None]:
# Display the current state of the working frame.
df

In [47]:
import stemmer
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
import gensim
   

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then Porter-stem the lemma.

    Relies on the module-level stemmer ``ps``, which is looked up when the
    function is called and therefore has to exist by then.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return ps.stem(lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Tokenize ``text`` and return the normalized form of its content words.

    Tokens come from ``gensim.utils.simple_preprocess``; gensim stop words
    and tokens of three characters or fewer are discarded, and every
    remaining token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]


# Shared Porter stemmer; lemmatize_stemming() reads this name at call time.
ps = PorterStemmer()

# Quick check of the stemmer on a handful of related word forms.
words = ["program", "programs", "programer", "programing", "programers"]

for w in words:
    stemmed = ps.stem(w)
    print(w, " : ", stemmed)
    
    
# Sample abstract used as the single training document further down.
# The text is written as adjacent string literals inside the call's
# parentheses: Python joins them into one string, whereas a quoted literal
# that continues on the next physical line is a syntax error.
processed_docs = preprocess(
    "Perhaps Skinner was a flamboyant cheerleader , as Leith suggested. "
    "His work, however, did have profound effects on educational thought in a number of ways related to the general concept of programmed learning. "
    "Attention was focussed first on the objectives of instruction in a general sense (Bloom et al., 1956), and then on the specific objectives of the behavioural objectives movement (Mager, 1962). "
    "The idea of mastery learning, popular in the 1930's but failing because of lack of a technology to sustain it (Block, 1971, p.4), was also resurrected as a corollary of programmed instruction, and with it evolved an interest in learning hierarchies and the structure of knowledge (Gagné and Paradise, 1961). "
    "These, in turn, have been influential in the development of systems of learning and instruction (Keller, 1968; Bloom, 1968) which have achieved the improvements in standards of learning which Skinner had hoped to bring about by his technological revolution. "
    "Here the emphasis has turned from technology in education, with its emphasis on hardware, to the technology of education. "
    "The 'Taxonomy of Educational Objectives: Handbook 1: The Cognitive Domain' (1956) arrived at a time when the demand for improvements in the efficiency of education was being made by Skinner and his colleagues. "
    "The idea for such a taxonomy came during a meeting of examiners attending the 1948 American Psychological Association Convention. "
    "Bloom and thirty three colleagues met over a five year period to discuss their taxonomy, which was then organised and written by a select committee of five members. "
    "The taxonomy was to provide a 'theoretical framework which could be used to facilitate communication among examiners.' "
    "It is intended to provide for classification of the goals of our educational system. "
    "It is expected to be of general help to all teachers, administrators, professional specialists , and research workers who deal with curricular and evaluation problems. "
    "It is especially intended to help them discuss these problems with greater precision. "
    "(Bloom et al., 1956, p.1) It was hoped that such ambiguous terms as 'really understand', 'internalize knowledge' and 'grasp the core or essence', would be redefined as a set of standard classifications, making exchange of information about curricular development and evaluation more precise. "
    "Equally important, the psychological relationships within the taxonomy were seen as forming a basis for psychological investigations to shed light on changes in the learner's behaviour. "
    "This aspect of the taxonomy has been pursued in various form"
)

# Second sample abstract.
# NOTE(review): `p` is not read by any of the visible cells - confirm whether
# it was meant to be part of the training corpus.
p = preprocess(
    "Abstract A neutron source having an average energy of 0.532 keV has been established using a SbBe source in a polythene moderator and a B 4 C absorber assembly. "
    "The source strength ratio of the moderated to the bare source has been accurately measured using a precision long counter and agrees well with the theoretical value obtained by a Monte Carlo code."
)

# Build the vocabulary and the bag-of-words corpus from the single
# preprocessed document.
dictionary = gensim.corpora.Dictionary([processed_docs])
bow_corpus = [dictionary.doc2bow(doc) for doc in [processed_docs]]

# Train the topic model. LDA starts from a random initialisation, so the seed
# is pinned to make repeated runs of this cell comparable.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word=dictionary,
                                       passes=10,
                                       workers=2,
                                       random_state=42)

# For each topic, show the words occurring in that topic and their relative weight.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print("Topic: {} \nWords: {}".format(idx, topic), "\n", sep="\n")
    
# Score the `unseen` text against the trained topics, best match first.
# NOTE(review): `unseen` is assigned in a cell that appears further down in
# this notebook - confirm the intended execution order.
bow_vector = dictionary.doc2bow(preprocess(unseen))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: tup[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

ModuleNotFoundError: No module named 'stemmer'

program  :  program
programs  :  program
programer  :  program
programing  :  program
programers  :  program


In [80]:
# Convert the `unseen` text to a bag-of-words vector and list the topics it
# matches, ordered from the highest score to the lowest.
bow_vector = dictionary.doc2bow(preprocess(unseen))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: tup[1], reverse=True):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8541603684425354	 Topic: 0.031*"educ" + 0.031*"taxonomi" + 0.026*"learn" + 0.021*"bloom" + 0.021*"object"
Score: 0.02083423361182213	 Topic: 0.008*"taxonomi" + 0.008*"educ" + 0.008*"technolog" + 0.008*"learn" + 0.008*"bloom"
Score: 0.02083423361182213	 Topic: 0.008*"taxonomi" + 0.008*"object" + 0.008*"learn" + 0.008*"educ" + 0.008*"technolog"
Score: 0.02083422988653183	 Topic: 0.008*"educ" + 0.008*"learn" + 0.008*"taxonomi" + 0.008*"bloom" + 0.008*"gener"
Score: 0.02083422988653183	 Topic: 0.008*"taxonomi" + 0.008*"educ" + 0.008*"learn" + 0.008*"technolog" + 0.008*"object"
Score: 0.02083422802388668	 Topic: 0.008*"taxonomi" + 0.008*"educ" + 0.008*"learn" + 0.008*"instruct" + 0.008*"object"
Score: 0.02083422616124153	 Topic: 0.008*"taxonomi" + 0.008*"bloom" + 0.008*"learn" + 0.008*"technolog" + 0.008*"educ"
Score: 0.02083422616124153	 Topic: 0.008*"learn" + 0.008*"taxonomi" + 0.008*"educ" + 0.008*"popular" + 0.008*"paradis"


In [79]:
# Text that is scored against the topic model. It starts with two
# fastText-style `__label__` tokens followed by the abstract itself.
unseen = (
    "__label__b __label__a "
    "A selective (inhibitory and stimulatory) effect of metal salts on secondary growth of representatives of 5 genera of actinomycetes and 5 genera of fungi was established. "
    "The effect of metal salts on the growth of procaryotes and eucaryotes have both common and specific features. "
    "Differences in the biosorption of metals (Mo, Mn, Cu, Zn) by cells of primary and secondary growth were revealed. "
    "The possibility of enhancing the sorption capacities of cells in secondary colonies of mycelial microorganisms should be taken into account as selecting cultures for the purification of environment from heavy metals."
)

In [15]:
# Display the working frame as it stands after the preceding cells.
df

Unnamed: 0,entities,id,authors,paperAbstract,title
1,"[b, a]",940a0a673ea80522074bd2e37e89b674659d26fd,"[{'name': 'Zeno Atherton', 'ids': ['91488677']...",Abstract The synthetic utility of bis(trimethy...,"Rigid rod σ-acetylide complexes of iron, ruthe..."
3,"[b, a]",a090b5e520ea791acffd57706771bdc0f7ad0428,[],"Perhaps Skinner was a flamboyant cheerleader ,...",Bloom's Taxonomy of Objectives Psychology of L...
5,"[b, a]",2397ef2fcfac4e4a1697e9d29164f2dd022afd73,"[{'name': 'Xu Dong', 'ids': ['145688496']}]",Based on the current situation that lots of ed...,Establishment and Significance of Comparative ...
7,"[b, a]",58af723ae93ee59f918c5b1f76f3090fdc47cac8,"[{'name': 'Seong Heon Kim', 'ids': ['49899775'...",The gap states of the molybdenum-oxide (MoO x ...,Dark current reduction of small molecule organ...
8,"[b, a]",809c5a7f423502c85cf19386c99800350c728df9,"[{'name': 'Maria Hedefalk', 'ids': ['101247739...",The focus of this article is to explore presch...,Teaching for action competence
9,"[b, a]",350ca45195a022835a2f995699da004d4fa1731d,"[{'name': 'Sunita Kamboj', 'ids': ['50786328']...",Abstract A neutron source having an average en...,An intermediate energy neutron source
10,"[b, a]",4b50f4617c3e5d38c56ac728374cc522c90bffc4,"[{'name': 'N. A. Aitkhozhina', 'ids': ['136775...",A selective (inhibitory and stimulatory) effec...,[The stimulation and inhibition of the seconda...
13,"[b, a]",96ae4a0f5579e29a8ba6b2c677cfeaff8622fec3,"[{'name': 'Ganapathy Sambandamurthy', 'ids': [...",Current-voltage (I-V) characteristics of quenc...,Vortex dynamics and upper critical fields in u...
14,"[b, a]",2b4ca23ac7b4b3f6d463d47d089f2c7f38a78dd2,"[{'name': 'Julien Cadot', 'ids': ['115193372']...",Cross and Buccola (2004) established that if t...,Cooperative Strategy and Liquidation in the Bo...
15,"[b, a]",bff29a45efcf98749e99462fb7fe7d286abe307c,"[{'name': 'Henry M. Wood', 'ids': ['37874609']}]",Since the advent of next-generation sequencing...,Applications of Very Low-Coverage Sequencing i...
