In [6]:
# Importing the required libraries.
import numpy as np
import pandas as pd
import pickle, zlib
from random import sample
import scipy.cluster.hierarchy as sch
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from gensim import corpora, models

# Show graph
%matplotlib inline
import matplotlib.pyplot as plt
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array
import gensim.corpora as corpora

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# For TF-IDF keywords and scores.
def display_scores(vectorizer, tfidf_result):
    scores = zip(vectorizer.get_feature_names(),np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    tf_idf_words, tf_idf_scores = [], []
    useless_words = set(['offici','said','govern','near','irregular','special','say','ad','minist','chief','clash','bodi','local','time','work','vigil','mla','region', 'get','start','member','mahatma','congress','state','gram','depart', 'rs', 'crore', 'also', 'card', 'district', 'tuesday', 'offic', 'year', 'meet', 'day', 'would', 'peopl', 'nation', 'lakh', 'plan', 'union', 'alleg', 'provid', 'two', 'km', 'taken', 'guarante', 'take', 'complet', 'report', 'case', 'found', 'per', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'monday', 'tuesday', 'wednesday', 'thrusday', 'friday', 'saturday', 'sunday', 'issu', 'ask', 'level', 'order', 'parti', 'director', 'releas', 'bjp', 'sourc', 'cm', 'injur', 'mr', 'ramesh', 'visit', 'act', 'famili', 'secretari', 'first', 'last', 'includ', 'probe', 'direct', 'month'])
    for item in sorted_scores:
        if item[0] not in useless_words:
            tf_idf_words.append(item[0])
            tf_idf_scores.append(np.round(item[1],2))
    return tf_idf_words,tf_idf_scores

# Resolve article_ids to their corresponding titles and tf_idf_keywords.
def resolve_articles(ids, dataset):
    data = {}
    for i in dataset:
        if i[0] not in data:
            data[i[0]] = [i[1],i[3]]
    titles = []
    resolved_text = []
    for i in ids:
        titles.append(data[i][0])
        resolved_text.append(data[i][1])
    for i in range(len(resolved_text)):
        temp = ''
        for j in resolved_text[i]:
            temp+=(j+' ')
        resolved_text[i] = temp      
    vectorizer = TfidfVectorizer()
    tfidf_result = vectorizer.fit_transform(resolved_text)
    result = display_scores(vectorizer, tfidf_result)
    return titles, result[0], result[1]
        

In [5]:
import numpy as np

def intersection(lst1, lst2): 
    return list(lst1 & lst2) 

def find_unique(articles):
    n = len(articles)
    l = [[0]*n]*n
    matched_count = np.array(l,dtype=np.float32)
    ind = range(n)
    art_split = [set(i.split(' ')) for i in articles]
    art_len = [len(i.split(' ')) for i in articles]
    for i, j in zip(ind, ind[1:]):
        avg = (art_len[i] + art_len[j])/2
        cnt = len(intersection(art_split[i],art_split[j]))/avg
        matched_count[i,j] = cnt
        matched_count[j,i] = cnt
    aggr = [sum(i) for i in matched_count]
    indx = np.argsort(aggr)
    return indx       

In [8]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
def compute_coherence_values(dictionary, corpus, texts, limit=20, start=5, step=5):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model=LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [10]:
def plot_values(coherence_values,start=5,limit=20,step=5):
  x = range(start, limit, step)
  plt.plot(x, coherence_values)
  plt.xlabel("Num Topics")
  plt.ylabel("Coherence score")
  plt.legend(("coherence_values"), loc='best')
  plt.show()

In [59]:
def lda_model(docs,topics=10,start=10,limit=20,step=5,use_coherence=0):
  id2word = corpora.Dictionary(docs)# Create Corpus
  texts = docs# Term Document Frequency
  corpus = [id2word.doc2bow(text) for text in texts]# View
  if use_coherence:
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts)
    plot_values(coherence_values)
    lda = model_list[coherence_values.index(max(coherence_values))]
    fin_top = coherence_values.index(max(coherence_values))
  else:
    lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=topics)
  #lda_model.save('/content/drive/My Drive/MTP/MTP_Final/model/lda_nagri_1')
  for i in lda.show_topics():
    print(i)
  return lda,corpus#,fin_top

In [60]:
def top_docs(index,lda_model,corpus,filtered_docs,topics=10,top_num=30):
  topic_dict = {i: [] for i in range(topics)}  # Assuming you have 20 topics. 
  #Loop over all the documents to group the probability of each topic
  num_docs=len(filtered_docs)
  for docID in range(num_docs):
      topic_vector = lda_model[corpus[docID]]
      #print(docID,topic_vector)
      for topicID, prob in topic_vector:
          topic_dict[topicID].append((docID, prob))
  res_index = []
  #Then, you can sort the dictionary to find the top 20 documents:
  docs = []
  for topicID, probs in topic_dict.items():
      doc_probs = sorted(probs, key = lambda x: x[1], reverse = True)
      #print(doc_probs)
      docs_top = [dp[0] for dp in doc_probs[:top_num]]
      ind = [index[dp[0]] for dp in doc_probs[:top_num]]  
      res_index.append(ind)
      docs.append(docs_top)
  #print(docs)
  final_docs=[]
  final_ind = []
  for i in docs:
    lst = []
    for j in i:
      #print(len(lst))
      final_docs.append(filtered_docs[j])
  #print(len(final_docs))
  for i in res_index:
    for j in i:
      final_ind.append(j)
  #print(final_ind)
  return final_docs,final_ind

In [62]:
import pandas as pd
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
model_names = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']
final_res_all = []
for dataset, model in zip(datasets,model_names):
    dataset_name = dataset
    # Printing the collection name.
    collection_name = dataset[8:]
    print('\nCollection:',collection_name.capitalize())

    # Loading the dataset and the model from the drive.
    file = open('Datasets/'+dataset, 'rb')
    dataset = pickle.loads(zlib.decompress(pickle.load(file)))
    file.close()
    
    # Collecting the article_ids, and corresponding article_vectors for each class.
    temp_ids = [set() for _ in range(9)]
    temp_vectors = [[] for _ in range(9)]
    temp_datasets = [[] for _ in range(9)]
    for i in dataset:
        if i[6]=='Unemp' and i[-1]=='Slow':
            if i[0] not in temp_ids[0]:
                temp_ids[0].add(i[0])
                temp_vectors[0].append(i[2].split(' '))
        if i[6]=='Unemp' and i[-1]=='Average':
            if i[0] not in temp_ids[1]:
                temp_ids[1].add(i[0])
                temp_vectors[1].append(i[2].split(' '))
        if i[6]=='Unemp' and i[-1]=='Fast':
            if i[0] not in temp_ids[2]:
                temp_ids[2].add(i[0])
                temp_vectors[2].append(i[2].split(' '))
        if i[6]=='Agri' and i[-1]=='Slow':
            if i[0] not in temp_ids[3]:
                temp_ids[3].add(i[0])
                temp_vectors[3].append(i[2].split(' '))
        if i[6]=='Agri' and i[-1]=='Average':
            if i[0] not in temp_ids[4]:
                temp_ids[4].add(i[0])
                temp_vectors[4].append(i[2].split(' '))
        if i[6]=='Agri' and i[-1]=='Fast':
            if i[0] not in temp_ids[5]:
                temp_ids[5].add(i[0])
                temp_vectors[5].append(i[2].split(' '))
        if i[6]=='Non Agri' and i[-1]=='Slow':
            if i[0] not in temp_ids[6]:
                temp_ids[6].add(i[0])
                temp_vectors[6].append(i[2].split(' '))
        if i[6]=='Non Agri' and i[-1]=='Average':
            if i[0] not in temp_ids[7]:
                temp_ids[7].add(i[0])
                temp_vectors[7].append(i[2].split(' '))
        if i[6]=='Non Agri' and i[-1]=='Fast':
            if i[0] not in temp_ids[8]:
                temp_ids[8].add(i[0])
                temp_vectors[8].append(i[2].split(' '))
    # Finding the top titles and keywords for each class.
    names = ['unemp_avg','unemp_slow','unemp_fast','agri_avg','agri_slow','agri_fast','non_agri_avg','non_agri_slow','non_agri_fast']
    df = pd.DataFrame()
    df_ids = pd.DataFrame()
    for i in range(9):
        print(names[i].capitalize())
        result = lda_model(temp_vectors[i])
        final_res = top_docs(list(temp_ids[i]),result[0],result[1],temp_vectors[i])#,(result[2]+1)*5)
        aresult = resolve_articles(final_res[1],dataset)
        for j in aresult[0][:10]:
            print(j)
        df[names[i]+'_title'] = aresult[0][:10]
        df[names[i]+'_id']= final_res[1][:10]
        if len(final_res[1])<100:
            df_ids[names[i]+'_id'] = final_res[1][:100] + ['0' for _ in range(100-len(final_res[1]))]
        else:
            df_ids[names[i]+'_id'] = final_res[1][:100]
    
    df.to_excel('LDA/'+collection_name+'.xlsx',index=False)
    df_ids.to_excel('LDA/'+collection_name+'_ids.xlsx',index=False)


Collection: Agriculture
Unemp_slow
(0, '0.028*"the" + 0.022*"of" + 0.014*"and" + 0.012*"to" + 0.011*"a" + 0.011*"in" + 0.006*"for" + 0.005*"is" + 0.005*"was" + 0.004*"that"')
(1, '0.034*"the" + 0.019*"of" + 0.016*"in" + 0.015*"to" + 0.014*"and" + 0.006*"that" + 0.006*"a" + 0.005*"for" + 0.004*"on" + 0.004*"is"')
(2, '0.026*"the" + 0.018*"of" + 0.015*"in" + 0.014*"and" + 0.011*"to" + 0.008*"for" + 0.008*"a" + 0.004*"on" + 0.004*"at" + 0.004*"was"')
(3, '0.069*"the" + 0.026*"to" + 0.023*"of" + 0.021*"and" + 0.019*"in" + 0.018*"a" + 0.015*"for" + 0.011*"on" + 0.010*"that" + 0.008*"water"')
(4, '0.063*"the" + 0.042*"of" + 0.033*"in" + 0.029*"to" + 0.026*"and" + 0.013*"a" + 0.011*"is" + 0.010*"for" + 0.008*"has" + 0.008*"on"')
(5, '0.025*"and" + 0.008*"of" + 0.007*"the" + 0.006*"0" + 0.006*"in" + 0.005*"each," + 0.004*"over" + 0.004*"at" + 0.004*"25" + 0.004*"places"')
(6, '0.047*"the" + 0.036*"of" + 0.024*"and" + 0.023*"in" + 0.019*"to" + 0.011*"for" + 0.011*"a" + 0.009*"farmers" + 0.009*

225
MPHRC comes to the rescue of commoners
Expert blames education system for farm crisis
Search on to find dead lion's nails
Sharp rise in prices of small onions
Time to catch a ‘power’ nap
Monsoon still inactive despite rain
Protective measure
Winter’s last lap gets longer
'Stop water supply from N'sagar to Krishna delta'
No house, no car for Koneru!
Agri_fast
(0, '0.024*"the" + 0.017*"in" + 0.015*"to" + 0.015*"and" + 0.014*"of" + 0.007*"a" + 0.007*"is" + 0.005*"was" + 0.005*"are" + 0.004*"for"')
(1, '0.065*"the" + 0.034*"to" + 0.034*"of" + 0.024*"and" + 0.022*"in" + 0.014*"a" + 0.012*"for" + 0.010*"is" + 0.008*"that" + 0.008*"on"')
(2, '0.037*"the" + 0.015*"a" + 0.013*"to" + 0.013*"and" + 0.013*"of" + 0.013*"in" + 0.007*"for" + 0.006*"is" + 0.006*"was" + 0.005*"on"')
(3, '0.064*"the" + 0.035*"in" + 0.034*"of" + 0.030*"and" + 0.023*"to" + 0.017*"a" + 0.011*"on" + 0.008*"is" + 0.008*"from" + 0.007*"for"')
(4, '0.071*"the" + 0.035*"of" + 0.023*"in" + 0.019*"and" + 0.017*"a" + 0.016*"to

224
Marhoura loco unit may go on stream in 3 yrs
Mysuru-Kochi flights likely to begin after poll code is lifted
Nitish Kumar opens stadium, museum in Munger
Ekma gears up for CM Nitish Kumar’s visit this month
Shuttle flies outdoors: Badminton’s new version hopes to bring in casual fans, players into fold
4,000 junior engineers to be recruited soon: CM Nitish Kumar
New system to aid wage-earners
Applicants for MGNREGS work on the rise in cyclone-hit districts
College dog show  
Jawan suicide
Unemp_stag
(0, '0.037*"the" + 0.022*"and" + 0.022*"of" + 0.018*"in" + 0.017*"to" + 0.014*"a" + 0.008*"for" + 0.007*"be" + 0.007*"on" + 0.007*"is"')
(1, '0.063*"the" + 0.033*"of" + 0.025*"to" + 0.019*"in" + 0.018*"and" + 0.016*"a" + 0.013*"for" + 0.008*"on" + 0.007*"by" + 0.007*"is"')
(2, '0.041*"the" + 0.025*"of" + 0.023*"in" + 0.021*"and" + 0.019*"to" + 0.016*"a" + 0.009*"for" + 0.009*"that" + 0.008*"is" + 0.007*"has"')
(3, '0.025*"the" + 0.018*"of" + 0.015*"to" + 0.013*"in" + 0.011*"and" + 0.010*

170
New Solapur-Bangalore train in rail budget likely
MNREGA funds to revive two more rivers
Imported buoys to be floated in Gulf of Mannar
BJP bests Congress, JD-S in Karnataka district polls
Tribal areas undergoing socio-economic transformation in Himachal: Study
Dual taxation puts off Koreans
Maharashtra to sign MoU for development of 10 airports in the state
Parched earth, broken promises
Falling potato prices worries Idukki farmers
Pay revision for polytechnic teachers okayed
Non_agri_slow
(0, '0.066*"the" + 0.035*"of" + 0.030*"to" + 0.022*"and" + 0.020*"in" + 0.011*"for" + 0.010*"a" + 0.008*"at" + 0.008*"is" + 0.007*"on"')
(1, '0.030*"the" + 0.023*"of" + 0.019*"and" + 0.018*"in" + 0.013*"a" + 0.012*"to" + 0.010*"on" + 0.008*"for" + 0.007*"that" + 0.006*"be"')
(2, '0.013*"the" + 0.012*"to" + 0.012*"of" + 0.006*"and" + 0.005*"a" + 0.005*"pm:" + 0.005*"in" + 0.004*"on" + 0.004*"for" + 0.004*"at"')
(3, '0.036*"the" + 0.024*"to" + 0.021*"and" + 0.016*"of" + 0.015*"in" + 0.011*"a" + 0.

(0, '0.025*"the" + 0.011*"of" + 0.011*"and" + 0.009*"to" + 0.007*"in" + 0.006*"a" + 0.005*"is" + 0.004*"have" + 0.003*"for" + 0.003*"on"')
(1, '0.003*"the" + 0.002*"to" + 0.002*"in" + 0.001*"a" + 0.001*"and" + 0.001*"of" + 0.001*"by" + 0.001*"on" + 0.001*"from" + 0.001*"that"')
(2, '0.063*"the" + 0.030*"in" + 0.028*"and" + 0.026*"of" + 0.022*"a" + 0.021*"to" + 0.012*"on" + 0.009*"was" + 0.008*"for" + 0.008*"from"')
(3, '0.012*"the" + 0.011*"of" + 0.007*"to" + 0.006*"and" + 0.005*"a" + 0.005*"in" + 0.003*"is" + 0.002*"for" + 0.002*"are" + 0.002*"that"')
(4, '0.057*"the" + 0.036*"of" + 0.031*"to" + 0.022*"in" + 0.020*"a" + 0.016*"and" + 0.012*"on" + 0.010*"for" + 0.009*"is" + 0.008*"The"')
(5, '0.069*"the" + 0.038*"of" + 0.033*"and" + 0.028*"to" + 0.025*"in" + 0.015*"a" + 0.012*"for" + 0.010*"on" + 0.010*"is" + 0.007*"that"')
(6, '0.019*"the" + 0.011*"of" + 0.011*"in" + 0.007*"and" + 0.006*"to" + 0.005*"a" + 0.004*"for" + 0.003*"on" + 0.003*"be" + 0.003*"said"')
(7, '0.060*"the" + 0.035*

224
Tiger, Tiger Burning Slight
Four cops suspended in Lohit
Land grab: Two women, man get one year in jail, Rs 5,000 fine
Forest officials dismissed over tree felling
P.C. George seeks CBI probe
Idukki sanctuary to be a tourist spot soon
New evergreen tree species found in Western Ghats
State nod to road project through forest
Textile processing cluster coming up
Govt acts tough with tendu contractors
Non_agri_slow
(0, '0.007*"the" + 0.005*"and" + 0.004*"of" + 0.004*"to" + 0.003*"in" + 0.002*"a" + 0.001*"for" + 0.001*"on" + 0.001*"is" + 0.001*"The"')
(1, '0.022*"the" + 0.017*"and" + 0.011*"of" + 0.011*"to" + 0.010*"in" + 0.007*"a" + 0.005*"on" + 0.004*"for" + 0.004*"be" + 0.004*"at"')
(2, '0.034*"the" + 0.022*"to" + 0.017*"a" + 0.016*"and" + 0.014*"of" + 0.012*"in" + 0.008*"for" + 0.007*"at" + 0.006*"on" + 0.005*"from"')
(3, '0.070*"the" + 0.031*"to" + 0.030*"of" + 0.027*"and" + 0.026*"in" + 0.021*"a" + 0.012*"on" + 0.010*"was" + 0.008*"that" + 0.008*"is"')
(4, '0.020*"the" + 0.016*"a

137
Bid closes for dream home
Marda’s 276-year wait for divine siblings continues
Army officer's wife on a 450-km walk for a 'better society for girls'
Merger with NMDC: VSP unions to take fight to Delhi
Chennai floods, workers' agitation hit city's textile biz
Mystery over  death in stone quarry
Focus on rabbit farming 
Patna High Court fines AIG for initiating action against retired official
Odisha: Chief of chit fund firm Artha Tatwa gets 7-year jail
After flunking swachhta test, Punjab schools drop out this year
Unemp_fast
(0, '0.027*"the" + 0.011*"of" + 0.010*"to" + 0.009*"and" + 0.008*"a" + 0.007*"in" + 0.003*"for" + 0.003*"on" + 0.003*"has" + 0.003*"is"')
(1, '0.022*"the" + 0.020*"and" + 0.020*"to" + 0.015*"of" + 0.013*"in" + 0.011*"a" + 0.006*"is" + 0.006*"that" + 0.005*"on" + 0.004*"for"')
(2, '0.017*"the" + 0.013*"and" + 0.012*"of" + 0.010*"in" + 0.007*"to" + 0.007*"a" + 0.004*"for" + 0.004*"that" + 0.004*"is" + 0.004*"will"')
(3, '0.063*"the" + 0.033*"of" + 0.032*"to" + 0.02

215
Victims’ kin yet to be informed
Ore freeze in parched plant 
Youths to bond with the best — ham radio
Leopard carcass found in Tinsukia
Note and Jat worry for BJP
Sensitising students to sexual abuse
Power cut
Election campaigns lead to spike in diesel sales in State
Salem: focus on infrastructure
Bikaner-Coimbatore Weekly AC Express inaugural run today
Non_agri_stag
(0, '0.009*"and" + 0.009*"the" + 0.008*"to" + 0.008*"of" + 0.007*"a" + 0.006*"in" + 0.004*"has" + 0.004*"as" + 0.003*"for" + 0.003*"is"')
(1, '0.047*"the" + 0.032*"of" + 0.029*"and" + 0.027*"to" + 0.027*"in" + 0.018*"a" + 0.015*"is" + 0.011*"for" + 0.008*"are" + 0.008*"that"')
(2, '0.014*"the" + 0.009*"in" + 0.009*"of" + 0.008*"and" + 0.006*"to" + 0.005*"a" + 0.003*"is" + 0.002*"as" + 0.002*"that" + 0.002*"was"')
(3, '0.024*"the" + 0.011*"of" + 0.010*"and" + 0.009*"in" + 0.009*"to" + 0.004*"a" + 0.004*"is" + 0.004*"for" + 0.003*"on" + 0.003*"that"')
(4, '0.021*"the" + 0.011*"in" + 0.010*"of" + 0.010*"to" + 0.007*"and" 

171
Minor rioting breaks out in Bawamanpura
Chandy calls for law to ensure right to health
MSU to sensitize students on khadi
Deft caste moves give wings to BJP's 265-plus dream
UP unveils infrastructure development plan
Botched operations: National Human Rights commission seeks fresh report
‘Regular exercise must for keeping arthritis at bay’
'In view of it's religious significance, Allahabad should be renamed Prayag'
AMC seeks clarity on plastic ban
Vadodara boy wins tourism painting contest
Agri_slow
(0, '0.007*"the" + 0.006*"and" + 0.004*"to" + 0.003*"of" + 0.003*"in" + 0.002*"on" + 0.002*"for" + 0.002*"a" + 0.001*"be" + 0.001*"at"')
(1, '0.065*"the" + 0.036*"and" + 0.035*"of" + 0.027*"to" + 0.022*"in" + 0.017*"a" + 0.011*"for" + 0.011*"on" + 0.010*"is" + 0.009*"at"')
(2, '0.019*"the" + 0.010*"of" + 0.009*"in" + 0.008*"to" + 0.006*"and" + 0.006*"a" + 0.004*"for" + 0.003*"is" + 0.002*"on" + 0.002*"with"')
(3, '0.059*"the" + 0.025*"to" + 0.019*"in" + 0.018*"of" + 0.016*"and" + 0.011*

234
Rain batters Uttar Pradesh, Bihar
Free path tests at Lohia from today
Google’s biggest campus outside US in Hyderabad
Bandit returns to Chambal for film
Gangs from Iran keep cops on their toes
29.74% calls to UP100 related to personal disputes
Lakshmi Bai’s birthplace to be turned into memorial
Consequences of deviating from natural farm practices stressed
Cops arrest 2 persons, claim success in South City case
Jassie Gill: Maine socha tha Dilli tab jaunga jab logon ko pata hoga Jassie Gill kaun hai
Non_agri_fast
(0, '0.002*"AES" + 0.001*"b" + 0.001*"(Marxist)" + 0.001*"the" + 0.001*"c" + 0.000*"IMR" + 0.000*"JE" + 0.000*"to" + 0.000*"of" + 0.000*"innings:"')
(2, '0.055*"the" + 0.038*"of" + 0.034*"and" + 0.031*"to" + 0.022*"in" + 0.019*"a" + 0.014*"is" + 0.012*"for" + 0.009*"are" + 0.009*"that"')
(3, '0.030*"of" + 0.028*"and" + 0.018*"the" + 0.014*"on" + 0.013*"in" + 0.012*"for" + 0.009*"by" + 0.009*"at" + 0.007*"students" + 0.007*"to"')
(4, '0.000*"the" + 0.000*"of" + 0.000*"and" 