In [63]:
import wikipedia
import json

In [64]:
p = wikipedia.page("Transport Planning")
print(p.url)
print(p.title)
content = p.content # Content of page.

https://en.wikipedia.org/wiki/Transportation_planning
Transportation planning


In [71]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [72]:
SAGE_FIELDS_FILE = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/sage_research_fields.json'
SAGE_WIKI_DIR = '/home/urwa/Documents/Coleridge/notebooks/SubsetAnalysis/fieldsWiki/'
TEXT_DIRECTORY = '/home/urwa/Documents/Coleridge/ProjectFiles/train_test/train_test/files/text/'

In [73]:
text = load_doc(SAGE_FIELDS_FILE)
sage_fields_json = json.loads(text)
fields = list(sage_fields_json.keys())

In [85]:
subFields = []
containerField = []
for i in range(len(fields)):
    f = fields[i]
    sf = sage_fields_json[f].keys()
    subFields += sf
    containerField += [fields[i]] * len(sf)

In [77]:
len(containerField)

150

In [86]:
subFields = [s.replace('/',' ') for s in subFields]
subFields

['Ecology & Conservation',
 'Physical Geography',
 'Meteorology',
 'Environmental Chemistry, Substances & Processes',
 'Human Geography',
 'Planning',
 'General Geography, Earth & Environmental Science',
 'Environmental Sciences',
 'Environmental Policy & Law',
 'Environmental Health',
 'Research Methods for Geography, Earth & Environmental Science',
 'General Sociology',
 'Sociology of Education',
 'Social Research',
 'Social Policy',
 'Social Change & Transformation',
 'Sociology of Work & Labor Studies',
 'Sociology of Religion',
 'Area Studies',
 'Gender & Sexuality',
 'Social Sociological Theory',
 'Environmental, Urban & Regional Sociology',
 'Sociology of Sport & Leisure',
 'Sociology of Family',
 'Age & the Life Course',
 'Anthropology',
 'Sociology of Organizations, Institutions & Structure',
 'Political Sociology',
 'Race, Ethnicity & Migration',
 'Crime & Deviance',
 'Sociology of Arts & Culture',
 'Sociology of Health, Illness & the Body',
 'Social Interaction & Everyday Li

In [124]:
missing = []
for i in range(len(subFields)):
    sf = subFields[i]
    try:
        p = wikipedia.page(sf)
    except wikipedia.exceptions.DisambiguationError as e:
        print ('Options: ',e.options)
        p = wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError as e:
        missing.append(sf)
        continue
    data = p.content
    cleanedsf = " ".join(clean_doc(sf))
    cleanedcf = " ".join(clean_doc(containerField[i]))
    data += " " + ' '.join([cleanedcf] * 30)
    data += " " + ' '.join([cleanedsf] * 60)
    
    filename = sf+'.txt'
    file = open(SAGE_WIKI_DIR+filename, 'w')
    file.write(data)
    file.close()
    print(sf," : ",p.title)

Options:  ['conservation biology', 'Ecology and Society']
Ecology & Conservation  :  Conservation biology
Physical Geography  :  Physical geography
Meteorology  :  Meteorology
Environmental Chemistry, Substances & Processes  :  Environmental chemistry
Human Geography  :  Human geography
Planning  :  Planning
General Geography, Earth & Environmental Science  :  Earth science
Environmental Sciences  :  Environmental science
Environmental Policy & Law  :  Environmental policy
Environmental Health  :  Environmental health
Research Methods for Geography, Earth & Environmental Science  :  Earth science
General Sociology  :  Sociology
Sociology of Education  :  Sociology of education
Social Research  :  Social research
Social Policy  :  Social policy
Social Change & Transformation  :  Social change
Sociology of Work & Labor Studies  :  Sociology
Sociology of Religion  :  Sociology of religion
Area Studies  :  Area studies
Gender & Sexuality  :  Gender studies
Social Sociological Theory  :  So

In [84]:
[]+[subFields[0]]*5

['Ecology & Conservation',
 'Ecology & Conservation',
 'Ecology & Conservation',
 'Ecology & Conservation',
 'Ecology & Conservation']

In [125]:
missing

['Research Methods & Data Analysis in Psychology',
 'Counseling Setting   Client Groups',
 'Teaching Diverse Students',
 'Teacher Assistants Support',
 'Preservice Training']

In [107]:
sage_fields_json[fields[0]]['Ecology & Conservation']

KeyError: 'Nursing'

In [126]:
files = ['315.txt',
 '1149.txt',
 '1492.txt',
 '1579.txt',
 '1737.txt',
 '1905.txt',
 '2009.txt',
 '2327.txt',
 '2573.txt',
 '2970.txt',
 '3343.txt',
 '3579.txt',
 '4014.txt',
 '5148.txt',
 '5400.txt']

In [127]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
import string
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import random
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [128]:
# turn a doc into clean tokens
def clean_doc(doc):
    #
    doc = doc.replace('(general)','')
    
    # split into tokens by white space
    tokens = doc.split()
    
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    
    # make lower case
    tokens = [word.lower() for word in tokens]
    
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    
    # filter special words
    special_words = ['fieldaltlabel','fieldid','fieldlabel']
    tokens = [w for w in tokens if not w in special_words]
    
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [129]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	# load doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# update counts
	vocab.update(tokens)

In [130]:
# save list to file
def save_list(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [131]:
sageFiles = os.listdir(SAGE_WIKI_DIR)
fields = [f.split('.')[0] for f in sageFiles]

In [132]:
save_list(fields, 'sage_fields_wiki.txt')

In [133]:
# define vocab
vocab = Counter()
for f in sageFiles:
    add_doc_to_vocab(SAGE_WIKI_DIR+f, vocab)
    
tokens = [k for k,c in vocab.items()]
save_list(tokens, 'sage_fields_wiki_vocab.txt')

In [134]:
# load doc, clean and return line of tokens
def field_to_line(file, vocab):
	# load the doc
	doc = load_doc(SAGE_WIKI_DIR+file)
	# clean doc
	tokens = clean_doc(doc)
	# filter by vocab
	tokens = [w for w in tokens if w in vocab]
	return ' '.join(tokens)

In [135]:
fieldLines = []
for f in sageFiles:
    fieldLines.append(field_to_line(f, vocab))
save_list(fieldLines, 'wiki_fields_lines.txt')

In [136]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	# load the doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# filter by vocab
	tokens = [w for w in tokens if w in vocab]
	return ' '.join(tokens)

In [137]:
# load all docs in a directory
def process_docs_bow(files, vocab):
    lines = list()
    for f in files:
        path = TEXT_DIRECTORY+f
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines

In [138]:
# prepare negative reviews
processed_lines = process_docs_bow(files, vocab)
save_list(processed_lines, 'wiki_sagefields_lines.txt')

In [139]:
def getFieldsSimMatrix(field_lines,pub_lines):
    # create the tokenizer
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    # fit the tokenizer on the documents
    fieldTokenizer = vectorizer.fit(field_lines)
    # encode training data set
    fieldsNgram = fieldTokenizer.transform(field_lines)
    #print(fieldsNgram.shape)
    # encode training data set
    pubNgram = fieldTokenizer.transform(pub_lines)
    #print(doc_bow.shape)
    data_sim = cosine_similarity(pubNgram,fieldsNgram,dense_output=True)
    #print(data_sim.shape)
    #group_sim_scores = min_max_scale(data_sim.reshape(data_sim.shape[1]))
    return data_sim

In [140]:
sim = getFieldsSimMatrix(fieldLines,processed_lines)
fieldLabels = [fields[np.argmax(c_s)] for c_s in sim]

In [141]:
fieldLabels

['Community Medicine & Health Care',
 'Community Medicine & Health Care',
 'Economics',
 'Race, Ethnicity & Migration',
 'Nursing',
 'Race, Ethnicity & Migration',
 'Diseases & Epidemiology',
 'Community Medicine & Health Care',
 'American Government & Politics',
 'Diseases & Epidemiology',
 'Criminal Justice',
 'Research Methods for Criminology & Criminal Justice',
 'Social Psychology',
 'Diseases & Epidemiology',
 'Special & Inclusive Education']

In [105]:
['Education : Research Methods for the Social Sciences', 'Health & Social Care : Physician Assistant', 'Politics & International Relations : Research Methods for Politics & International Relations', 'Psychology : Research Methods & Data Analysis in Psychology', 'Health & Social Care : Nursing', 'Sociology : Sociology of Religion', 'Education : Research Methods for the Social Sciences', 'Health & Social Care : Public Health', 'Education : Research Methods for Education', 'Health & Social Care : Diseases & Epidemiology', 'Criminology & Criminal Justice : General Criminology & Criminal Justice', 'Psychology : Research Methods & Data Analysis in Psychology', 'Psychology : Research Methods & Data Analysis in Psychology', 'Sociology : Area Studies', 'Education : School Counseling']

['Education : Research Methods for the Social Sciences',
 'Health & Social Care : Physician Assistant',
 'Politics & International Relations : Research Methods for Politics & International Relations',
 'Psychology : Research Methods & Data Analysis in Psychology',
 'Health & Social Care : Nursing',
 'Sociology : Sociology of Religion',
 'Education : Research Methods for the Social Sciences',
 'Health & Social Care : Public Health',
 'Education : Research Methods for Education',
 'Health & Social Care : Diseases & Epidemiology',
 'Criminology & Criminal Justice : General Criminology & Criminal Justice',
 'Psychology : Research Methods & Data Analysis in Psychology',
 'Psychology : Research Methods & Data Analysis in Psychology',
 'Sociology : Area Studies',
 'Education : School Counseling']

In [62]:
files

['315.txt',
 '1149.txt',
 '1492.txt',
 '1579.txt',
 '1737.txt',
 '1905.txt',
 '2009.txt',
 '2327.txt',
 '2573.txt',
 '2970.txt',
 '3343.txt',
 '3579.txt',
 '4014.txt',
 '5148.txt',
 '5400.txt']