### Build a Classifier
1. Read a json and links on each page
2. Whenever we see a null link we check the text and use it to identify our label
3. Collect links for these pages to build the feature vector 


In [5]:
# Required folder structure
#Folder "Data" should contain all json files!
#create a folder with name  - "LABEL_MAP"!
#create 2 sub folders inside LABEL MAP - 1. TRAIN !
#                                        2. TEST !

# create folder with name - "FEATURES"!
#create 2 sub folders inside FEATURES also - 1. TRAIN !
#                                            2. TEST !

#create "TEST_FOLDER" !

In [1]:
import os
import glob
import json
import shutil
import pickle
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from itertools import product
from sklearn.svm import LinearSVC
from collections import OrderedDict
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report

### Create Entities

In [2]:
def get_entity_list(file):
    entities =[]
    all_texts =[]
    links=[]
    with open(file) as fp:    
        for l in fp:
            if l != '':
                tb_tokens = [l1.strip() for l1 in l.split('\t')]
                tokens = [l1.strip() for l1 in l.split('/')]
                link = ("/".join(tokens[-2:]))
                links.append(link)
                text = [t.strip() for t in tb_tokens[0].split()]
                replace_dot = [t.replace('.','') for t in text[-2:]]
                name_bigrams = tuple(replace_dot)
                if len(name_bigrams) == 2:
                    all_texts.append(name_bigrams) 
                    
    links_text = [e.split('/')[1] for e in links if e != '']
    text_to_link = [l for l in zip(links_text,all_texts)]
    links = [e for e in set(links) if e != '']
    entities = [(l.split('/'))[1] for l in links]
    return entities, all_texts,text_to_link

In [3]:
# def create_folders(top_folder_path, entities):
#     for e in entities:
#         entity_path = top_folder_path +os.sep+ e
#         if not os.path.exists(entity_path):
#             os.makedirs(entity_path)

In [4]:
### update this folder name!

folder_name = "Supplementary_data"
entities,all_texts,text_to_link = get_entity_list(folder_name + os.sep +'161111_chicago schools.txt')
all_texts = all_texts[:-4]
text_to_link = text_to_link[:-4]
print ("Total entities : ",len(entities))
print ("Total names for matching texts : ",len(all_texts))
print ("Total text to link mapping : ",len(text_to_link))


Total entities :  41
Total names for matching texts :  56
Total text to link mapping :  56


In [5]:
dtext={}
for t in text_to_link:
    dtext[t[1]] = t[0]
print ("Items in dictionary for mapping text to enitites : ", len(dtext))

Items in dictionary for mapping text to enitites :  56


In [6]:
entities

['chicago_school_of_medicine',
 'Chicago_Architecture_Foundation',
 'chicago_school_of_sewing',
 'Chicago_school_(mathematical_analysis)',
 'chicago_school_of_law',
 'chicago_school_of_expression',
 'chicago_school_of_pharmacy',
 'chicago_school_of_psychology',
 'chicago_school_of_arms',
 'first_Chicago_school',
 'chicago_school_of_fiction',
 'chicago_school_of_normal_and_applied_art',
 'chicago_school_of_science',
 'chicago_school_of_osteopaty',
 'chicago_school_of_psycho-physiology',
 'old_Chicago_school',
 'chicago_school_of_design',
 'chicago_school_of_hebrew',
 'chicago_school_of_cutting',
 'chicago_school_of_instruction',
 'chicago_school_of_education',
 'chicago_school_of_dentistry',
 'chicago_school_of_anatomy',
 'chicago_school_of_sanitary instruction',
 'chicago_school_of_applied_and_normal_art',
 'chicago_school_at_arms',
 'second_Chicago_school',
 'chicago_school_of_thought',
 'chicago_school_of_oratory',
 'Chicago_school_(sociology)',
 'Chicago_school_(literary_criticism)'

In [7]:
text_to_link

[('Chicago_Public_Schools', ('school', 'education')),
 ('Chicago_Public_Schools', ('school', 'system')),
 ('Chicago_Public_Schools', ('chicago', 'schools')),
 ('Chicago_Public_Schools', ('school', 'inspection')),
 ('Chicago_Public_Schools', ('school', 'board')),
 ('Chicago_Public_Schools', ('chicago', 'schoolroom')),
 ('Chicago_Public_Schools', ('chicago', 'schoolma')),
 ('Chicago_Public_Schools', ('deaf', 'mutes')),
 ('Chicago_Public_Schools', ('school', 'children')),
 ('Chicago_Public_Schools', ('school', 'curriculum')),
 ('Chicago_Public_Schools', ('school', 'section')),
 ('Chicago_Public_Schools', ('school', 'house')),
 ('Chicago_Public_Schools', ('school', 'extension')),
 ('Chicago_Public_Schools', ('school', 'heating')),
 ('Chicago_Public_Schools', ('school', 'ventilation')),
 ('Chicago_school_(sociology)', ('of', 'sociology')),
 ('Chicago_school_(mathematical_analysis)', ('of', 'mathematical')),
 ('Chicago_school_(mathematical_analysis)', ('of', 'mathematics')),
 ('Chicago_schoo

In [8]:
all_texts

[('school', 'education'),
 ('school', 'system'),
 ('chicago', 'schools'),
 ('school', 'inspection'),
 ('school', 'board'),
 ('chicago', 'schoolroom'),
 ('chicago', 'schoolma'),
 ('deaf', 'mutes'),
 ('school', 'children'),
 ('school', 'curriculum'),
 ('school', 'section'),
 ('school', 'house'),
 ('school', 'extension'),
 ('school', 'heating'),
 ('school', 'ventilation'),
 ('of', 'sociology'),
 ('of', 'mathematical'),
 ('of', 'mathematics'),
 ('chicago', 'math'),
 ('of', 'criticism'),
 ('of', 'literary'),
 ('of', 'architects'),
 ('architecture', 'foundation'),
 ('of', 'design'),
 ('of', 'fiction'),
 ('normal', 'art'),
 ('lip', 'reading'),
 ('of', 'music'),
 ('of', 'osteopathy'),
 ('of', 'sewing'),
 ('of', 'art'),
 ('of', 'expression'),
 ('of', 'elocution'),
 ('of', 'oratory'),
 ('of', 'civics'),
 ('of', 'dentistry'),
 ('applied', 'art'),
 ('of', 'hebrew'),
 ('of', 'anatomy'),
 ('of', 'arms'),
 ('domestic', 'science'),
 ('of', 'science'),
 ('sanitary', 'instruction'),
 ('of', 'instruction

In [9]:
text_to_link

[('Chicago_Public_Schools', ('school', 'education')),
 ('Chicago_Public_Schools', ('school', 'system')),
 ('Chicago_Public_Schools', ('chicago', 'schools')),
 ('Chicago_Public_Schools', ('school', 'inspection')),
 ('Chicago_Public_Schools', ('school', 'board')),
 ('Chicago_Public_Schools', ('chicago', 'schoolroom')),
 ('Chicago_Public_Schools', ('chicago', 'schoolma')),
 ('Chicago_Public_Schools', ('deaf', 'mutes')),
 ('Chicago_Public_Schools', ('school', 'children')),
 ('Chicago_Public_Schools', ('school', 'curriculum')),
 ('Chicago_Public_Schools', ('school', 'section')),
 ('Chicago_Public_Schools', ('school', 'house')),
 ('Chicago_Public_Schools', ('school', 'extension')),
 ('Chicago_Public_Schools', ('school', 'heating')),
 ('Chicago_Public_Schools', ('school', 'ventilation')),
 ('Chicago_school_(sociology)', ('of', 'sociology')),
 ('Chicago_school_(mathematical_analysis)', ('of', 'mathematical')),
 ('Chicago_school_(mathematical_analysis)', ('of', 'mathematics')),
 ('Chicago_schoo

In [10]:
dtext

{('and', 'boxing'): 'chicago_school_at_arms',
 ('applied', 'art'): 'chicago_school_of_normal_and_applied_art',
 ('architecture', 'foundation'): 'Chicago_Architecture_Foundation',
 ('chicago', 'math'): 'Chicago_school_(mathematical_analysis)',
 ('chicago', 'schoolma'): 'Chicago_Public_Schools',
 ('chicago', 'schoolroom'): 'Chicago_Public_Schools',
 ('chicago', 'schools'): 'Chicago_Public_Schools',
 ('deaf', 'mutes'): 'Chicago_Public_Schools',
 ('domestic', 'science'): 'chicago_school_of_domestic_science',
 ('lip', 'reading'): 'chicago_school_of_lip_reading',
 ('normal', 'art'): 'chicago_school_of_applied_and_normal_art',
 ('of', 'anatomy'): 'chicago_school_of_anatomy',
 ('of', 'architects'): 'Chicago_school_(architecture)',
 ('of', 'arms'): 'chicago_school_of_arms',
 ('of', 'art'): 'chicago_school_of_art',
 ('of', 'civics'): 'chicago_school_of_civics',
 ('of', 'criticism'): 'Chicago_school_(literary_criticism)',
 ('of', 'cutting'): 'chicago_school_of_cutting',
 ('of', 'dentistry'): 'chi

# Do not run next 3 cells from build classifier 

### Build Classifier

In [11]:
def collect_links(f_type,file_name,dtext,all_texts):
    feature_vectors_for_book = OrderedDict()
    file_page_to_link =[]
    with open(file_name,'r') as fp:
        data = json.load(fp)
        all_pages = data['pages']
        f_name = os.path.basename(file_name)
        f_name = f_name.split('.json')[0]

        for p in all_pages:
            label = None
            page_id = p['pid']
            page_keys = p.keys()
            if 'wikifier' in page_keys:
                all_links = len(p['wikifier'])
                links_on_page =[]
                for l in range(all_links):
                    if p['wikifier'][l]['link'] != None:
                        links_on_page.append((p['wikifier'][l]['link']).split('/')[-1])
                    elif p['wikifier'][l]['link'] == None:
                        links_on_page.append(None)
                        
                if len(links_on_page) >0:
                    if None in links_on_page:
                        idx = links_on_page.index(None)
                        text = p['wikifier'][idx]['text']
                        text = [t.replace('.','').lower() for t in text.split()]
                        bigrams = [b for b in zip(text[:-1], text[1:])]
                        common_name = set(bigrams) & set(all_texts) 
                        if len(common_name)==1:
                            t = list(common_name)[0]
                            if t in dtext.keys():
                                label = dtext[t]
                                features = [l for l in links_on_page if l != None]
                                feature_vectors_for_book[page_id] = (label,features)
                                with open("LABEL_MAP" + os.sep + f_type+os.sep + "file_labels" , 'ab') as fp1:
                                    pickle.dump((f_name + "_" + page_id, label),fp1)
            elif 'wiki' in page.keys():
                all_links = len(p['wiki'])
                links_on_page =[]
                for l in range(all_links):
                    if p['wiki'][l]['link'] != None:
                        links_on_page.append((p['wiki'][l]['link']).split('/')[-1])
                    elif p['wiki'][l]['link'] == None:
                        links_on_page.append(None)
                
                if len(links_on_page)>0:
                    if None in links_on_page: 
                        idx = links_on_page.index(None)
                        text = p['wiki'][idx]['text']
                        text = [t.replace('.','').lower() for t in text.split()]
                        bigrams = [b for b in zip(text[:-1], text[1:])]
                        common_name = set(bigrams) & set(all_texts) 
                        if len(common_name)==1:
                            t = list(common_name)[0]
                            if t in dtext.keys():
                                label = dtext[t]
                                features = [l for l in links_on_page if l != None]
                                feature_vectors_for_book[page_id] = (label,features)
                                with open("LABEL_MAP" + os.sep + f_type + os.sep + "file_labels" , 'ab') as fp1:
                                    pickle.dump((f_name + "_" + page_id, label),fp1)
                        
    with open("FEATURES" + os.sep + f_type+ os.sep + "features_" + f_name,'wb') as fop:
        pickle.dump(feature_vectors_for_book,fop)

In [12]:
def collect_features_for_each_file(f_type,files,dtext,all_texts):
    for file_name in files:
        #comment this if don't want to print
        print ("Processing  : ",file_name) 
#         collect_links(f_type,file_name,dtext,all_texts)

In [14]:
#Folder "Data" should contain all json files!
# create a folder with name  - "LABEL_MAP"!
#create 2 sub folders inside LABEL MAP - 1. TRAIN !
#                                        2. TEST !

# create folder with name - "FEATURES"!
#create 2 sub folders inside FEATURES also - 1. TRAIN !
#                                            2. TEST !

source_folder = "Data"
files = [f1 for f1 in glob.glob(source_folder +os.sep + '*.json')]
    
label_file = "LABEL_MAP" +os.sep + "file_labels"
if os.path.exists(label_file):
    os.remove(label_file)
    
f_train = "TRAIN"
# collect_features_for_each_file(f_train, files,dtext,all_texts)

# Start from below again 

In [15]:
def build_feature_list(folder_name):
    featured_files = glob.glob(folder_name + os.sep + '*')
    print("Total featured files : ",len(featured_files))
    
    features=[]
    labels =[]
    for f in featured_files:
        with open(f,'rb') as fp:
            data = pickle.load(fp)
            for k,v in data.items():
                f = " ".join(x for x in v[1])
                features.append(f)
                labels.append(v[0])
    return features,labels

In [16]:
# Read feature files in train folder and build feature and true label list
feature_folder = "FEATURES/TRAIN"
features,labels = build_feature_list(feature_folder)

Total featured files :  9


In [17]:
#Build feature matrix
vec = TfidfVectorizer()
X = vec.fit_transform(features)
print ("Matrix shape : ", X.shape)
y = np.array(labels)
print ("Number of true labels : ", len(y))

Matrix shape :  (4, 43)
Number of true labels :  4


### Random Forest

In [18]:
def find_RF_comb(X,y):
    random_state = [35,40,42,50,60] 
    cols = ['avg_acc','docs','features','random_state']
    df = pd.DataFrame(columns=cols)
    combs = product(random_state)
    max_acc =0
    feats = X.shape[0]
    docs = X.shape[1]
    for i, c in enumerate(random_state):
        print ("Combination  : ",i+1)
        clf = RandomForestClassifier(random_state = c, class_weight = 'balanced')
        clf.fit(X, labels)
        acc = cross_val_score(clf,X,y,cv =5)
        avg_acc = np.mean(acc)
        print ("Accuracies :",acc)
        print ("Average accuracy : ",avg_acc)
        val = [avg_acc,docs,feats,c]
        df.loc[i] = val
        if avg_acc > max_acc:
            max_acc = avg_acc
            combi = c
    print('\nMax accuracy : ', max_acc)
    print('Best Combination : ',combi)
    df = df.sort_values(by='avg_acc', ascending=False)

    df.to_csv('resultsRF.txt', header=None, index=None, sep=' ', mode='w')
    with open('accuraciesRF.txt','a') as fp:
        fp.write('\nMax accuracy of RF: '+ str(max_acc))
        fp.write('\nCorresponding random state of RF : ' + str(combi))

    return combi

In [19]:
combi_RF = find_RF_comb(X,y)

Combination  :  1


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: 4.

## Collecting test features

In [20]:
def collect_links_test_data(f_type,file_name,dtext,all_texts):
    feature_vectors_for_book = OrderedDict()
    file_page_to_link =[]
    
    with open(file_name,'r') as fp:
        data = json.load(fp)
        all_pages = data['pages']
        f_name = os.path.basename(file_name)
        f_name = f_name.split('.json')[0]

        for p in all_pages:
            label = None
            page_id = p['pid']
            page_keys = p.keys()
            if 'wikifier' in page_keys:
                all_links = len(p['wikifier'])
                links_on_page =[]
                for l in range(all_links):
                    if p['wikifier'][l]['link'] != None:
                        links_on_page.append((p['wikifier'][l]['link']).split('/')[-1])
                    elif p['wikifier'][l]['link'] == None:
                        links_on_page.append(None)
                        
                if len(links_on_page) >0:
                    if None in links_on_page:
                        idx = links_on_page.index(None)
                        text = p['wikifier'][idx]['text']
                        text = [t.replace('.','').lower() for t in text.split()]
                        bigrams = [b for b in zip(text[:-1], text[1:])]
                        common_name = set(bigrams) & set(all_texts) 
                        if len(common_name)==0:
                            features = [l for l in links_on_page if l != None]
                            feature_vectors_for_book[page_id] = features
                            with open("LABEL_MAP" + os.sep + f_type+os.sep + "file_labels" , 'ab') as fp1:
                                pickle.dump((f_name + "_" + page_id),fp1)
            elif 'wiki' in page.keys():
                all_links = len(p['wiki'])
                links_on_page =[]
                for l in range(all_links):
                    if p['wiki'][l]['link'] != None:
                        links_on_page.append((p['wiki'][l]['link']).split('/')[-1])
                    elif p['wiki'][l]['link'] == None:
                        links_on_page.append(None)
                
                if len(links_on_page)>0:
                    if None in links_on_page: 
                        idx = links_on_page.index(None)
                        text = p['wiki'][idx]['text']
                        text = [t.replace('.','').lower() for t in text.split()]
                        bigrams = [b for b in zip(text[:-1], text[1:])]
                        common_name = set(bigrams) & set(all_texts) 
                        if len(common_name)==0:
                            features = [l for l in links_on_page if l != None]
                            feature_vectors_for_book[page_id] = features
                            with open("LABEL_MAP" + os.sep + f_type+os.sep + "file_labels" , 'ab') as fp1:
                                pickle.dump((f_name + "_" + page_id),fp1)
                                
    with open("FEATURES" + os.sep + f_type+ os.sep + "features_" + f_name,'wb') as fop:
        pickle.dump(feature_vectors_for_book,fop)

In [21]:
def build_feature_list_test_data(folder_name):
    featured_files = glob.glob(folder_name + os.sep + '*')
    print("Total featured test files : ",len(featured_files))
    
    test_features=[]
    test_feature_keys=[]
    labels =[]
    for f in featured_files:
        with open(f,'rb') as fp:
            data = pickle.load(fp)
            for k,v in data.items():
                f = " ".join(x for x in v)
                test_features.append(f)
                test_feature_keys.append(f+ "_"+k)
    return test_features,test_feature_keys

In [22]:
def copy_file(file_list,source_folder,target_folder):
    for file_name in file_list:
        source_location = source_folder + os.sep + file_name
        dest_location = target_folder + os.sep + file_name
        if (os.path.exists(dest_location)) == False:
            shutil.copy(source_location, dest_location)
        else:
            print ("file already exists")

In [23]:
source_folder = "Data"
all_files = [os.path.basename(f1) for f1 in glob.glob(source_folder +os.sep + '*.json')]
print("Total  files : ",len(all_files))

featured_folder = "FEATURES/TRAIN"
processed_files = [os.path.basename(file_name) for file_name in glob.glob(featured_folder + os.sep + '*')]
print("Total processed files : ",len(processed_files))


Total  files :  9
Total processed files :  9


In [24]:
# Collect features for test files 
f_test = "TEST"
source_folder = "Data"
# os.path.basename(f1)
all_files = [f1 for f1 in glob.glob(source_folder +os.sep + '*.json')]
print("Total files : ",len(all_files))
# all_files = all_files[1:]
for f in all_files:
    print("Processing test data file : ", f)
    collect_links_test_data(f_test,f,dtext,all_texts)

Total  files :  9
Processing test data file :  Data/1904_chi.098235568.json
Processing test data file :  Data/1914_chi.098235576.json
Processing test data file :  Data/mdp.39015062797538.json
Processing test data file :  Data/mdp.39015064582888.json
Processing test data file :  Data/nnc1.ar53666712.json
Processing test data file :  Data/uc1.$b715276.json
Processing test data file :  Data/uc1.b3819355.json
Processing test data file :  Data/uiug.30112078740336.json
Processing test data file :  Data/uva.x001197927.json


In [27]:
test_feature_folder = "FEATURES/TEST"
processed_files_test = [os.path.basename(file_name) for file_name in glob.glob(test_feature_folder + os.sep + '*')]
print("Total processed test files : ",len(processed_files_test))

Total processed test files :  9


In [28]:
# Predicting on files 

In [29]:
# Read feature vectors from test feature files
test_feature_folder = "FEATURES/TEST"
test_features_RF,test_feature_keys_RF = build_feature_list_test_data(test_feature_folder)

Total featured test files :  9


In [30]:
clf = RandomForestClassifier(random_state = combi_RF, class_weight = 'balanced')
clf.fit(X,y)
RF_X_test = vec.transform(test_features_RF)
RF_predicted = clf.predict(RF_X_test)

NameError: name 'combi_RF' is not defined

In [31]:
print (RF_predicted)

NameError: name 'RF_predicted' is not defined

In [32]:
file_prediction_map = [(i[0],i[1]) for i in zip(test_feature_keys_RF, RF_predicted)]
print (file_prediction_map)

NameError: name 'RF_predicted' is not defined

In [33]:
with open('RF_prediction','w') as fp:
    pickle.dump(file_prediction_map,fp)

NameError: name 'file_prediction_map' is not defined