In [1]:
import os
import glob
import json
import shutil
import pickle
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Load the labelled pages. pickle.load can run arbitrary code, so only load
# files that come from a trusted source.
with open("files_to_labels.pkl", 'rb') as f:
    labels = pickle.load(f)
    # Layout per the original note: [(page_id, label), (page_id1, label1), ...]

In [3]:
# Load the per-page features. pickle.load can run arbitrary code, so only load
# files that come from a trusted source.
with open("files_to_features.pkl", 'rb') as fp:
    features = pickle.load(fp)
    # Layout per the original note: [(page_id, feat1), (page_id1, feat2), ...]

In [4]:
def convert_lists_to_dicts(features,labels):
    """Index two sequences of (key, value, ...) records by their first element.

    Args:
        features: iterable of records whose item 0 is the key and item 1 the
            feature payload.
        labels: iterable of records whose item 0 is the key and item 1 the
            label.

    Returns:
        A ``(feature_by_id, label_by_id)`` tuple of dicts. When a key occurs
        more than once, the value from the last record wins.
    """
    feature_by_id = {record[0]: record[1] for record in features}
    label_by_id = {record[0]: record[1] for record in labels}
    return feature_by_id, label_by_id

In [5]:
# Index the loaded feature and label lists by their first element.
feature_dict,label_dict = convert_lists_to_dicts(features,labels)

In [6]:
# Number of ids that have features vs. number of ids that have labels.
print(len(feature_dict),len(label_dict))

193675 24895


In [7]:
def map_feat_label(feat_dict, label_dict):
    """Split the feature dict into labelled and unlabelled entries.

    Args:
        feat_dict: mapping of id -> features.
        label_dict: mapping of id -> label.

    Returns:
        A tuple ``(labelled, unlabelled)`` where ``labelled`` is a list of
        ``(features, label)`` pairs for ids present in both mappings, and
        ``unlabelled`` is a list of ``(id, features)`` pairs for ids that have
        no label. Both lists follow the iteration order of ``feat_dict``.
    """
    labelled = []
    unlabelled = []
    for page_id, page_feats in feat_dict.items():
        if page_id not in label_dict:
            # No label for this id: keep the id so predictions can be traced back.
            unlabelled.append((page_id, page_feats))
            continue
        labelled.append((page_feats, label_dict[page_id]))
    return labelled, unlabelled

In [8]:
# Labelled (features, label) pairs and unlabelled (id, features) pairs.
mapping_feats_lables,test_feature_list = map_feat_label(feature_dict,label_dict)

In [9]:
# Persist the labelled (features, label) pairs.
with open('feat_label_mapping.pkl', 'wb') as fp1:
    pickle.dump(mapping_feats_lables,fp1)
    
# Persist the unlabelled (id, features) pairs.
with open('test_features.pkl', 'wb') as fp2:
    pickle.dump(test_feature_list,fp2)

In [10]:
# Reload the labelled (features, label) pairs written earlier.
with open('feat_label_mapping.pkl', 'rb') as fp3:
    feat_to_labels = pickle.load(fp3)

# Turn every record into one whitespace-joined document plus its label.
feat_list=[]
label_list =[]
for f in feat_to_labels:
    try:
        feat_list.append(" ".join(f[0]))
    except TypeError:
        # str.join raises TypeError when f[0] is not a sequence of strings.
        # Such records are handled as before by joining the sequence found
        # at f[0][1] instead. Any other error now propagates rather than
        # being silently swallowed by a bare except.
        feat_list.append(" ".join(f[0][1]))
    label_list.append(f[1])

In [11]:
# The document list and the label list must have the same length.
print(len(feat_list), len(label_list))

24887 24887


In [12]:
# Reload the unlabelled (id, features) pairs written earlier.
with open('test_features.pkl', 'rb') as fp4:
    test_list = pickle.load(fp4)

# Turn every record into one whitespace-joined document plus its id.
test_features_list=[]
test_id_list=[]
for t in test_list:
    try:
        test_features_list.append(" ".join(t[1]))
    except TypeError:
        # str.join raises TypeError when t[1] is not a sequence of strings;
        # fall back to the sequence found at t[1][1].
        # The fallback must read the current record `t`. It previously read
        # `f`, a loop variable left over from another cell, and so produced
        # text unrelated to this page.
        test_features_list.append(" ".join(t[1][1]))
    test_id_list.append(t[0])

In [13]:
# The unlabelled document list and the id list must have the same length.
print(len(test_features_list), len(test_id_list))

168788 168788


In [14]:
# Sanity check: unlabelled + labelled documents should add up to
# len(feature_dict). The sum is computed from the lists so the check stays
# valid when the input data changes.
len(test_features_list) + len(feat_list)

193675

In [15]:
# Fit the TF-IDF vocabulary on the labelled documents and vectorize them.
vec = TfidfVectorizer()
X_train = vec.fit_transform(feat_list)
# Labels as a NumPy array, in the same order as the rows of X_train.
Y_train = np.array(label_list)

In [16]:
# Training matrix shape and label count; the row count should match the label count.
print(X_train.shape)
print(len(Y_train))

(24887, 159542)
24887


In [17]:
# Vectorize the unlabelled documents with the already-fitted vectorizer
# (transform only, no refit).
X_test = vec.transform(test_features_list)

In [18]:
# Shape of the unlabelled matrix.
print(X_test.shape)

(168788, 159542)


In [19]:
# Random forest with class_weight='balanced' and a fixed random_state.
# The estimator repr printed after fitting reports n_estimators=10.
clf = RandomForestClassifier(class_weight = 'balanced', random_state = 35)

In [20]:
# 5-fold cross-validation on the labelled data; report the mean fold score.
acc = cross_val_score(clf,X_train,Y_train,cv=5)
print("Average accuracy : ", np.mean(acc))



Average accuracy :  0.766229524802


In [21]:
# Fit on the full labelled set; this fitted model is used for the predictions below.
clf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=35, verbose=0, warm_start=False)

In [22]:
# get_feature_names() no longer exists in current scikit-learn releases,
# which provide get_feature_names_out() instead. Prefer the new accessor
# when the installed version has it and fall back to the old one otherwise,
# so the cell runs on both.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
# Importance scores from the fitted forest; they are paired with
# feature_names positionally further down.
important_feats = clf.feature_importances_

In [23]:
# Pair every term with its importance and rank the pairs from most to least
# important.
feature_value_map = sorted(
    zip(feature_names, important_feats),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Most important features : \n",feature_value_map[:50])
print("\n")
print("Least important features : \n",feature_value_map[-50:])

Most important features : 
 [('chicago', 0.0055883690235963767), ('sociology', 0.0047990762269615531), ('chicago_school_', 0.0042778992659433098), ('chicago_school_of_economics', 0.0040032558643440267), ('chicago_school_of_music', 0.0029743516321472543), ('united_states', 0.0029360077598752617), ('george_stigler', 0.0028371114737791594), ('chicago_school_of_thought', 0.0027328835536785346), ('architecture', 0.0026421651747990165), ('chicago_school_of_education', 0.0025246865742262988), ('chicago_school_of_fiction', 0.0024978824721198275), ('_u', 0.0024698047118688433), ('social_work', 0.0023509573402903808), ('chicago_school_of_osteopaty', 0.0022047086567522117), ('band', 0.0021991333245796204), ('chicago_school_of_civics', 0.0021640750354588361), ('milton_friedman', 0.002150156470064992), ('massage', 0.0021450233560988893), ('black_books', 0.0020240539169649324), ('chicago_school_of_science', 0.0019558241722379237), ('chicago_school_of_design', 0.0019075918489202758), ('ge_capital_avi

In [24]:
# Predict a label for every unlabelled document and preview the first ten.
predicted = clf.predict(X_test)
print("First 10 predicted values : ", predicted[:10])

First 10 predicted values :  ['CS_UCS_Medicine' 'CS_UCS_Medicine' 'CS_UCS_Medicine' 'CS_Architecture'
 'CS_Architecture' 'CS_Architecture' 'CS_UCS_Medicine' 'CS_UCS_Medicine'
 'CS_UCS_Medicine' 'CS_UCS_Medicine']


In [25]:
# Per-class probabilities for every unlabelled document, plus the class labels
# known to the fitted classifier.
probs = clf.predict_proba(X_test)
classes = list(clf.classes_)
print(classes)

['CS_1', 'CS_AICS_Drama__The_Drama', 'CS_AICS__Drama', 'CS_Advertising', 'CS_Anatomy', 'CS_Anthropology', 'CS_Anthropology__Social_Anthropology', 'CS_Anthropology__Symbolic_Anthropologists', 'CS_Anthropology__Symbolic_Interact', 'CS_Anthropology__Urban_Anthro', 'CS_Antitrust_Analysis', 'CS_Applied_And_Normal_Arts', 'CS_Applied_Art.', 'CS_Architects', 'CS_Architecture', 'CS_Architecture_Foundation', 'CS_Architecture__1880', 'CS_Architecture__American_architecture', 'CS_Architecture__Building', 'CS_Architecture__Burnham', 'CS_Architecture__Commercial_American_Arch', 'CS_Architecture__Commercial_Architecture', 'CS_Architecture__Commercial_Buildings', 'CS_Architecture__Commercial_Skyscraper__Design__Architecture__Commercial_Architects', 'CS_Architecture__Modern_A', 'CS_Architecture__Urban_Architecture', 'CS_Art', 'CS_Assaying', 'CS_Automatic_Transmission', 'CS_Aviation', 'CS_Ballet', 'CS_Baseball_Writing', 'CS_Beauty', 'CS_Behavioris.', 'CS_Biology', 'CS_Bookkeeping', 'CS_Broadcasting', 'C

In [26]:
def get_label_prob(test_id_list,classes,probs,predicted):
    """Attach the highest class probability to every predicted label.

    Args:
        test_id_list: ids, one per prediction.
        classes: collection of accepted class labels; predictions whose label
            is not in it are skipped.
        probs: per-prediction probability rows, aligned with ``test_id_list``.
        predicted: predicted labels, aligned with ``test_id_list``.

    Returns:
        A list of ``(id, predicted_label, top_probability)`` tuples, where
        ``top_probability`` is the largest value in that prediction's row.
        Iteration stops at the shortest of the three aligned inputs.
    """
    # Build the lookup once; membership against a list is a linear scan that
    # would otherwise be repeated for every prediction.
    try:
        known_classes = set(classes)
    except TypeError:
        # Unhashable labels: keep the original container and its semantics.
        known_classes = classes

    class_to_prob = []
    for page_id, label, row in zip(test_id_list, predicted, probs):
        if label in known_classes:
            top = np.argmax(row)
            class_to_prob.append((page_id, label, row[top]))
    return class_to_prob

In [27]:
# (id, predicted label, top probability) for every unlabelled document.
label_probas = get_label_prob(test_id_list,classes,probs,predicted)

In [28]:
# Preview the first 20 (id, label, probability) tuples.
print(label_probas[:20])

[('0_coo.31924058941158_00000447', 'CS_UCS_Medicine', 0.90000000000000002), ('0_coo.31924058941158_00000471', 'CS_UCS_Medicine', 0.90000000000000002), ('0_coo.31924058941158_00000507', 'CS_UCS_Medicine', 0.90000000000000002), ('0_inu.30000011528365_00000612', 'CS_Architecture', 0.69999999999999996), ('0_inu.30000011528365_00000613', 'CS_Architecture', 0.40000000000000002), ('0_loc.ark+=13960=t23b6h241_00000095', 'CS_Architecture', 0.20000000000000001), ('0_mdp.39015000770506_00000195', 'CS_UCS_Medicine', 0.59999999999999998), ('0_mdp.39015000770506_00000196', 'CS_UCS_Medicine', 0.5), ('0_mdp.39015000770514_00000190', 'CS_UCS_Medicine', 0.69999999999999996), ('0_mdp.39015000786593_00000025', 'CS_UCS_Medicine', 0.29999999999999999), ('0_mdp.39015001525446_00000576', 'CS_Architecture', 0.69999999999999996), ('0_mdp.39015001640328_00000313', 'CS_Economic', 0.5), ('0_mdp.39015003736124_00000093', 'CS_Sociology', 0.5), ('0_mdp.39015003793919_00000683', 'CS_Architecture', 0.59999999999999998)

In [29]:
# Number of tuples produced; compare with the number of unlabelled documents.
len(label_probas)

168788

In [30]:
# Persist the (id, label, probability) tuples.
with open("label_prob_RF.pkl", 'wb') as fp7:
    pickle.dump(label_probas,fp7)

In [31]:
# (id, predicted label) pairs, in the order of test_id_list.
id_to_predicted = list(zip(test_id_list, predicted))

In [32]:
# Persist the (id, predicted label) pairs.
with open("predicted_RF.pkl", 'wb') as fp5:
    pickle.dump(id_to_predicted,fp5)

In [33]:
# Read the (id, predicted label) pairs back from disk.
with open("predicted_RF.pkl", 'rb') as fp6:
    predicted_labels = pickle.load(fp6)


In [34]:
# Preview the first 100 (id, predicted label) pairs.
predicted_labels[:100]

[('0_coo.31924058941158_00000447', 'CS_UCS_Medicine'),
 ('0_coo.31924058941158_00000471', 'CS_UCS_Medicine'),
 ('0_coo.31924058941158_00000507', 'CS_UCS_Medicine'),
 ('0_inu.30000011528365_00000612', 'CS_Architecture'),
 ('0_inu.30000011528365_00000613', 'CS_Architecture'),
 ('0_loc.ark+=13960=t23b6h241_00000095', 'CS_Architecture'),
 ('0_mdp.39015000770506_00000195', 'CS_UCS_Medicine'),
 ('0_mdp.39015000770506_00000196', 'CS_UCS_Medicine'),
 ('0_mdp.39015000770514_00000190', 'CS_UCS_Medicine'),
 ('0_mdp.39015000786593_00000025', 'CS_UCS_Medicine'),
 ('0_mdp.39015001525446_00000576', 'CS_Architecture'),
 ('0_mdp.39015001640328_00000313', 'CS_Economic'),
 ('0_mdp.39015003736124_00000093', 'CS_Sociology'),
 ('0_mdp.39015003793919_00000683', 'CS_Architecture'),
 ('0_mdp.39015004725290_00000229', 'CS_Business'),
 ('0_mdp.39015004725290_00000230', 'CS_Architecture'),
 ('0_mdp.39015004796366_00000290', 'CS_Business'),
 ('0_mdp.39015004809110_00000065', 'CS_Architecture'),
 ('0_mdp.3901500513

In [44]:
# Count how many documents were assigned to each predicted label.
# NOTE(review): this cell's execution count (44) is out of sequence with the
# cells around it; re-run the notebook top to bottom to confirm the order.
predicted_cats = [p[1] for p in predicted_labels]
Counter(predicted_cats)

Counter({'CS_1': 60,
         'CS_AICS_Drama__The_Drama': 94,
         'CS_AICS__Drama': 19,
         'CS_Advertising': 51,
         'CS_Anatomy': 28,
         'CS_Anthropology': 81,
         'CS_Anthropology__Social_Anthropology': 9,
         'CS_Anthropology__Symbolic_Anthropologists': 6,
         'CS_Anthropology__Symbolic_Interact': 72,
         'CS_Anthropology__Urban_Anthro': 14,
         'CS_Antitrust_Analysis': 775,
         'CS_Applied_And_Normal_Arts': 1039,
         'CS_Applied_Art.': 33,
         'CS_Architects': 97,
         'CS_Architecture': 24923,
         'CS_Architecture_Foundation': 474,
         'CS_Architecture__1880': 2,
         'CS_Architecture__American_architecture': 5,
         'CS_Architecture__Building': 1,
         'CS_Architecture__Burnham': 1,
         'CS_Architecture__Commercial_Architecture': 19,
         'CS_Architecture__Commercial_Skyscraper__Design__Architecture__Commercial_Architects': 1,
         'CS_Architecture__Modern_A': 1,
         'CS_Art'

In [36]:
# Read the (id, label, probability) tuples back from disk.
with open("label_prob_RF.pkl", 'rb') as fp8:
    data = pickle.load(fp8)
    
# Preview the first 100 tuples.
(data[:100])

[('0_coo.31924058941158_00000447', 'CS_UCS_Medicine', 0.90000000000000002),
 ('0_coo.31924058941158_00000471', 'CS_UCS_Medicine', 0.90000000000000002),
 ('0_coo.31924058941158_00000507', 'CS_UCS_Medicine', 0.90000000000000002),
 ('0_inu.30000011528365_00000612', 'CS_Architecture', 0.69999999999999996),
 ('0_inu.30000011528365_00000613', 'CS_Architecture', 0.40000000000000002),
 ('0_loc.ark+=13960=t23b6h241_00000095',
  'CS_Architecture',
  0.20000000000000001),
 ('0_mdp.39015000770506_00000195', 'CS_UCS_Medicine', 0.59999999999999998),
 ('0_mdp.39015000770506_00000196', 'CS_UCS_Medicine', 0.5),
 ('0_mdp.39015000770514_00000190', 'CS_UCS_Medicine', 0.69999999999999996),
 ('0_mdp.39015000786593_00000025', 'CS_UCS_Medicine', 0.29999999999999999),
 ('0_mdp.39015001525446_00000576', 'CS_Architecture', 0.69999999999999996),
 ('0_mdp.39015001640328_00000313', 'CS_Economic', 0.5),
 ('0_mdp.39015003736124_00000093', 'CS_Sociology', 0.5),
 ('0_mdp.39015003793919_00000683', 'CS_Architecture', 0.5

In [37]:
# Number of (id, label, probability) tuples loaded.
len(data)

168788

In [38]:
# Keep tuples whose probability is strictly greater than 0.2; entries at
# exactly 0.2 are excluded by the `>` comparison.
thresholded_prob = [x for x in data if x[2]>0.2]
print(len(thresholded_prob))

140916


In [39]:
# Load label_prob_RF.pkl again, this time into `all_pbs` (the same file was
# already loaded into `data` above).
with open("label_prob_RF.pkl", 'rb') as fp7:
    all_pbs = pickle.load(fp7)

In [40]:
# Preview the first 50 (id, label, probability) tuples.
all_pbs[:50]

[('0_coo.31924058941158_00000447', 'CS_UCS_Medicine', 0.90000000000000002),
 ('0_coo.31924058941158_00000471', 'CS_UCS_Medicine', 0.90000000000000002),
 ('0_coo.31924058941158_00000507', 'CS_UCS_Medicine', 0.90000000000000002),
 ('0_inu.30000011528365_00000612', 'CS_Architecture', 0.69999999999999996),
 ('0_inu.30000011528365_00000613', 'CS_Architecture', 0.40000000000000002),
 ('0_loc.ark+=13960=t23b6h241_00000095',
  'CS_Architecture',
  0.20000000000000001),
 ('0_mdp.39015000770506_00000195', 'CS_UCS_Medicine', 0.59999999999999998),
 ('0_mdp.39015000770506_00000196', 'CS_UCS_Medicine', 0.5),
 ('0_mdp.39015000770514_00000190', 'CS_UCS_Medicine', 0.69999999999999996),
 ('0_mdp.39015000786593_00000025', 'CS_UCS_Medicine', 0.29999999999999999),
 ('0_mdp.39015001525446_00000576', 'CS_Architecture', 0.69999999999999996),
 ('0_mdp.39015001640328_00000313', 'CS_Economic', 0.5),
 ('0_mdp.39015003736124_00000093', 'CS_Sociology', 0.5),
 ('0_mdp.39015003793919_00000683', 'CS_Architecture', 0.5

In [41]:
# Extract the probability column and show its largest and smallest values.
all_ps = [x[2] for x in all_pbs]
print(max(all_ps))
print(min(all_ps))

1.0
0.1


In [42]:
# Count the tuples whose probability equals 1.
all_ones = [x for x in all_pbs if x[2] == 1]
print(len(all_ones))

2728


In [43]:
# Count the tuples whose probability equals 0.1, the minimum printed above.
# NOTE(review): this is an exact float comparison; confirm it is intended
# rather than a tolerance-based check.
all_smalls = [x for x in all_pbs if x[2] == .1]
print(len(all_smalls))

1935
