# Overview Classifier 

Used supervised machine learning to train a classifier based on the heuristic classes, then apply that classifier to the Overview topic. 

In [7]:
import csv

import numpy as np 

## Document Loading 

In [2]:
doc_lookup = {}

with open("data/lit-review-doc-metadata.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        doc_lookup[row["Document Title"]] = row

print(len(doc_lookup.keys()))

9734


In [4]:
def load_documents(path="data/lit-review-categories.csv", ignore=()):
    not_found = 0
    with open(path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            if row["Document Title"] in doc_lookup:
                doc_lookup[row["Document Title"]]["Label"] = row["Domain"]
            else:
                not_found += 1

    print("Missing docs: {}".format(not_found))
    
    docs = [d for d in doc_lookup.values() if "Label" in d and d["Label"] not in ignore] 
    print("Filtered docs: {}".format(len(docs)))

    return [d["Abstract"] for d in docs], [d["Label"] for d in docs]


IGNORE = [
    "Exclude",
    "Overview",
    "PMU Placement",
    "PMU",
    "PDC",
    "Unknown",
    "Undefined",
    "Control",
    "Communications",
    "Phasor Estimation",
    "Simulation",
    "Islanding",
    "Time Synchronization"
    "Testing",
    "Real Time Operations",
    "Tools",
    "EMS",
    "Standards",
    "",
    "Phasor estimation",
    "Protection Systems"
]

X, y = load_documents(ignore=IGNORE)

Missing docs: 296
Filtered docs: 3872


## Modeling and Cross-Validation

In [16]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline 
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

from sklearn.model_selection import cross_val_score

no_features = 750

tf_vec = CountVectorizer(
    max_df=0.8, min_df=2, max_features=no_features, stop_words='english', ngram_range=(1,3)
)

tfidf_vec = TfidfVectorizer(**tf_vec.get_params())


def make_pipeline(model, idf=False):
    vec = tfidf_vec if idf else tf_vec 
    return Pipeline([
        ('vec', clone(vec)),
        ('clf', model), 
    ])

In [18]:
clf = make_pipeline(MultinomialNB())
cross_val_score(clf, X, y).mean()



0.5971256643774617

In [19]:
clf = make_pipeline(MultinomialNB(), idf=True)
cross_val_score(clf, X, y).mean()



0.5070241428417664

In [20]:
from sklearn.ensemble import GradientBoostingClassifier 

clf = make_pipeline(GradientBoostingClassifier(), idf=True)
cross_val_score(clf, X, y).mean()



0.5890884733759267

In [24]:
from sklearn.ensemble import GradientBoostingClassifier 

clf = make_pipeline(GradientBoostingClassifier(), idf=False)
cross_val_score(clf, X, y).mean()



0.6024274548186485

In [22]:
from sklearn.linear_model import LogisticRegression

clf = make_pipeline(LogisticRegression(), idf=True)
cross_val_score(clf, X, y).mean()



0.610567577220215

In [23]:
from sklearn.linear_model import LogisticRegression

clf = make_pipeline(LogisticRegression(), idf=False)
cross_val_score(clf, X, y).mean()



0.5765096422761253

## Selected Model

Currently selecting the TF-IDF LogisticRegression which had a simple cross-val scoe of 0.61

In [25]:
from sklearn.externals import joblib

clf = make_pipeline(LogisticRegression(), idf=True)
clf.fit(X, y)

joblib.dump(clf, 'maxent_tfidf_overview.joblib') 

['maxent_tfidf_overview.joblib']

## Assign to Overview 

In this section, we assign the classes to the papers in the overview section. 

In [33]:
def load_overview(path="data/lit-review-categories.csv"):
    with open(path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            if row["Document Title"] in doc_lookup:
                doc_lookup[row["Document Title"]]["Label"] = row["Domain"]
    
    docs = [d for d in doc_lookup.values() if "Label" in d and d["Label"] == 'Overview'] 
    print("Filtered docs: {}".format(len(docs)))

    return docs


overview = load_overview()

Filtered docs: 730


In [40]:
def overview_info(idx=0):
    X = [overview[idx]['Abstract']]
    probs = dict(zip(clf.classes_, clf.predict_proba(X)[0]))
    probs["Label"] = clf.predict(X)[0]
    probs["Document Title"] = overview[idx]["Document Title"]
    return probs


with open('data/lit-overview-classified.csv', 'w') as f:
    fields = ['Document Title', 'Label',] + list(clf.classes_)
    writer = csv.DictWriter(f, fields)
    writer.writeheader()
    
    for idx in range(len(overview)):
        writer.writerow(overview_info(idx))

## Analysis of Classification

In [42]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

overview = pd.read_csv('data/lit-overview-classified.csv')

In [47]:
overview.groupby('Label')['Label'].count()

Label
Cyber Security              85
Data Quality                16
Dynamic Stability           15
Economics                    8
Equipment Health            11
Event Analysis              11
Event Detection              4
Fault Analysis              12
Inertial Estimation          1
Load Shedding                2
Measurement Techniques     340
Oscillations                45
Power Quality                6
Stability                   29
State Estimation            63
Testing                      9
Time Synchronization        11
Topology Identification     40
Transient Stability          2
Visualization                2
Voltage Stability           18
Name: Label, dtype: int64