In [2]:
%matplotlib widget
from collections import defaultdict
import glob
import sys
sys.path.append('/Users/nmiles/PACMan_dist/')


from joblib import dump, load
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import pacman2020
from utils import tokenizer

from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [3]:
# Create the PACMan training object, configured to analyze cycles 24 and 25.
pman = pacman2020.PACManTrain(cycles_to_analyze=[24, 25])

In [4]:
# Read the training proposals for the configured cycles. parallel=False is
# passed here (presumably disabling parallel reading -- confirm against
# pacman2020); the recorded run took roughly 7-8 minutes per cycle.
pman.read_training_data(parallel=False)

INFO [pacman2020.read_training_data:200] Reading in 1093 proposals...
Data Directory: /Users/nmiles/PACMan_dist/training_data/training_corpus_cy24
100%|██████████| 1093/1093 [07:29<00:00,  2.43it/s]
INFO [pacman2020.preprocess:134] Total time for preprocessing: 7.500
INFO [pacman2020.read_training_data:200] Reading in 1208 proposals...
Data Directory: /Users/nmiles/PACMan_dist/training_data/training_corpus_cy25
100%|██████████| 1208/1208 [08:07<00:00,  2.48it/s]
INFO [pacman2020.preprocess:134] Total time for preprocessing: 8.133


In [7]:
# Preview the first few rows of the cycle 25 proposal table.
pman.proposal_data['cycle_25'].head()

Unnamed: 0,text,cleaned_text,fname,proposal_num,hand_classification,encoded_hand_classification
0,We propose to obtain far-ultraviolet COS spect...,propose obtain far ultraviolet cos spectra glo...,/Users/nmiles/PACMan_dist/training_data/traini...,954,stellar populations and the ism,5
1,Our team is using Spitzer in a long-term searc...,team spitzer long term search extragalactic mi...,/Users/nmiles/PACMan_dist/training_data/traini...,901,stellar physics,4
2,The evolution of stars with masses above 20-25...,evolution star mass msun -- end life black hol...,/Users/nmiles/PACMan_dist/training_data/traini...,284,stellar physics,4
3,We propose a IR spectroscopic survey of galaxi...,propose ir spectroscopic survey galaxy z=1 sta...,/Users/nmiles/PACMan_dist/training_data/traini...,359,galaxies and the igm,0
4,We propose Hubble imaging of a newly-discovere...,propose hubble imaging newly discover sample i...,/Users/nmiles/PACMan_dist/training_data/traini...,428,galaxies and the igm,0


In [8]:
# Preview the first few rows of the cycle 24 proposal table.
pman.proposal_data['cycle_24'].head()

Unnamed: 0,text,cleaned_text,fname,proposal_num,hand_classification,encoded_hand_classification
0,This proposal seeks to use STIS with one orbit...,proposal seek stis orbit map disk titan wavele...,/Users/nmiles/PACMan_dist/training_data/traini...,954,solar system,3
1,Stellar mass loss rates are only known for 10 ...,stellar mass loss rate know sun like star meas...,/Users/nmiles/PACMan_dist/training_data/traini...,901,stellar physics,4
2,Local AGN were the first extragalactic objects...,local agn extragalactic object observe vacuum ...,/Users/nmiles/PACMan_dist/training_data/traini...,284,supermassive black holes and active galaxies,6
3,This is project is designed to explore the AGB...,project design explore agb rgb population lsb ...,/Users/nmiles/PACMan_dist/training_data/traini...,359,stellar populations and the ism,5
4,We have discovered a short period (18 minutes)...,discover short period minute far ultraviolet v...,/Users/nmiles/PACMan_dist/training_data/traini...,428,stellar physics,4


In [9]:
# Fit the model on the cycle 25 proposals only; cycle 24 is used as the
# evaluation set in the cells that follow.
pman.fit_model(pman.proposal_data['cycle_25'])

In [11]:
# Predict an encoded class for every cycle 24 proposal from its cleaned text.
pred = pman.model.predict(pman.proposal_data['cycle_24']['cleaned_text'])

In [12]:
# Per-class probabilities for the same cycle 24 proposals; these are consumed
# later when the data_out table is built.
pred_prob = pman.model.predict_proba(pman.proposal_data['cycle_24']['cleaned_text'])

In [13]:
# Store the encoded predictions as a new column on the cycle 24 table
# (this modifies pman.proposal_data['cycle_24'] in place).
pman.proposal_data['cycle_24']['encoded_pred_classification'] = pred

In [14]:
# Translate the encoded predictions back through pman.encoder and store the
# decoded labels next to the encoded ones.
pman.proposal_data['cycle_24']['pred_classification'] = pman.encoder.inverse_transform(pred)

In [15]:
# Per-class precision / recall / F1 of the cycle 24 predictions against the
# hand classifications; rows in the report are the encoded integer labels.
print(classification_report(pman.proposal_data['cycle_24']['encoded_hand_classification'], pred))

              precision    recall  f1-score   support

           0       0.84      0.81      0.83       335
           1       0.50      0.57      0.53        60
           2       0.86      0.96      0.91       130
           3       0.98      0.80      0.88        60
           4       0.91      0.89      0.90       261
           5       0.74      0.77      0.75       116
           6       0.89      0.92      0.90       131

    accuracy                           0.84      1093
   macro avg       0.82      0.82      0.82      1093
weighted avg       0.85      0.84      0.84      1093



In [16]:
# Flatten the cycle 24 evaluation results into a column-oriented mapping
# (column name -> list of values) that can be handed to pd.DataFrame:
# file name, predicted and hand labels (encoded and decoded), plus one
# "<class name with underscores>_prob" column per class.
data_out = defaultdict(list)
prob_column_names = [
    f"{name.replace(' ','_')}_prob" for name in pman.encoder.classes_
]
# zip() below would silently truncate on a length mismatch, so check first.
assert len(pred_prob) == len(pman.proposal_data['cycle_24']), (
    "pred_prob must hold one row of class probabilities per cycle 24 proposal"
)
# Pair each proposal with its probability row by *position*. The label that
# iterrows() yields only coincides with the position while the frame keeps a
# default 0..n-1 index, so it must not be used to index into pred_prob.
for (i, row), class_probs in zip(pman.proposal_data['cycle_24'].iterrows(), pred_prob):
    data_out['fname'].append(row['fname'])
    data_out['encoded_pred_classification'].append(row['encoded_pred_classification'])
    data_out['pred_classification'].append(row['pred_classification'])
    data_out['hand_classification'].append(row['hand_classification'])
    data_out['encoded_hand_classification'].append(row['encoded_hand_classification'])
    for column_name, class_prob in zip(prob_column_names, class_probs):
        data_out[column_name].append(class_prob)

In [17]:
# One row per cycle 24 proposal: file name, predicted and hand labels, and
# one "<class>_prob" column per class.
df = pd.DataFrame(data_out)

In [18]:
# Score the classifier two ways:
#   * "top":     the hand classification is the highest-probability class
#   * "top two": the hand classification is the highest or second-highest
# top_pred / top_two_pred are overall counts (a top hit also counts as a
# top-two hit). custom_accuracy_dict records the outcome per hand
# classification as lists holding a 1 for every proposal in that bucket.
top_pred = 0
top_two_pred = 0
custom_accuracy_dict = {
    c: {'top': [], 'top_two': [], 'misclassified': []}
    for c in pman.encoder.classes_
}
# Map every probability column back to its class name explicitly. Rebuilding
# the class name from the column name with str.replace would corrupt any
# class name that itself contains an underscore or the text "_prob".
column_to_class = {
    f"{c.replace(' ','_')}_prob": c for c in pman.encoder.classes_
}
# Rank only the per-class probability columns; a substring match on "prob"
# could also pick up unrelated columns whose name happens to contain it.
prob_columns = [col for col in df.columns if col in column_to_class]
for num, row in df.iterrows():
    hand_classification = row['hand_classification']
    # .iloc makes the "first two after sorting" slice explicitly positional.
    top_two = row[prob_columns].sort_values(ascending=False).iloc[:2]
    categories = [column_to_class[col] for col in top_two.index]
    if hand_classification == categories[0]:
        custom_accuracy_dict[hand_classification]['top'].append(1)
        top_pred += 1
        top_two_pred += 1
    elif hand_classification in categories:
        custom_accuracy_dict[hand_classification]['top_two'].append(1)
        top_two_pred += 1
    else:
        custom_accuracy_dict[hand_classification]['misclassified'].append(1)

In [19]:
# Fraction of proposals whose hand classification is the highest-probability class.
top_pred/len(df)

0.8417200365965233

In [20]:
# Fraction of proposals whose hand classification is among the two
# highest-probability classes.
top_two_pred/len(df)

0.9551692589204026

In [21]:
# Save the model under the given file name (presumably serializes pman.model
# -- confirm against pacman2020).
# NOTE(review): in the recorded execution order the model at this point was
# fit on cycle 25 only; confirm that is the intended production model.
pman.save_model('pacman_production_model.joblib')

In [38]:
from sklearn.model_selection import cross_val_score, train_test_split

In [26]:
# 10-fold cross-validation of pman.model on cycle 24 alone, scored with
# macro-averaged F1; the per-fold scores are displayed as the cell output.
scores = cross_val_score(
    estimator=pman.model,
    X=pman.proposal_data['cycle_24']['cleaned_text'],
    y=pman.proposal_data['cycle_24']['encoded_hand_classification'],
    scoring='f1_macro',
    cv=10,
)
scores

  'precision', 'predicted', average, warn_for)


array([0.78046805, 0.71681478, 0.8112345 , 0.80433255, 0.80876795,
       0.79130448, 0.78713807, 0.75917137, 0.8356312 , 0.74882884])

In [34]:
def combine_proposals(pman, ignore_index=False):
    """Stack the cycle 24 and cycle 25 proposal tables into one DataFrame.

    Parameters
    ----------
    pman : object
        Any object exposing a ``proposal_data`` mapping that holds DataFrames
        under the keys ``'cycle_24'`` and ``'cycle_25'``.
    ignore_index : bool, optional
        When False (default) the original row labels of both frames are
        kept, so the result can contain duplicate index labels. Pass True to
        get a fresh 0..n-1 index instead.

    Returns
    -------
    pandas.DataFrame
        The cycle 24 rows followed by the cycle 25 rows. A column present in
        only one of the inputs is filled with missing values for the rows of
        the other. The input frames are not modified.
    """
    frames = [pman.proposal_data['cycle_24'], pman.proposal_data['cycle_25']]
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to stack frames row-wise.
    return pd.concat(frames, ignore_index=ignore_index, sort=False)

In [31]:
# 10-fold cross-validation of pman.model on cycle 25 alone, scored with
# macro-averaged F1; the per-fold scores are displayed as the cell output.
scores25 = cross_val_score(
    estimator=pman.model,
    X=pman.proposal_data['cycle_25']['cleaned_text'],
    y=pman.proposal_data['cycle_25']['encoded_hand_classification'],
    scoring='f1_macro',
    cv=10,
)
scores25

array([0.82676654, 0.83300332, 0.78268655, 0.79351331, 0.74748481,
       0.82480188, 0.78736088, 0.77505003, 0.80831729, 0.69287953])

In [35]:
# Build the combined cycle 24 + cycle 25 table used for the train/test split below.
total_dataset = combine_proposals(pman)

In [37]:
# Summarize columns, dtypes and non-null counts of the combined table. In the
# recorded output the two prediction columns are populated for 1093 of the
# 2301 rows.
total_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2301 entries, 0 to 1207
Data columns (total 8 columns):
cleaned_text                   2301 non-null object
encoded_hand_classification    2301 non-null int64
encoded_pred_classification    1093 non-null float64
fname                          2301 non-null object
hand_classification            2301 non-null object
pred_classification            1093 non-null object
proposal_num                   2301 non-null int64
text                           2301 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 161.8+ KB


In [97]:
# Hold out 30% of the combined cycles as a test set; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    total_dataset['cleaned_text'],
    total_dataset['encoded_hand_classification'],
    random_state=42,
    test_size=0.3,
)

In [98]:
# Sanity check: fraction of the combined dataset that landed in the training
# split (expected to be about 0.7 given test_size=0.3).
len(y_train)/ len(total_dataset['cleaned_text'])

0.6996957844415471

In [99]:
# Two-column training frame (cleaned text + encoded hand label), i.e. the
# columns that the cycle tables passed to pman.fit_model also carry.
train_df = X_train.to_frame(name='cleaned_text').assign(
    encoded_hand_classification=y_train
)

In [100]:
# Two-column held-out frame (cleaned text + encoded hand label) mirroring
# the layout of train_df.
test_df = X_test.to_frame(name='cleaned_text').assign(
    encoded_hand_classification=y_test
)

In [101]:
# Refit on the training split of the combined cycles. Presumably this
# replaces the cycle-25-only fit held in pman.model -- confirm against
# pacman2020.
pman.fit_model(train_df)

In [102]:
# Display the held-out test frame; the row labels are the original
# per-cycle indices carried over from the combined table.
test_df

Unnamed: 0,cleaned_text,encoded_hand_classification
941,far ultraviolet fuv emission old stellar puzzl...,5
286,propose new deep polarization imaging m87 jet ...,6
929,striking difference nearby high z star formati...,0
176,propose analyze ksec time tag observation pre ...,5
464,supernovae sne powerful event universe profoun...,4
...,...,...
944,star cluster play central role understand star...,0
939,accretion process young low mass star classica...,4
733,discovery relation supermassive black hole bh ...,6
344,galactic stellar population interstellar mediu...,4


In [103]:
# Predict encoded classes for the held-out test split. This rebinds `pred`,
# which earlier held the cycle 24 predictions.
pred = pman.model.predict(test_df['cleaned_text'])

In [104]:
# Per-class precision / recall / F1 on the held-out test split; rows in the
# report are the encoded integer labels.
print(classification_report(test_df['encoded_hand_classification'], pred))

              precision    recall  f1-score   support

           0       0.79      0.86      0.82       208
           1       0.69      0.52      0.59        46
           2       0.91      0.93      0.92        83
           3       0.97      0.97      0.97        31
           4       0.85      0.89      0.87       148
           5       0.75      0.64      0.69        90
           6       0.90      0.88      0.89        85

    accuracy                           0.83       691
   macro avg       0.84      0.81      0.82       691
weighted avg       0.83      0.83      0.83       691



In [96]:
# Number of predictions currently held in `pred`.
# NOTE(review): this cell's recorded execution count (96) precedes the cell
# that now assigns `pred` (103), so the saved output may reflect an earlier
# kernel state -- re-run from a fresh kernel to confirm.
len(pred)

1151

In [106]:
# Rebuild the column-oriented results mapping (column name -> list of values)
# for the cycle 24 proposals: file name, predicted and hand labels, plus one
# "<class name with underscores>_prob" column per class.
# NOTE(review): in the visible cells `pred_prob` is only computed before the
# model is refit on train_df; confirm it is still the intended probability
# source here.
# NOTE(review): the recorded output of this cell is an AttributeError
# ('Series' object has no attribute 'iterrows'), which suggests
# pman.proposal_data['cycle_24'] was not a DataFrame in that kernel state --
# confirm with a clean Restart & Run All.
data_out = defaultdict(list)
prob_column_names = [
    f"{name.replace(' ','_')}_prob" for name in pman.encoder.classes_
]
# zip() below would silently truncate on a length mismatch, so check first.
assert len(pred_prob) == len(pman.proposal_data['cycle_24']), (
    "pred_prob must hold one row of class probabilities per cycle 24 proposal"
)
# Pair each proposal with its probability row by *position*. The label that
# iterrows() yields only coincides with the position while the frame keeps a
# default 0..n-1 index, so it must not be used to index into pred_prob.
for (i, row), class_probs in zip(pman.proposal_data['cycle_24'].iterrows(), pred_prob):
    data_out['fname'].append(row['fname'])
    data_out['encoded_pred_classification'].append(row['encoded_pred_classification'])
    data_out['pred_classification'].append(row['pred_classification'])
    data_out['hand_classification'].append(row['hand_classification'])
    data_out['encoded_hand_classification'].append(row['encoded_hand_classification'])
    for column_name, class_prob in zip(prob_column_names, class_probs):
        data_out[column_name].append(class_prob)

AttributeError: 'Series' object has no attribute 'iterrows'