## 1. Prepare training and test data

### Load libraries

In [64]:
import pandas as pd
import codecs

### Import data from pickles

In [65]:
# Restore the pickled training and test DataFrames in a single tuple assignment.
train_df, test_df = (
    pd.read_pickle('pickles/train_df.pkl'),
    pd.read_pickle('pickles/test_df.pkl'),
)

In [66]:
# Preview five randomly chosen training rows (no random_state is passed,
# so the rows shown differ from run to run).
train_df.sample(5)

Unnamed: 0,corpus,category
9113,thought emergency condition sts put good size ...,Science
6744,article nrpstandrewsacuk norman r paterson wri...,Religion
8437,sometime future diet evangelist may get diet w...,Science
5520,said article nelsonpapollohpcom peter nelson w...,Politics
7281,jhessenetcomcom john hesse writes oh great won...,Science


### Change column order

In [67]:
col_name = 'category'

# Move the label column to position 0 in both frames (training first, then test)
# and print a random five-row preview of each. pop() + insert() modify the frame
# in place, so no reassignment is needed.
for frame in (train_df, test_df):
    first_col = frame.pop(col_name)
    frame.insert(0, col_name, first_col)
    print(frame.sample(n=5))

         category                                             corpus
362    Automotive  neilsonseoulmprca robert neilson writes sorry ...
2826    Computers  anyone know answer offhand please answer email...
10528      Sports  deepak chhabra dchhabrastplistsca wrote speaki...
1479    Computers  ive recently got hold pc card id like c progra...
1342    Computers  cptullymeduncedu christopher p writes tight ma...
       category                                             corpus
2152  Computers  eric bosco eboscousoraclecom wrote first pleas...
1299  Computers  rhive noticed save model mapping plane rhposit...
2983   Politics  article stevehthoriscbrcom steve hendricks wri...
6975     Sports                     believe nhl draft june weekend
3085   Politics  sunshineccocaltechedu tom renner writes pvasil...


### Save training data into .csv file

In [68]:
# Write the training frame as a bare CSV: no header row and no index column.
train_df.to_csv(
    "data/Comprehend/train.csv",
    header=False,
    index=False,
)

### Prepare .txt file with test data

In [69]:
# Remove the label column from the test frame.
# Rebinding the name (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
# The former axis=0 argument is omitted because `columns=` already selects the axis.
test_df = test_df.drop(columns='category', errors='ignore')

In [70]:
# Show the first five rows to confirm the category column is gone.
test_df.head(n=5)

Unnamed: 0,corpus
0,corp opinion expressed user necessarily convex...
1,tobiasconvexcom allen tobias writes better sti...
2,actually simple principle porous adsorbent lik...
3,dont know california false representation odom...
4,article davewcsumdedu david g wonnacott writes...


In [71]:
# Write the test corpus to a text file, one document per line.
# - encoding='utf-8-sig' puts a UTF-8 byte-order mark at the start of the file.
# - newline='\n' disables newline translation, so every line ends in a bare "\n"
#   regardless of platform.
# NOTE(review): assumes no corpus entry contains an embedded newline; otherwise the
# line numbers in the file would no longer match the frame's rows — TODO confirm.
with open('data/Comprehend/test.txt', 'w', encoding='utf-8-sig', newline='\n') as f:
    # Iterate over the column directly; iterrows() would build a Series per row
    # only to read a single field from it.
    f.writelines("%s\n" % text for text in test_df['corpus'])

## 2. Read and evaluate model predictions

### Load libraries

In [72]:
import pandas as pd
import json

### Open predictions file

In [73]:
# Read the predictions file, which holds one JSON object per line.
# The encoding is pinned because open() otherwise uses a platform-dependent default.
with open("data/Comprehend/predictions.jsonl", "r", encoding="utf-8") as file:
    json_list = list(file)

# Decode every non-blank line; skipping blank lines prevents a JSONDecodeError on
# stray empty lines in the file.
obj_list = [json.loads(json_str) for json_str in json_list if json_str.strip()]

print(obj_list[0])

{'File': 'test.txt', 'Line': '0', 'Classes': [{'Name': 'Automotive', 'Score': 0.9087}, {'Name': 'Science', 'Score': 0.064}, {'Name': 'Computers', 'Score': 0.0163}]}


### Prepare list with predicted classes (categories)

In [74]:
# Take the name of the first entry in each record's 'Classes' list.
# NOTE(review): assumes 'Classes' is ordered by descending 'Score', as in the sample
# record printed above — TODO confirm.
predicted_classes = [obj['Classes'][0]['Name'] for obj in obj_list]

print(predicted_classes[0])

Automotive


### Load test data

In [75]:
# Reload the test frame from the pickle: the in-memory test_df had its 'category'
# column removed earlier in the notebook, and the labels are needed for evaluation.
test_df = pd.read_pickle('pickles/test_df.pkl')

### Read test categories

In [76]:
# Ground-truth labels for the test set.
# The column is selected by name rather than by position, so the result does not
# depend on where 'category' happens to sit in the pickled frame.
y_test = test_df['category'].to_numpy()
print(y_test)

['Automotive' 'Automotive' 'Automotive' ... 'Sports' 'Sports' 'Sports']


### Prepare accuracy score and confusion matrix

In [77]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Explicit class order, so the confusion-matrix rows and columns are always laid
# out the same way.
class_labels = ['Automotive', 'Computers', 'Politics', 'Religion', 'Science', 'Sports']

# Fraction of test documents whose predicted class equals the true class.
accuracy = accuracy_score(y_test, predicted_classes)
print(accuracy)

# Rows are the true classes and columns the predicted classes, both in
# class_labels order.
cm = confusion_matrix(y_test, predicted_classes, labels=class_labels)
print(cm)

0.8840660879305516
[[ 711   34    5    4   38    2]
 [  14 1815    7   12  105    2]
 [   7   23  892   65   57    6]
 [   2   34   33  861   36    2]
 [  14  203   34   32 1293    3]
 [   1   23    6   11   13  742]]
