In [1]:

# coding: utf-8

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.metrics import confusion_matrix, classification_report


reports_df = pd.read_csv("data.csv")
#reports_df.head()

#'''overview of the data'''

reports_df['RESULT'].unique()

#'''listing unique classes'''

reports_filtered_df = reports_df[pd.notnull(reports_df['VARIABLES'])]
reports_filtered_df.info()

#'''''''''''''overview of the input dataset after removing null rows
fig = plt.figure(figsize = (10,6))
df = reports_filtered_df[['RESULT', 'VARIABLES']]
df.groupby('RESULT').count().plot.bar(ylim=0)
plt.show()

#'''distirbution of classes in the dataset'''

labels = df['RESULT']
text = df['VARIABLES']

X_train, X_test, y_train, y_test = train_test_split(text, labels, random_state=0, test_size=0.3)

count_vect = TfidfVectorizer()
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)


#print(count_vect.get_feature_names())
#print(X_train_counts.toarray())
#print(X_train_counts.shape)

tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf_transformer.transform(X_train_counts)

X_test_counts = count_vect.transform(X_test)
X_test_transformed = tf_transformer.transform(X_test_counts)

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X_train_counts)

# print idf values
#df_idf = pd.DataFrame(tfidf_transformer.idf_, index = count_vect.get_feature_names(),columns=["tf_idf_weights"])
 
# sort ascending
#df_idf.sort_values(by=['tf_idf_weights'])

 
# count matrix
count_vector = count_vect.transform(text)
 
# tf-idf scores
tf_idf_vector=tfidf_transformer.transform(count_vector)

feature_names = count_vect.get_feature_names()
 
#get tfidf vector for first document
first_document_vector=tf_idf_vector[0]
 
#print the scores
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)


# get class label
labels = LabelEncoder()
y_train_labels_fit = labels.fit(y_train)
y_train_labels_trf = labels.transform(y_train)

print(labels.classes_)

#'''preprocessing input data'''

linearsvc = LinearSVC()
clf = linearsvc.fit(X_train_transformed, y_train_labels_trf)

calibrated_svc = CalibratedClassifierCV(base_estimator = linearsvc, cv = "prefit", method = "sigmoid")

calibrated_svc.fit(X_train_transformed, y_train_labels_trf)
predicted = calibrated_svc.predict(X_test_transformed)

to_predict = ["VARIABLES"]
p_count = count_vect.transform(to_predict)
p_tfidf = tf_transformer.transform(p_count)

#print('Average accuracy on test set = {}'. format(np.mean(predicted == labels.transform(y_test))))

print('Predicted probabilities of the input string are')
print(calibrated_svc.predict_proba(p_tfidf))


#'''''''''''''''Training using classifier'''''''''''

pd.DataFrame(calibrated_svc.predict_proba(p_tfidf)*100, columns = labels.classes_)

#'''''''''''''Prediction''''''''''''''''


report = classification_report(labels.transform(y_test), predicted, digits=4)
print (report)


def get_confusion_matrix_values(y_test, predicted):
    cm = confusion_matrix(labels.transform(y_test), predicted)
    return(cm[1][1], cm[1][2], cm[2][1], cm[2][2])

TP, FP, FN, TN = get_confusion_matrix_values(y_test, predicted)
print("True Positive = ",TP)
print("False Positive = ",FP)
print("False Negative = ",FN)
print("True Negative = ",TN)

sensitivity  = (TP / (TP+FN)) * 100
specificity  = (TN / (TN+FP)) * 100
pos_pred_val = (TP/ (TP+FP)) * 100
neg_pred_val = (TN/ (TN+FN)) * 100
accuracy = ((TP + TN) / (TP + TN + FP + FN)) * 100 
fscore = 2 * ((pos_pred_val * sensitivity) / (pos_pred_val + sensitivity))


print("Sensitivity  = ", sensitivity)
print("Specificity = ", specificity)
print("Positive Predictive Value = ", pos_pred_val)
print("Negative Predictive Value = ", neg_pred_val)
print("Accuracy = ", accuracy)
print("F1-Score = ", fscore)


cm = confusion_matrix(labels.transform(y_test), predicted)
cm_df = pd.DataFrame(cm, index = [labels.classes_], columns = [labels.classes_])

plt.figure(figsize = (5.5, 4))
sns.heatmap(cm_df, annot = True)
plt.title('Contigency Table \n Accuracy: {0:4f}'.format(accuracy))
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


from sklearn.externals import joblib
joblib.dump(clf,'pipedcalibrated.pkl')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 3 columns):
X-RAY NUMBER    400 non-null int64
VARIABLES       400 non-null object
RESULT          400 non-null object
dtypes: int64(1), object(2)
memory usage: 12.5+ KB


<matplotlib.figure.Figure at 0x249d91f26d8>

<matplotlib.figure.Figure at 0x249d2797198>

['Indeterminate' 'Negative' 'Positive']
Predicted probabilities of the input string are
[[0.13632867 0.39021989 0.47345144]]
             precision    recall  f1-score   support

          0     1.0000    0.2222    0.3636         9
          1     0.9351    0.9730    0.9536        74
          2     0.8537    0.9459    0.8974        37

avg / total     0.9148    0.9083    0.8921       120

True Positive =  72
False Positive =  2
False Negative =  2
True Negative =  35
Sensitivity  =  97.2972972972973
Specificity =  94.5945945945946
Positive Predictive Value =  97.2972972972973
Negative Predictive Value =  94.5945945945946
Accuracy =  96.3963963963964
F1-Score =  97.2972972972973


<matplotlib.figure.Figure at 0x249d94f9e48>

['pipedcalibrated.pkl']