### TASK-3 NLP 

<br><b>Filename: <font color='red'>visualizing_results.ipynb</font></b> ---> defines the necessary functions in order to obtain useful statistics from the results and visualize them for enhanced comprehension.
<hr/>
This notebook specifies the following functions: ( the sequence of description is same as the sequence of their definition in the notebook cells below )
<ol>
    <li><b>get_stats( actual,predicted ): </b> Given the lists of actual and predicted labels, the function reports standard statistics that can be obtained from these values, such as accuracy, precision and recall, and visualizes the correctly predicted and misclassified records through a pie chart.</li>
    <li><b>get_heatmap( actual,predicted ):</b> Given the actual and predicted labels, the function draws a heatmap as a visual representation of the percentage distribution of the records of each possible label.</li>
    <li><b>autolabel( rects,ax ):</b> Displays the value above each bar of a bar plot drawn on the given axes.</li>
    <li><b>get_grouped_graph( actual,predicted ):</b> Given the actual and predicted labels, the function returns a grouped bar graph as a visual representation of number of records from each label which have been correctly predicted and those which have not been.</li>
    <li><b>vis_data( data ):</b> Given the dataset, this function visualizes the number of records from each category in the form of a bar plot.</li>
    <li><b>visualize():</b>Driver function for the visualization pipeline.</li>
</ol>

<img src='images/visualize.png'>

### CELL #1: importing required modules

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import json
from sklearn.metrics import plot_confusion_matrix
from collections import Counter
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

### CELL #2: defining get_stats( actual, predicted )
Function description in the top cell

In [2]:
def get_stats(actual,predicted):
    """Print summary metrics for a set of predictions and show a pie chart.

    Prints the total number of records, how many were predicted correctly
    and incorrectly, the accuracy (as a percentage) and the macro-averaged
    precision and recall, then shows a pie chart of correct vs. incorrect
    predictions.

    Parameters
    ----------
    actual : sequence
        Ground-truth labels.
    predicted : sequence
        Predicted labels, aligned element-wise with ``actual``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted`` do not have the same length.
    """
    # Validate up front so a length mismatch is reported clearly instead of
    # surfacing as an IndexError halfway through the counting loop.
    if len(actual) != len(predicted):
        raise ValueError(
            "actual and predicted must have the same length "
            "(got {} and {})".format(len(actual), len(predicted))
        )

    total = len(actual)

    print("----------------- THE RESULTS OBTAINED ARE AS FOLLOWS ---------------")
    print("")
    print("Total number of records: ",total)

    #----------------------------- RECORDING THE NUMBER OF RECORDS PREDICTED CORRECTLY AND INCORRECTLY

    correct = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    wrong = total - correct

    print("Number of records predicted correctly = ",correct)
    print("Number of records misclassified = ",wrong)

    # With no records there is nothing to divide by and nothing to plot;
    # stop here rather than raising ZeroDivisionError on the accuracy line.
    if total == 0:
        print("No records to evaluate; skipping accuracy, precision, recall and the pie chart.")
        return

    #------------------------------------------------------------------------- COMPUTING ACCURACY

    print("The accuracy of the model is = ",correct/total*100,"%")

    #------------------------------------------------------------------------- PRECISION AND RECALL

    # Only labels that occur in `predicted` take part in the macro average.
    # NOTE(review): this also leaves classes that were never predicted out of
    # the recall average (where they would otherwise count as 0) -- confirm
    # that this is intended.
    predicted_labels = np.unique(predicted)

    precision = precision_score(actual, predicted, average='macro', labels=predicted_labels)
    print("Precision is = ",precision)

    recall = recall_score(actual, predicted, average='macro', labels=predicted_labels)
    print("Recall is = ",recall)

    #------------------------- PIE CHART VISUALIZATION CODE SEGMENT

    pie_values = np.array([correct,wrong])
    textprops = {"fontsize":11}
    fig1, ax1 = plt.subplots(figsize=(5,5))

    ax1.pie(pie_values,labels=["Correct Predictions","Incorrect Predictions"],autopct='%1.4f%%',textprops=textprops)
    plt.show()

### CELL #3: defining get_heatmap( actual, predicted )
Function description in the top cell

In [3]:
def get_heatmap(actual,predicted):
    """Draw a heatmap of how each actual label's records are spread over the predicted labels.

    The underlying table has one column per distinct actual label and one
    row per distinct predicted label. Each cell holds the percentage of the
    records with that actual label that received that predicted label, so
    every column sums to 100.

    Parameters
    ----------
    actual : sequence
        Ground-truth labels.
    predicted : sequence
        Predicted labels, aligned element-wise with ``actual``.

    Returns
    -------
    None
        The heatmap is drawn on a newly created figure.
    """
    pair_counts = Counter(zip(actual,predicted))

    #------------------------------------- DEFINING AND POPULATING DATAFRAME FOR CONSTRUCTING THE HEATMAP

    # Float dtype from the start: the cells end up holding percentages, and
    # writing floats into an integer frame relies on implicit upcasting.
    df = pd.DataFrame(0.0,columns=np.unique(actual),index=np.unique(predicted))

    # Single .loc assignment per cell; chained indexing (df[col][row] = v) is
    # not guaranteed to write back into df.
    for (actual_label, predicted_label), n_records in pair_counts.items():
        df.loc[predicted_label, actual_label] = n_records

    # Turn counts into percentages of each column (i.e. of each actual label).
    column_totals = df.sum(axis=0)
    df = df.div(column_totals, axis='columns') * 100

    #--------------------------------------------------------------------------------------

    #--------------------------------------------------- HEATMAP VISUALIZATION CODE SEGMENT

    # Zero cells stay in the table and are annotated as 0.00; the earlier
    # `df.replace(0, np.nan)` discarded its result and never had an effect.
    _, ax = plt.subplots(figsize=(45,20))
    sns.set(font_scale=2)
    res = sns.heatmap(df,ax=ax,linewidths=0.5,annot=True,fmt='.2f',cmap="icefire",annot_kws={"size": 20},cbar_kws={'label': 'Percentage(%)'})
    res.set_xticklabels(res.get_xmajorticklabels(), fontsize = 22)
    res.set_yticklabels(res.get_ymajorticklabels(), fontsize = 22)
    # Columns are actual labels (x axis) and rows are predicted labels (y axis).
    ax.set(xlabel="Actual Value -------->", ylabel = "Predicted Value -------->")
    ax.set_title('Heatmap for predictions ( in %age )')
    
    #--------------------------------------------------------------------------------------

### CELL #4: defining get_grouped_graph( actual, predicted )
Function description in the top cell

In [4]:
def autolabel(rects,ax): # ------------- FOR DISPLAYING LABELS OVER EACH BAR OF THE GROUPED BAR GRAPH
    """Annotate every bar in ``rects`` with its height, centred just above the bar.

    Parameters
    ----------
    rects : iterable
        Bar objects exposing ``get_x()``, ``get_width()`` and ``get_height()``.
    ax : object
        Axes-like object whose ``annotate`` method is used to place the text.

    Returns
    -------
    None
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        label_text = '{}'.format(bar_height)
        ax.annotate(
            label_text,
            xy=(bar_centre, bar_height),
            xytext=(0, 3),  # lift the text 3 points above the top of the bar
            textcoords="offset points",
            ha='center',
            va='bottom',
        )
        

def get_grouped_graph(actual,predicted):
    """Draw a grouped bar chart of correct vs. incorrect predictions per actual category.

    For every distinct label in ``actual`` the chart shows two bars: how many
    of its records were predicted correctly and how many were not. A line is
    printed for each category that has strictly more misclassifications than
    correct predictions.

    Parameters
    ----------
    actual : sequence
        Ground-truth labels.
    predicted : sequence
        Predicted labels, aligned element-wise with ``actual``.

    Returns
    -------
    None
    """
    cat = np.unique(actual).tolist()

    #------------------------------------------------------------------ PREPARING DATA FOR VISUALIZATION
    # One pass over the records, tallying hits and misses per actual label,
    # instead of rescanning every record once per category.

    correct_by_cat = Counter()
    wrong_by_cat = Counter()
    for truth, guess in zip(actual, predicted):
        if guess == truth:
            correct_by_cat[truth] += 1
        else:
            wrong_by_cat[truth] += 1

    cat_correct = [correct_by_cat[label] for label in cat]
    cat_wrong = [wrong_by_cat[label] for label in cat]

    for label, correct, wrong in zip(cat, cat_correct, cat_wrong):
        # Strict comparison: the message says "fewer", which is false on a tie.
        if wrong > correct:
            print("The category '",label,"' has fewer True Positives (",correct,") than misclassifications (",wrong,")")

    #------------------------------------------------------------------

    #------------------------------------------------------------ GROUPED BAR GRAPH VISUALIZATION CODE SEGMENT

    fig, ax = plt.subplots()
    fig.set_figheight(25)
    fig.set_figwidth(85)
    width = 0.35
    # NOTE: this changes the global matplotlib font size and the setting
    # stays in effect after this function returns.
    plt.rcParams.update({'font.size': 35})
    x = np.arange(len(cat))
    # Each category gets a pair of bars placed either side of its tick.
    rects1 = ax.bar(x - width/2, cat_correct, width, label='Predicted Correctly')
    rects2 = ax.bar(x + width/2, cat_wrong, width, label='Predicted Incorrectly',color='red')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Count ------>',fontsize=70)
    ax.set_xlabel('Categories ------>',fontsize=70)
    ax.set_title('Category wise predicted counts')
    ax.set_xticks(x)
    ax.set_xticklabels(cat,rotation=90,fontsize=45)
    ax.legend(prop={'size': 70})

    autolabel(rects1,ax)
    autolabel(rects2,ax)

    plt.show()
    
    #------------------------------------------------------------------

### CELL #4a): defining vis_data( data )
<br> visualizes the dataset after the preprocessing gets completed

In [None]:
def vis_data(data):
    """Show a bar plot of how many records each category has, in ascending order of count.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with a ``'cat'`` column holding the category of each record.

    Returns
    -------
    None
    """
    label_counts = Counter(data['cat'].tolist())

    # Build the frame in one go; a stable sort keeps tied categories in the
    # order they were first seen.
    df = pd.DataFrame(list(label_counts.items()), columns=['labels','count'])
    df = df.sort_values('count', kind='mergesort')

    fig, ax = plt.subplots(figsize=(45,20))
    sns.barplot(x="labels", y="count", data=df, order=df['labels'], ax=ax)
    plt.xticks(rotation='vertical',fontsize = 50)
    plt.yticks(fontsize = 50)
    # The on/off flag is passed positionally: its keyword name was `b` in
    # older Matplotlib releases and is `visible` in newer ones.
    ax.grid(True, which='major', color='#d3d3d3', linewidth=1.0)
    ax.grid(True, which='minor', color='#d3d3d3', linewidth=0.5)
    plt.show()

### CELL #5: defining visualize()
<br>Driver function for the visualization pipeline

In [5]:
def visualize():
    """Run the full visualization pipeline for every saved results file.

    For each results CSV, prints a banner naming the model, reads the file,
    and passes its 'actual' and 'predicted' columns to get_stats,
    get_heatmap and get_grouped_graph in turn.
    """
    header_rule = "===================================================================="
    section_rule = "****************************************************************"

    print(header_rule)
    print("==================== VISUALIZING THE RESULTS =======================")
    print(header_rule)

    result_files = ["output_files/string_svm_results.csv","output_files/bow_results.csv"]
    model_titles = ["SVM MODEL","BAG OF WORDS MODEL"]

    for csv_path, title in zip(result_files, model_titles):
        # Banner announcing which model's results follow.
        print("")
        print(section_rule)
        print("")
        print("                          ",title,"                       ")
        print("")
        print(section_rule)
        print("")
        print("")

        results = pd.read_csv(csv_path)
        actual_labels = results['actual'].tolist()
        predicted_labels = results['predicted'].tolist()

        get_stats(actual_labels, predicted_labels)
        get_heatmap(actual_labels, predicted_labels)
        get_grouped_graph(actual_labels, predicted_labels)

        # Closing rule for this model's section.
        print("")
        print(section_rule)
        print("")