# Automatically assess quality of a dataset before doing any manual analysis.

In [1]:
import fasttext#0.8
import pandas as pd
import numpy as np

#pd.set_option('display.max_rows', 120)
#pd.set_option('display.max_columns', 30)
#pd.set_option('display.width', 1000) 

import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')


from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize



from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = list(set(list(STOP_WORDS) + list(ENGLISH_STOP_WORDS)))




In [2]:
# Model Training


import string
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train_model(df):
    


    #setting 'id' unique value for each row
    ind=[]
    for i in range(len(df)):
        ind.append(i)
    df['ind']=ind




    #Train and Test set (70%-30% for each category class)
    groups=list(set(df['out_class'].tolist()))
    df_train1=pd.DataFrame(columns=df.columns.tolist())
    for g in groups:
        df1=df[df['out_class']==g]
        df1=df1[:int(len(df1)*.70)]
        df_train1=pd.concat([df_train1,df1])

    df_test1=df[~(df['ind'].isin(df_train1['ind']))]




    f = open('all_tickets_train.txt','w')

    train_size=df_train1.shape[0]
    for i in range(df_train1.shape[0]):
        label=str(df_train1['out_class'].iloc[i])
        label=label.replace(" ", "-")
        label=label.lower()

        body=str(df_train1['merged'].iloc[i])
        body=body.translate(str.maketrans('','',string.punctuation))
        body=body.replace('\n',' ')
        body=body.lower()
        f.write('__label__'+label+' '+body+'\n')

    f.close()
    print('\nWritten')
    classifier = fasttext.supervised('all_tickets_train.txt','model_all_text',lr=0.025,epoch=25,word_ngrams=2,bucket=200000,dim=300,loss='softmax')
    print('Classifier Trained')
    # classifier.save_model("model_all_tickets.bin")




    sent=[]
    labels=[]

    t_size=df_test1.shape[0]
    for i in range(df_test1.shape[0]):
        label=str(df_test1['out_class'].iloc[i])
        label=label.replace(" ", "-")
        label=label.lower()
        labels.append(label)

        body=str(df_test1['merged'].iloc[i])
        body=body.translate(str.maketrans('','',string.punctuation))
        body=body.replace('\n',' ')
        body=body.lower()
        sent.append(body)




    df_result=pd.DataFrame(columns=['real','pred'])
    pred=[]
    for s in sent:
        prediction=classifier.predict([s])[0][0]
        pred.append(prediction)
        #print(len(pred))


    df_result['pred']=pred
    df_result['real']=labels


    
    #print('Accuracy: ',accuracy_score(pred,labels))
    #print('f1_score: ',f1_score(pred,labels, average='weighted'))

    return accuracy_score(pred,labels),f1_score(pred,labels, average='weighted')

# loading Other Data

In [None]:
data = pd.read_csv('./../input/all_data.csv')


mapping = {
    "input_colls" : {
        "coll1" :"title", # "name of column 1 in csv"
        "coll2": "body"},  # "name of column 2 in csv"
    "output_class" : {
        "coll1":"category"},  # "name of out class column 1 in csv"
    "important_out_class" : "" # list of classes to classify. If empty, consider all classes in csv
}



In [4]:
# Data Quality 

In [None]:
out_text = ""
out_text = out_text + '\n'+ "Data Stats:"
print('Data Stats:')



txt =  'Total Dataset Size = ' + str(len(data))
print(txt)
out_text = out_text + '\n'+ txt



inp_cols = list (mapping['input_colls'].values()) 
out_cols = list (mapping['output_class'].values())
inp = data.loc[:,inp_cols]
out = data.loc[:,out_cols]
out[out_cols] = out[out_cols].astype(str)




for c in inp_cols:
    txt = 'Unique values in "' + str(c) + '"  = ' + str(len(inp.loc[:,c].unique())) + '/' + str(len(data))
    print(txt)
    out_text = out_text +  '\n' + txt

    
txt =  'Unique values in "' + str(out_cols[0]) + '"  = ' + str(len(out.loc[:,out_cols[0]].unique()) )   
print(txt)
out_text = out_text + '\n' + txt


inp = inp.fillna('')
out = out.fillna('')


# Merging input
merged_inp = inp.apply(lambda x: x.str.cat(sep=','), axis=1)
merged_inp = pd.DataFrame(merged_inp)
merged_inp.columns=['merged']
out.columns = ['out_class']
data_all = pd.concat([merged_inp,out],axis=1)





txt = 'Unique values in  "Merged input" = ' + str(len(data_all.merged.unique()) ) + '/' +  str(len(data))
print(txt)
out_text = out_text + '\n' + txt


# nan
txt = 'Persentage of nans in Merged input  = ' + str(len(merged_inp[merged_inp.merged==','])*100/len(merged_inp)) + '%'
print(txt)
out_text = out_text + '\n' + txt

txt = 'Persentage of nans in output = ' + str(len(out[out.out_class==''])*100/len(out)) + '%'
print(txt)
out_text = out_text + '\n' + txt


######################### Red Rules
out_text = out_text + '\n\n\n'+ "Red Rules:"
print('\n\nRed Rules:')



persentage_nans = len(data_all[data_all.merged==','][data_all.out_class != ''])*100/len(data_all)
txt = 'Persentage of input NAN with valid Output Class Assigned = ' + str(persentage_nans) + '%'
print(txt)
out_text = out_text + '\n' + txt

persentage_nans = len(data_all[data_all.out_class==''][data_all.merged != ','])*100/len(data_all)
txt = 'Persentage Output NANs with Valid input text = ' + str(persentage_nans) + '%'

print(txt)
out_text = out_text + '\n' + txt





# No of output clssses with data distribution

df_t = data_all.out_class.value_counts()
df_t2 = pd.DataFrame()
df_t2.loc[:,'vals'] = df_t.index
df_t2.loc[:,'counts'] = df_t.tolist()
df_t2.loc[:,'counts'] = df_t2.loc[:,'counts']*100/max(df_t2.loc[:,'counts'])
# Output Class Data Distribution
df_t2.index = df_t2.vals


#Red Rules:
#1. Minimum number of records required for each assignment group
#2. Same short+long description, going to different assignment groups

#Yellow Rules:
#Based on distribution point out classes for user to review the classes with comparivtely less data (print class names and percentage of records)

#Minimum number of records required for each assignment group = 2% of highest class count
#Recommended number of records required for each assignment group = 35% of highest class count



min_vals_per_class = 3
recommended_vals_per_class = 15

x = len(df_t2[df_t2.counts<min_vals_per_class])

txt = 'Output Classes not satisfying "Minimum Class Examples" requirement of having  less than ' + str(min_vals_per_class)  + '% maximum class example count  = '+str(x)+'/'+str(len(data_all.loc[:,'out_class'].unique()))
print(txt)
out_text = out_text + '\n' + txt




### Plot

df_t3 = df_t2.copy()
df_t3.loc[df_t2['counts'].between(min_vals_per_class, recommended_vals_per_class+1),'colour'] = 'y'
df_t3.loc[df_t2['counts']>recommended_vals_per_class,'colour'] = 'g'
df_t3.loc[df_t2['counts']<min_vals_per_class,'colour'] = 'r'

pie = df_t2.plot.bar(figsize=(15,10), color=[df_t3.colour.tolist()],title = 'Out Class Data Distribution. (Green = Good, Yellow = Acceptable, Red = Unacceptable)',fontsize = 8)

plt.savefig('pie')






#cleaning
import re
def clean_sent(sent):
    sent = sent.lower()
    special_chars2 = ['\n','\t','\\']
    for i in special_chars2:
        sent = sent.replace(i,' ')
    special_chars = '~!@#$%^&*()_+{}[]:;"<>?,./\|`-='
    for i in special_chars:
        sent = sent.replace(i,' ')
    special_chars3 = "'"
    for i in special_chars3:
        sent = sent.replace(i,' ')
    sent = re.sub(' +', ' ',sent)
    return sent


data_all['merged'] = data_all['merged'].apply(clean_sent)




mis_classified_instances = data_all.groupby('merged').filter(lambda x : (len(x['merged'])==x['out_class'].nunique())&(len(x['merged'])>1))#.sort_values(by = 'merged')

txt = 'Same input going to different Output classes: ' + str(len(mis_classified_instances))
print(txt)
out_text = out_text + '\n' + txt



############################# Yellow Rules
out_text = out_text + '\n\n\n'+ "Yellow Rules:"
print('\n\n\nYellow Rules:')
x = len(df_t2[df_t2.counts<recommended_vals_per_class])
txt = 'Output Classes not satisfying "Recommended Class Examples" requirement of having  less than ' + str(recommended_vals_per_class)  + '% maximum class example count  = '+str(x)+'/'+str(len(data_all.loc[:,'out_class'].unique()))
print(txt)
out_text = out_text + '\n' + txt






# Append-adds at last 
#file1 = open("Results.txt","w")
#file1.write(out_text) 
#file1.close()








# Removing Problematic Data and Model Training

In [6]:
# removing nans from merged data
data_no_nans = data_all[data_all.merged != ','].copy()
data_no_nans = data_no_nans[data_no_nans.out_class != ''].copy()
# model training
acc_no_nan,f1_no_nan = train_model(data_no_nans.reset_index(drop=True))
print('Accuraccy with NANs removed : ',acc_no_nan, '\nF1 score with NANs removed : ',f1_no_nan )


# removing data failing red rules
# removing same data_different_classes
mis_classified_instances = data_no_nans.groupby('merged').filter(lambda x : (len(x['merged'])==x['out_class'].nunique())&(len(x['merged'])>1))#.sort_values(by = 'merged')
data_no_nans_no_misclassified = data_no_nans.iloc[~data_no_nans.index.isin(mis_classified_instances.index),:].copy()


classes_below_lower_lim = df_t2[df_t2.counts<min_vals_per_class].vals.tolist()
data_no_nans_no_misclassified_no_red = data_no_nans_no_misclassified[~data_no_nans_no_misclassified.out_class.isin(classes_below_lower_lim)].copy()

# model training
acc_no_nan_no_red,f1_no_nan_no_red = train_model(data_no_nans_no_misclassified_no_red.reset_index(drop=True))
print('Accuraccy with no NANs and no Red Rules violating Classes : ',acc_no_nan_no_red, '\nF1 score with no NANs and no Red Rules violating Classes : ',f1_no_nan_no_red )





# removing data failing yellow rules
classes_below_yellow_lim = df_t2[df_t2.counts<recommended_vals_per_class].vals.tolist()
data_no_nans_no_misclassified_no_yelllow = data_no_nans_no_misclassified_no_red[~data_no_nans_no_misclassified_no_red.out_class.isin(classes_below_yellow_lim)].copy()


# model training
acc_no_nan_no_red_no_yellow,f1_no_nan_no_red_no_yellow = train_model(data_no_nans_no_misclassified_no_yelllow.reset_index(drop=True))
print('Accuraccy with no NANs and no Red and Yellow Rules violating Classes : ',acc_no_nan_no_red_no_yellow, '\nF1 score with no NANs and no Red and Yellow Rules violating Classes : ',f1_no_nan_no_red_no_yellow )





Written
Classifier Trained
Accuraccy with NANs removed :  0.23283858998144713 
F1 score with NANs removed :  0.3178429638769952

Written
Classifier Trained
Accuraccy with no NANs and no Red Rules violating Classes :  0.261139896373057 
F1 score with no NANs and no Red Rules violating Classes :  0.34808643117898075

Written
Classifier Trained
Accuraccy with no NANs and no Red and Yellow Rules violating Classes :  0.3540701522170748 
F1 score with no NANs and no Red and Yellow Rules violating Classes :  0.45570917828293933


In [7]:
# new ideas

# sentences with all words gibbersih to be removed

# same outclass in differenrt catagory

# text length


In [8]:
# data Quality 
# data left after removing data not following red rules
# removing nans from merged data
data_no_nans = data_all[data_all.merged != ','].copy()
data_no_nans = data_no_nans[data_no_nans.out_class != ''].copy()
# removing data failing red rules
# removing same data_different_classes
mis_classified_instances = data_no_nans.groupby('merged').filter(lambda x : (len(x['merged'])==x['out_class'].nunique())&(len(x['merged'])>1))#.sort_values(by = 'merged')
data_no_nans_no_misclassified = data_no_nans.iloc[~data_no_nans.index.isin(mis_classified_instances.index),:].copy()
classes_below_lower_lim = df_t2[df_t2.counts<min_vals_per_class].vals.tolist()
data_no_nans_no_misclassified_no_red = data_no_nans_no_misclassified[~data_no_nans_no_misclassified.out_class.isin(classes_below_lower_lim)].copy()


# removing data failing yellow rules
classes_below_yellow_lim = df_t2[df_t2.counts<recommended_vals_per_class].vals.tolist()
data_no_nans_no_misclassified_no_yelllow = data_no_nans_no_misclassified_no_red[~data_no_nans_no_misclassified_no_red.out_class.isin(classes_below_yellow_lim)].copy()







In [9]:
txt = '\n\nResults/Recommendations:\n'
out_text = out_text + '\n' + txt
print(txt)





Results/Recommendations:



In [10]:

txt = 'Data after removing Red Rules defying data: ' + str(len(data_no_nans_no_misclassified_no_red))  + '/' + str(len(data_all)) + ', ' + str(len(data_no_nans_no_misclassified_no_red)/len(data_all)*100) + '%'
out_text = out_text + '\n' + txt
print(txt)

txt = 'Data after removing Yellow and Red Rules defying data: ' + str( len(data_no_nans_no_misclassified_no_yelllow)) + '/' + str(len(data_all))+ ', '+ str(len(data_no_nans_no_misclassified_no_yelllow)/len(data_all)*100)+'%'
out_text = out_text + '\n' + txt
print(txt)


Data after removing Red Rules defying data: 6378/7018, 90.8805927614705%
Data after removing Yellow and Red Rules defying data: 5012/7018, 71.41635793673412%


In [11]:

txt = 'Classes after removing Red Rules defying data: ' + str(len(data_no_nans_no_misclassified_no_red.out_class.unique())) + '/' + str(len(data_all.out_class.unique()))+ ', ' + str(len(data_no_nans_no_misclassified_no_red.out_class.unique())/len(data_all.out_class.unique())*100)+ '%'
out_text = out_text + '\n' + txt
print(txt)

txt = 'Classes after removing Yellow and Red Rules defying data: ' + str(len(data_no_nans_no_misclassified_no_yelllow.out_class.unique())) +'/'+str(len(data_all.out_class.unique()))+', '+str(len(data_no_nans_no_misclassified_no_yelllow.out_class.unique())/len(data_all.out_class.unique())*100)+ '%'
out_text = out_text + '\n' + txt
print(txt)


Classes after removing Red Rules defying data: 33/110, 30.0%
Classes after removing Yellow and Red Rules defying data: 14/110, 12.727272727272727%


In [None]:

x = (set(data_all.out_class.unique()) - set(data_no_nans_no_misclassified_no_red.out_class.unique()))
txt = '\n\nClasses failing Red Rules are : \n' + str(x)
out_text = out_text + '\n' + txt
print(txt)



x = ( set(data_no_nans_no_misclassified_no_red.out_class.unique()) - set(data_no_nans_no_misclassified_no_yelllow.out_class.unique()))
txt = '\n\nClasses Failing Yellow and Red Rules are : \n' + str(x)
out_text = out_text + '\n' + txt
print(txt)


x = (set(data_no_nans_no_misclassified_no_yelllow.out_class.unique()))
txt = '\n\nClasses Passing Red and Yellow Rules are : \n'+ str(x)
out_text = out_text + '\n' + txt
print(txt)


In [None]:
print(out_text)