In [47]:
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import re
import math
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold 
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import math
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings("ignore")

from mlxtend.classifier import StackingClassifier

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
def get_gv_fea_dict(alpha, feature, df, train_df):
    """Build the response-coding lookup table for one categorical column.

    For every distinct value of ``train_df[feature]`` a 9-element list is
    produced, one entry per class label 1..9, holding the smoothed ratio
    ``(rows with this value and this class + 10*alpha) / (rows with this value + 90*alpha)``.

    Parameters
    ----------
    alpha : number
        Smoothing strength.
    feature : str
        Column name in ``train_df`` to encode.
    df : any
        Not used; kept so the signature matches ``get_gv_feature``.
    train_df : pandas.DataFrame
        Frame with the ``feature`` column and a ``Class`` column.

    Returns
    -------
    dict
        Maps each distinct feature value to its list of 9 smoothed ratios.
    """
    category_totals = train_df[feature].value_counts()
    class_labels = train_df['Class']
    feature_values = train_df[feature]

    encoded = {}
    for category, total in category_totals.items():
        in_category = feature_values == category
        encoded[category] = [
            (train_df.loc[(class_labels == label) & in_category].shape[0] + alpha*10) / (total + 90*alpha)
            for label in range(1, 10)
        ]
    return encoded

In [3]:
def get_gv_feature(alpha, feature, df, train_df):
    """Response-code one categorical column of ``df`` using statistics from ``train_df``.

    Parameters
    ----------
    alpha : number
        Smoothing strength forwarded to ``get_gv_fea_dict``.
    feature : str
        Column to encode; must exist in both ``df`` and ``train_df``.
    df : pandas.DataFrame
        Rows to encode.
    train_df : pandas.DataFrame
        Training rows the per-class ratios are computed from.

    Returns
    -------
    list of list
        One 9-element list per row of ``df``, in row order. Values that never
        occur in ``train_df[feature]`` get the uniform vector ``[1/9] * 9``.
    """
    gv_dict = get_gv_fea_dict(alpha, feature, df, train_df)

    # gv_dict is keyed by exactly the values seen in train_df[feature], so a
    # single dict lookup replaces rebuilding dict(value_counts) for every row.
    gv_fea = []
    for value in df[feature]:
        if value in gv_dict:
            gv_fea.append(gv_dict[value])
        else:
            gv_fea.append([1/9] * 9)
    return gv_fea

In [4]:
def extract_dictionary_paddle(cls_text):
    """Count whitespace-separated tokens across the ``TEXT`` column of ``cls_text``.

    Returns a ``defaultdict(int)`` mapping each token to the number of times
    it appears over all rows; unseen tokens read back as 0.
    """
    word_counts = defaultdict(int)
    for document in cls_text['TEXT']:
        for token in document.split():
            word_counts[token] = word_counts[token] + 1
    return word_counts

In [5]:
def get_text_responsecoding(df, dict_list, total_dict):
    """Response-code the ``TEXT`` column of ``df`` into a ``(rows, 9)`` array.

    Entry ``[r, c]`` is the geometric mean, over the tokens of row ``r``, of
    ``(dict_list[c].get(token, 0) + 10) / (total_dict.get(token, 0) + 90)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``TEXT`` column.
    dict_list : sequence of mapping
        Token-count mappings; the first 9 entries are used, one per column.
    total_dict : mapping
        Token counts used in the denominator.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(df.shape[0], 9)``. A row whose text has no tokens is
        filled with 1.0 (the value of an empty geometric mean) instead of
        raising ``ZeroDivisionError``.
    """
    n_classes = 9
    # Split every document once; the original re-split it for each class.
    tokenised = [text.split() for text in df['TEXT']]

    text_feature_responseCoding = np.zeros((len(tokenised), n_classes))
    for row_index, words in enumerate(tokenised):
        if not words:
            # No tokens: nothing to average, so stay neutral across classes.
            text_feature_responseCoding[row_index, :] = 1.0
            continue
        for cls in range(n_classes):
            cls_counts = dict_list[cls]
            sum_prob = 0
            for word in words:
                sum_prob += math.log((cls_counts.get(word, 0) + 10) / (total_dict.get(word, 0) + 90))
            # exp(mean(log p)) == geometric mean of the per-token ratios
            text_feature_responseCoding[row_index][cls] = math.exp(sum_prob / len(words))
    return text_feature_responseCoding

In [6]:
def get_intersec_text(df, train_text_features=None):
    """Measure how much of ``df``'s text vocabulary also occurs in the training vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TEXT`` column.
    train_text_features : iterable of str, optional
        Training vocabulary to compare against. When omitted, a module-level
        variable named ``train_text_features`` is used, which is what this
        function originally relied on implicitly.

    Returns
    -------
    (int, int)
        Number of distinct words in ``df['TEXT']`` that pass ``min_df=3``, and
        how many of those are also in ``train_text_features``.

    Raises
    ------
    NameError
        If no training vocabulary is passed and none exists at module level.
    """
    if train_text_features is None:
        train_text_features = globals().get('train_text_features')
        if train_text_features is None:
            raise NameError(
                "train_text_features is not defined; pass it explicitly to get_intersec_text()")

    df_text_vec = CountVectorizer(min_df=3)
    df_text_vec.fit(df['TEXT'])
    # Newer scikit-learn releases expose get_feature_names_out(); older ones
    # only have get_feature_names(). Use whichever this installation provides.
    if hasattr(df_text_vec, 'get_feature_names_out'):
        df_text_features = df_text_vec.get_feature_names_out()
    else:
        df_text_features = df_text_vec.get_feature_names()

    df_vocabulary = set(df_text_features)
    len1 = len(df_vocabulary)
    len2 = len(set(train_text_features) & df_vocabulary)
    return len1, len2


In [7]:
def predict_something(train_x, train_y,test_x, test_y, clf):
    """Fit ``clf``, calibrate it with a sigmoid, and score it on the test split.

    Returns a pair ``(log_loss_value, misclassified_fraction)`` where the
    second item is the count of wrong test predictions divided by the number
    of test points.
    """
    clf.fit(train_x, train_y)
    calibrated = CalibratedClassifierCV(clf, method="sigmoid")
    calibrated.fit(train_x, train_y)

    predictions = calibrated.predict(test_x)
    # log_loss is computed from the per-class probability array
    class_probabilities = calibrated.predict_proba(test_x)
    log_loss_value = log_loss(test_y, class_probabilities)

    # a non-zero difference between predicted and true label marks an error
    wrong_count = np.count_nonzero(predictions - test_y)
    misclassified_fraction = wrong_count / test_y.shape[0]

    return log_loss_value, misclassified_fraction

In [8]:
def report_log_loss(train_x, train_y, test_x, test_y,  clf):
    """Fit ``clf``, wrap it in a sigmoid calibrator, and return the test log loss."""
    clf.fit(train_x, train_y)

    calibrated = CalibratedClassifierCV(clf, method="sigmoid")
    calibrated.fit(train_x, train_y)

    test_probabilities = calibrated.predict_proba(test_x)
    return log_loss(test_y, test_probabilities, eps=1e-15)

In [9]:
def get_impfeature_names(indices, text, gene, var, no_features, train_df):
    """Count how many of the given feature indices are present in one data point.

    The index space is the concatenation gene vocabulary | variation
    vocabulary | text vocabulary (``min_df=3``), each fitted on ``train_df``.

    Parameters
    ----------
    indices : iterable of int
        Feature indices to check.
    text, gene, var : str
        The ``TEXT``, ``Gene`` and ``Variation`` values of the data point.
    no_features : int
        Returned unchanged as the first element of the result.
    train_df : pandas.DataFrame
        Training frame with ``Gene``, ``Variation`` and ``TEXT`` columns.

    Returns
    -------
    (int, int)
        ``no_features`` and the number of indices whose feature word equals
        ``gene``, equals ``var``, or occurs in ``text.split()``.
    """
    def _vocabulary(column, **vectorizer_kwargs):
        # Fit a vectorizer on one training column and return its feature names.
        vectorizer = CountVectorizer(**vectorizer_kwargs).fit(train_df[column])
        # Newer scikit-learn exposes get_feature_names_out(); older releases
        # only have get_feature_names().
        if hasattr(vectorizer, 'get_feature_names_out'):
            return list(vectorizer.get_feature_names_out())
        return list(vectorizer.get_feature_names())

    # Feature names are looked up once here instead of once per index.
    gene_names = _vocabulary('Gene')
    var_names = _vocabulary('Variation')
    text_names = _vocabulary('TEXT', min_df=3)

    gene_end = len(gene_names)
    var_end = gene_end + len(var_names)
    text_words = None  # built lazily: only needed when an index is a text feature

    word_present = 0
    for v in indices:
        if v < gene_end:
            hit = gene_names[v] == gene
        elif v < var_end:
            hit = var_names[v - gene_end] == var
        else:
            if text_words is None:
                text_words = set(text.split())
            hit = text_names[v - var_end] in text_words
        if hit:
            word_present += 1

    return no_features, word_present

# The Magic Function: CSV to Encoded Train/Test/CV Features

In [10]:
def convert_csv_to_something_good(dataset, type_of_output = 'Onehotencoding', random_state=42):
    """Load a labelled CSV, split it into train/test/cv, and encode the features.

    The CSV is split 80/20 into (train + cv) / test and the first part again
    80/20 into train / cv, both stratified on ``Class``. The ``Gene``,
    ``Variation`` and ``TEXT`` columns are then encoded and stacked side by
    side in that order.

    Parameters
    ----------
    dataset : str or file-like
        Anything ``pandas.read_csv`` accepts. Must contain ``Class``,
        ``Gene``, ``Variation`` and ``TEXT`` columns.
    type_of_output : {'Onehotencoding', 'responseCoding'}
        ``'Onehotencoding'`` returns sparse CSR count matrices (text columns
        normalised per split); ``'responseCoding'`` returns dense arrays with
        9 columns per feature.
    random_state : int or None, default 42
        Seed for both splits. A fixed seed makes repeated calls return the
        same split, so matrices obtained from separate calls (one per
        encoding) stay row-aligned with each other and with the returned
        labels and frames. Pass ``None`` for a fresh random split.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y, cv_x, cv_y, train_df, test_df)``.

    Raises
    ------
    ValueError
        If ``type_of_output`` is not one of the two supported names.
    """
    if type_of_output not in ('Onehotencoding', 'responseCoding'):
        # Fail before the expensive work rather than silently returning None.
        raise ValueError(
            "type_of_output must be 'Onehotencoding' or 'responseCoding', got %r" % (type_of_output,))

    result = pd.read_csv(dataset)
    y_true = result['Class'].values

    # Both splits keep the class distribution of their input (stratify).
    X_train, test_df, y_train, y_test = train_test_split(
        result, y_true, stratify=y_true, test_size=0.2, random_state=random_state)
    train_df, cv_df, y_train, y_cv = train_test_split(
        X_train, y_train, stratify=y_train, test_size=0.2, random_state=random_state)

    train_y = np.array(list(train_df['Class']))
    test_y = np.array(list(test_df['Class']))
    cv_y = np.array(list(cv_df['Class']))

    # Only the requested encoding is built; the other one was discarded anyway.
    if type_of_output == 'Onehotencoding':
        train_x, test_x, cv_x = _onehot_matrices(train_df, test_df, cv_df)
    else:
        train_x, test_x, cv_x = _response_matrices(train_df, test_df, cv_df)

    return train_x, train_y, test_x, test_y, cv_x, cv_y, train_df, test_df


def _onehot_matrices(train_df, test_df, cv_df):
    """Return CSR count matrices (train, test, cv); vectorizers are fitted on ``train_df`` only."""
    gene_vectorizer = CountVectorizer()
    variation_vectorizer = CountVectorizer()
    text_vectorizer = CountVectorizer(min_df=3)

    train_parts = (
        gene_vectorizer.fit_transform(train_df['Gene']),
        variation_vectorizer.fit_transform(train_df['Variation']),
        normalize(text_vectorizer.fit_transform(train_df['TEXT']), axis=0),
    )
    matrices = [hstack(train_parts).tocsr()]

    for frame in (test_df, cv_df):
        parts = (
            gene_vectorizer.transform(frame['Gene']),
            variation_vectorizer.transform(frame['Variation']),
            # the text counts of each split are normalised per column within that split
            normalize(text_vectorizer.transform(frame['TEXT']), axis=0),
        )
        matrices.append(hstack(parts).tocsr())
    return matrices


def _response_matrices(train_df, test_df, cv_df, alpha=1):
    """Return dense response-coded arrays (train, test, cv) using statistics from ``train_df``."""
    # One token-count dictionary per class label 1..9, plus one over all training rows.
    dict_list = [
        extract_dictionary_paddle(train_df[train_df['Class'] == label])
        for label in range(1, 10)
    ]
    total_dict = extract_dictionary_paddle(train_df)

    matrices = []
    for frame in (train_df, test_df, cv_df):
        gene = np.array(get_gv_feature(alpha, "Gene", frame, train_df))
        variation = np.array(get_gv_feature(alpha, "Variation", frame, train_df))
        text = get_text_responsecoding(frame, dict_list, total_dict)
        # https://stackoverflow.com/a/16202486 - rescale every row so it sums to 1
        text = (text.T / text.sum(axis=1)).T
        matrices.append(np.hstack((gene, variation, text)))
    return matrices

In [11]:
# Build the one-hot encoded train/test/cv matrices, their labels, and the train/test frames from results.csv.
train_x_onehotCoding, train_y, test_x_onehotCoding, test_y, cv_x_onehotCoding, cv_y, train_df, test_df = convert_csv_to_something_good('results.csv', type_of_output = 'Onehotencoding')

In [12]:
# Build the response-coded matrices from the same CSV. This call rebinds train_y, test_y, cv_y, train_df and
# test_df, so they stay row-aligned with the one-hot matrices only if both calls produce the same split.
train_x_responseCoding, train_y, test_x_responseCoding, test_y, cv_x_responseCoding, cv_y, train_df, test_df = convert_csv_to_something_good('results.csv', type_of_output = 'responseCoding')

# Naive Bayes

In [63]:
def NaiveBayes(train_x_onehotCoding, train_y, testing_dataset_x, testing_dataset_y, test_point_index,):
    """Train a sigmoid-calibrated multinomial Naive Bayes model and classify one point.

    Returns ``(predicted_cls, predicted_class_probabilities, actual_class)``:
    the prediction array for row ``test_point_index`` of ``testing_dataset_x``,
    its class probabilities rounded to 4 decimals, and the matching entry of
    ``testing_dataset_y``.
    """
    base_model = MultinomialNB(alpha=0.1)
    base_model.fit(train_x_onehotCoding, train_y)
    calibrated = CalibratedClassifierCV(base_model, method="sigmoid")
    calibrated.fit(train_x_onehotCoding, train_y)

    query_point = testing_dataset_x[test_point_index]
    predicted_cls = calibrated.predict(query_point)
    predicted_class_probabilities = np.round(calibrated.predict_proba(query_point), 4)

    return predicted_cls, predicted_class_probabilities, testing_dataset_y[test_point_index]

# K Nearest Neighbour Classification

In [69]:
def KNearestNeighbour(train_x_responseCoding, train_y, testing_dataset_x, testing_dataset_y, test_point_index):
    """Classify one point with a sigmoid-calibrated 5-nearest-neighbour model.

    Parameters
    ----------
    train_x_responseCoding, train_y
        Training features and labels; ``train_y`` must support array indexing.
    testing_dataset_x, testing_dataset_y
        Features and labels of the dataset the query point is taken from.
    test_point_index : int
        Row of ``testing_dataset_x`` to classify.

    Returns
    -------
    tuple
        ``(predicted_class, actual_class, nearest_neighbours)`` where
        ``nearest_neighbours`` holds the training labels of the 5 closest
        training points.
    """
    n_neighbors = 5
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(train_x_responseCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_responseCoding, train_y)

    # Classify the requested row; row 0 used to be classified regardless of
    # test_point_index while the neighbours were looked up for the right row.
    query_point = testing_dataset_x[test_point_index].reshape(1, -1)
    predicted_class = sig_clf.predict(query_point)[0]
    actual_class = testing_dataset_y[test_point_index]

    # kneighbors returns (distances, indices); the indices select training labels
    neighbors = clf.kneighbors(query_point, n_neighbors)
    nearest_neighbours = train_y[neighbors[1][0]]

    return predicted_class, actual_class, nearest_neighbours

In [None]:
# KNearestNeighbour returns three values; the label frequencies are derived here.
predicted_class, actual_class, nearest_neighbours = KNearestNeighbour(train_x_responseCoding, train_y, test_x_responseCoding, test_y, 100)
frequency_of_nearest_points = Counter(nearest_neighbours)

# Logistic Regression

In [85]:
def LogisticRegression_balanced(train_x_onehotCoding, train_y, testing_dataset_x, testing_dataset_y, test_point_index):
    """Train a class-weight-balanced SGD logistic regression and classify one point.

    Returns ``(predicted_class, predicted_class_probabilities, actual_class)``
    for row ``test_point_index``; probabilities are rounded to 4 decimals.
    """
    model = SGDClassifier(loss='log', penalty='l2', alpha=0.001, class_weight='balanced', random_state=42)
    model.fit(train_x_onehotCoding, train_y)

    query_point = testing_dataset_x[test_point_index]
    predicted_class = model.predict(query_point)[0]
    predicted_class_probabilities = np.round(model.predict_proba(query_point), 4)

    return predicted_class, predicted_class_probabilities, testing_dataset_y[test_point_index]

In [95]:
def LogisticRegression_unbalanced(train_x_onehotCoding, train_y, testing_dataset_x, testing_dataset_y, test_point_index):
    """Train an SGD logistic regression without class weights and classify one point.

    Returns ``(predicted_class, predicted_class_probabilities, actual_class)``
    for row ``test_point_index``; probabilities are rounded to 4 decimals.
    """
    model = SGDClassifier(loss='log', penalty='l2', alpha=0.001, random_state=42)
    model.fit(train_x_onehotCoding, train_y)

    query_point = testing_dataset_x[test_point_index]
    predicted_class = model.predict(query_point)[0]
    predicted_class_probabilities = np.round(model.predict_proba(query_point), 4)

    return predicted_class, predicted_class_probabilities, testing_dataset_y[test_point_index]

# Linear Support Vector Machines

In [106]:
def LinearSupportVectorMachines(train_x_onehotCoding, train_y, testing_dataset_x, testing_dataset_y, test_point_index):
    """Train a hinge-loss SGD classifier (linear SVM) and classify one point.

    Returns ``(predicted_class, actual_class)`` for row ``test_point_index``.
    """
    model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.001, random_state=42)
    model.fit(train_x_onehotCoding, train_y)

    query_point = testing_dataset_x[test_point_index]
    return model.predict(query_point)[0], testing_dataset_y[test_point_index]

In [107]:
# LogisticRegression_unbalanced takes five arguments and returns three values.
predicted_class, predicted_class_probabilities, actual_class = LogisticRegression_unbalanced(train_x_onehotCoding, train_y, test_x_onehotCoding, test_y, 2)

# Random Forest Classifier

In [112]:
def Random_Forest_Classifier_OneHotEncoding(train_x_onehotCoding, train_y, testing_dataset_x, testing_dataset_y, test_point_index):
    """Train a 500-tree random forest on one-hot features and classify one point.

    Parameters
    ----------
    train_x_onehotCoding, train_y
        Training features and labels.
    testing_dataset_x, testing_dataset_y
        Features and labels of the dataset the query point is taken from.
    test_point_index : int
        Row of ``testing_dataset_x`` to classify.

    Returns
    -------
    tuple
        ``(predicted_class, actual_class)``.
    """
    clf = RandomForestClassifier(n_estimators=500 , criterion='gini', max_depth=10, random_state=42, n_jobs=-1)
    clf.fit(train_x_onehotCoding, train_y)

    # The returned label comes from the forest itself. A sigmoid-calibrated
    # wrapper used to be fitted here too, but its prediction was overwritten
    # before use, so that extra training is dropped.
    predicted_class = clf.predict(testing_dataset_x[test_point_index])[0]
    actual_class = testing_dataset_y[test_point_index]
    return predicted_class, actual_class

In [None]:
# Random_Forest_Classifier_OneHotEncoding takes five arguments and returns (predicted_class, actual_class).
predicted_class, actual_class = Random_Forest_Classifier_OneHotEncoding(train_x_onehotCoding, train_y, test_x_onehotCoding, test_y, 2)

In [127]:
def Random_Forest_Classifier_responseCoding(train_x_onehotCoding, train_y, testing_dataset_x, testing_dataset_y, test_point_index):
    """Train a sigmoid-calibrated random forest on response-coded features and classify one point.

    Parameters
    ----------
    train_x_onehotCoding
        Training feature matrix. Despite the name it should be the
        response-coded matrix; see the fallback below.
    train_y
        Training labels.
    testing_dataset_x, testing_dataset_y
        Response-coded features and labels of the dataset the query point is
        taken from.
    test_point_index : int
        Row of ``testing_dataset_x`` to classify.

    Returns
    -------
    tuple
        ``(predicted_class, actual_class)``.
    """
    train_x = train_x_onehotCoding
    if train_x.shape[1] != testing_dataset_x.shape[1]:
        # The caller passed a matrix from a different encoding (for example a
        # one-hot matrix). Fall back to the module-level response-coded
        # training matrix, which this function previously always used.
        train_x = train_x_responseCoding

    clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5, random_state=42, n_jobs=-1)
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)

    # Use the dataset that was passed in, not the global test split.
    query_point = testing_dataset_x[test_point_index].reshape(1, -1)
    predicted_class = sig_clf.predict(query_point)[0]
    actual_class = testing_dataset_y[test_point_index]

    return predicted_class, actual_class

In [None]:
# Random_Forest_Classifier_responseCoding takes five arguments and returns (predicted_class, actual_class).
# Index 2 matches the other demo calls in this notebook.
predicted_class, actual_class = Random_Forest_Classifier_responseCoding(train_x_responseCoding, train_y, test_x_responseCoding, test_y, 2)

In [105]:
# LinearSupportVectorMachines returns (predicted_class, actual_class); the hinge-loss model has no probabilities.
predicted_cls, actual_class = LinearSupportVectorMachines(train_x_onehotCoding, train_y, train_x_onehotCoding, train_y, 1)

In [128]:
def output(Dataset,Test_Index,No_of_features,Algorithms):
    """Gradio callback: classify one data point with the chosen algorithm.

    Parameters
    ----------
    Dataset : {'Dataset1', 'Dataset2', 'Dataset3'}
        Selects the train, test, or cross-validation split respectively.
    Test_Index : number
        Row of the selected split to classify; converted to ``int`` because
        slider widgets may deliver a float.
    No_of_features : number
        Accepted for interface compatibility; not used.
    Algorithms : str
        One of the algorithm names offered by the radio input.

    Returns
    -------
    tuple
        ``(predicted_class, actual_class)`` as scalars.

    Raises
    ------
    ValueError
        If ``Dataset`` or ``Algorithms`` is not a known choice (including
        ``None`` when nothing was selected).
    """
    if Dataset == 'Dataset1':
        onehot_x, response_x, labels = train_x_onehotCoding, train_x_responseCoding, train_y
    elif Dataset == 'Dataset2':
        onehot_x, response_x, labels = test_x_onehotCoding, test_x_responseCoding, test_y
    elif Dataset == 'Dataset3':
        onehot_x, response_x, labels = cv_x_onehotCoding, cv_x_responseCoding, cv_y
    else:
        raise ValueError("Unknown dataset: %r" % (Dataset,))

    index = int(Test_Index)

    # Each classifier returns a differently shaped tuple; unpack each one explicitly.
    if Algorithms == "Naive Bayes":
        predicted_cls, _probabilities, actual_class = NaiveBayes(train_x_onehotCoding, train_y, onehot_x, labels, index)
    elif Algorithms == "K Nearest Neighbour Classification":
        # returns (predicted, actual, neighbour labels), not probabilities
        predicted_cls, actual_class, _nearest = KNearestNeighbour(train_x_responseCoding, train_y, response_x, labels, index)
    elif Algorithms == "Logistic Regression Balanced":
        predicted_cls, _probabilities, actual_class = LogisticRegression_balanced(train_x_onehotCoding, train_y, onehot_x, labels, index)
    elif Algorithms == "Logistic Regression Unbalanced":
        predicted_cls, _probabilities, actual_class = LogisticRegression_unbalanced(train_x_onehotCoding, train_y, onehot_x, labels, index)
    elif Algorithms == "Linear Support Vector Machines":
        predicted_cls, actual_class = LinearSupportVectorMachines(train_x_onehotCoding, train_y, onehot_x, labels, index)
    elif Algorithms == "Random Forest Classifier OneHotEncoded":
        predicted_cls, actual_class = Random_Forest_Classifier_OneHotEncoding(train_x_onehotCoding, train_y, onehot_x, labels, index)
    elif Algorithms == "Random Forest Classifier ResponseCoded":
        # the response-coded forest must be trained on response-coded features
        predicted_cls, actual_class = Random_Forest_Classifier_responseCoding(train_x_responseCoding, train_y, response_x, labels, index)
    else:
        raise ValueError("Unknown algorithm: %r" % (Algorithms,))

    # NaiveBayes returns a length-1 array while the others return a scalar;
    # normalise so the textbox always shows a bare class label.
    predicted_cls = np.ravel(predicted_cls)[0]
    return predicted_cls, actual_class




In [129]:
# Gradio UI: dataset dropdown, two sliders and an algorithm picker feed `output`,
# whose two return values are shown in two textboxes.
iface = gr.Interface(
    fn=output,
    inputs=[
        gr.inputs.Dropdown(choices=['Dataset1','Dataset2','Dataset3'], type="value", label='Dataset'),
        gr.inputs.Slider(minimum=0, maximum=100, step=1, default=0, label='Test_Index'),
        gr.inputs.Slider(minimum=0, maximum=100, step=1, default=0, label='No_of_features'),
        gr.inputs.Radio(
            [
                "Naive Bayes",
                "K Nearest Neighbour Classification",
                "Logistic Regression Balanced",
                "Logistic Regression Unbalanced",
                "Linear Support Vector Machines",
                "Random Forest Classifier OneHotEncoded",
                "Random Forest Classifier ResponseCoded",
            ],
            label="Algorithms",
        ),
    ],
    outputs=[
        gr.outputs.Textbox(label="Classifed Class"),
        gr.outputs.Textbox(label="Actual Class"),
    ],
)

In [130]:
 # Start the Gradio server; the recorded output below shows it serving locally and loading the interface inline.
 iface.launch()

Running locally at: http://127.0.0.1:7889/
To create a public link, set `share=True` in `launch()`.
Interface loading below...


(<Flask 'gradio.networking'>, 'http://127.0.0.1:7889/', None)