In [4]:
import pandas as pd
import numpy as np
import string
import scipy
from collections import Counter

 ## first step read the data

In [5]:
train_data = pd.read_csv("medical_dataset/train.csv")
train_data.head()

Unnamed: 0,label,text
0,2,"2-D STUDY,1. Mild aortic stenosis, widely calc..."
1,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha..."
2,2,"CHIEF COMPLAINT:, The patient comes for three..."
3,1,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi..."
4,2,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu..."


In [6]:
test_data = pd.read_csv("medical_dataset/test.csv")
test_data.head()

Unnamed: 0,label,text
0,2,"ADMISSION DIAGNOSES: ,Fracture of the right f..."
1,1,"PREOPERATIVE DIAGNOSIS:, Plantar fascitis, le..."
2,2,"ADMISSION DIAGNOSIS: , Microinvasive carcinoma..."
3,4,"PREOPERATIVE DIAGNOSIS:, Severe degenerative ..."
4,1,"DIAGNOSIS: , Left breast adenocarcinoma stage ..."


In [7]:
valid_data = pd.read_csv("medical_dataset/valid.csv")
valid_data.head()

Unnamed: 0,label,text
0,2,"S - ,This patient has reoccurring ingrown infe..."
1,3,"REASON FOR ADMISSION:, Intraperitoneal chemot..."
2,1,"PREOPERATIVE DIAGNOSES: , Bilateral cleft lip ..."
3,2,"HISTORY OF PRESENT ILLNESS:, Ms. A is a 55-ye..."
4,3,"PROCEDURE: , Right L5-S1 intralaminar epidural..."


### preprocessing the data in 2 steps: 1- lowercase the words 2- remove punctuation from the words 

In [8]:
def preprocessing(data):
    data["text"] = data['text'].str.replace(r'[^\w\s]+', ' ').str.lower()
    return data

In [9]:
train_data = preprocessing(train_data)
print("Train data after pre-processing:")
train_data.head()

Train data after pre-processing:


Unnamed: 0,label,text
0,2,2 d study 1 mild aortic stenosis widely calc...
1,1,preoperative diagnoses dysphagia and esopha...
2,2,chief complaint the patient comes for three ...
3,1,procedure bilateral l5 s1 s2 and s3 radi...
4,2,discharge diagnoses 1 chronic obstructive pul...


In [10]:
test_data = preprocessing(test_data)
print("Test data after pre-processing:")
test_data.head()

Test data after pre-processing:


Unnamed: 0,label,text
0,2,admission diagnoses fracture of the right f...
1,1,preoperative diagnosis plantar fascitis lef...
2,2,admission diagnosis microinvasive carcinoma...
3,4,preoperative diagnosis severe degenerative j...
4,1,diagnosis left breast adenocarcinoma stage ...


In [11]:
valid_data = preprocessing(valid_data)
print("Valid data after pre-processing:")
valid_data.head()

Valid data after pre-processing:


Unnamed: 0,label,text
0,2,s this patient has reoccurring ingrown infe...
1,3,reason for admission intraperitoneal chemoth...
2,1,preoperative diagnoses bilateral cleft lip ...
3,2,history of present illness ms a is a 55 yea...
4,3,procedure right l5 s1 intralaminar epidural...


# create word vocabulary

 # 1

In [12]:
words = [word for sentence in train_data['text'].str.split().tolist() for word in sentence]
words_top = Counter(words).most_common(10000)
words_top_pd = pd.DataFrame(words_top)
unique_words = list(words_top_pd[0])
print("The first five unique words:", unique_words[:5])

The first five unique words: ['the', 'and', 'was', 'of', 'to']


# submission the unique words

In [13]:
with open('medical text-vocab.txt', 'w') as f:
    for i, words in enumerate(words_top, start = 1):
        f.write(str(words[0]) + "\t" + "\t" + str(i) + "\t" + str(words[1]) + "\n")

# Submission the valid, test and train dataset

In [14]:
def submission(data, unique_word_dic, dataset_name):
    data_split = data['text'].str.split().tolist()
    with open('medical text-' + dataset_name + '.txt', 'w') as f:
        for i, sentence in enumerate(data_split):
            f.write(' '.join([str(unique_word_dic[word]) for word in sentence if word in unique_word_dic]) +'\t' + str(data['label'][i]) +'\n')
            
        
        

In [15]:
unique_word_dic = {word: i for i, word in enumerate(unique_words, start = 1)}

In [16]:
submission(train_data, unique_word_dic, 'train')
submission(test_data, unique_word_dic, 'test')
submission(valid_data, unique_word_dic, 'valid')

# Binary Bag of words

In [17]:
def bbow(data, unique_words):
    dataset_bbow = np.zeros((data.shape[0], len(unique_words)))
    data_split = data['text'].str.split().tolist()
    for i, sentence in enumerate(data_split):
        for index, words in enumerate(unique_words):
            if unique_words[index] in data_split[i]:
                dataset_bbow[i][index] = 1
            else:
                dataset_bbow[i][index] = 0
    
    
    
    return dataset_bbow

In [18]:
train_data_bbow = bbow(train_data, unique_words)
pd.DataFrame(train_data_bbow).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
valid_data_bbow = bbow(valid_data, unique_words)
pd.DataFrame(valid_data_bbow).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
test_data_bbow = bbow(test_data, unique_words)
pd.DataFrame(test_data_bbow).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Frequency Bag of words

In [23]:
def fbow(data, unique_words):
    dataset_fbow = np.zeros((data.shape[0], len(unique_words)))
    data_split = data['text'].str.split().tolist()

    words_in_data =np.zeros((len(data_split), len(unique_words)))

  
    for i, sentence in enumerate(data_split):
        for index, words in enumerate(unique_words):
            count = data_split[i].count(words)
            words_in_data[i,:] += int(count)
            dataset_fbow[i][index] = int(count)
    
    
    return dataset_fbow / words_in_data

In [24]:
train_data_fbow = fbow(train_data, unique_words)
pd.DataFrame(train_data_fbow)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.008850,0.017699,0.000000,0.008850,0.008850,0.000000,0.008850,0.017699,0.008850,0.000000,0.008850,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.026549,0.000000,0.035398,0.000000,0.000000,0.000000,0.000000,0.053097,0.035398,0.000000,0.008850,0.000000,0.00000,0.000000,0.044248,0.000000,0.000000,0.017699,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.125436,0.031359,0.055749,0.031359,0.020906,0.013937,0.010453,0.003484,0.000000,0.003484,0.003484,0.000000,0.000000,0.000000,0.006969,0.000000,0.006969,0.017422,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003484,0.000000,0.003484,0.003484,0.000000,0.006969,0.000000,0.00000,0.003484,0.024390,0.010453,0.003484,0.003484,0.000000,0.003484,0.003484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.016588,0.035545,0.004739,0.011848,0.014218,0.018957,0.009479,0.009479,0.037915,0.002370,0.014218,0.059242,0.011848,0.000000,0.000000,0.002370,0.002370,0.000000,0.002370,0.000000,0.004739,0.000000,0.028436,0.004739,0.011848,0.021327,0.000000,0.000000,0.004739,0.000000,0.009479,0.00000,0.002370,0.002370,0.000000,0.011848,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.091408,0.042048,0.038391,0.021938,0.027422,0.012797,0.020110,0.010969,0.000000,0.020110,0.003656,0.000000,0.016453,0.000000,0.010969,0.001828,0.001828,0.009141,0.001828,0.000000,0.007313,0.000000,0.000000,0.005484,0.001828,0.000000,0.003656,0.010969,0.000000,0.000000,0.000000,0.00000,0.001828,0.001828,0.020110,0.000000,0.000000,0.001828,0.000000,0.001828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.027094,0.032020,0.017241,0.049261,0.032020,0.007389,0.027094,0.007389,0.009852,0.004926,0.000000,0.019704,0.007389,0.000000,0.002463,0.019704,0.000000,0.009852,0.000000,0.000000,0.004926,0.002463,0.000000,0.000000,0.004926,0.017241,0.014778,0.007389,0.007389,0.000000,0.004926,0.00000,0.014778,0.000000,0.000000,0.000000,0.004926,0.000000,0.002463,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.058333,0.033333,0.016667,0.008333,0.008333,0.000000,0.050000,0.025000,0.008333,0.025000,0.008333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008333,0.000000,0.008333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.008333,0.00000,0.000000,0.008333,0.025000,0.008333,0.000000,0.000000,0.000000,0.008333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996,0.089820,0.041916,0.047904,0.011976,0.041916,0.011976,0.005988,0.023952,0.005988,0.005988,0.023952,0.000000,0.005988,0.000000,0.005988,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005988,0.000000,0.00000,0.005988,0.017964,0.017964,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3997,0.104141,0.026349,0.053952,0.027604,0.022585,0.027604,0.010038,0.016311,0.001255,0.012547,0.001255,0.000000,0.007528,0.000000,0.013802,0.003764,0.002509,0.002509,0.002509,0.000000,0.002509,0.006274,0.001255,0.001255,0.000000,0.000000,0.006274,0.003764,0.000000,0.000000,0.000000,0.00000,0.003764,0.000000,0.008783,0.001255,0.000000,0.001255,0.006274,0.011292,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3998,0.027328,0.035425,0.008097,0.016194,0.019231,0.024291,0.013158,0.014170,0.009109,0.017206,0.010121,0.001012,0.015182,0.033401,0.002024,0.004049,0.005061,0.006073,0.000000,0.000000,0.014170,0.001012,0.012146,0.002024,0.010121,0.000000,0.005061,0.001012,0.006073,0.001012,0.007085,0.01417,0.001012,0.006073,0.000000,0.003036,0.000000,0.000000,0.004049,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
test_data_fbow = fbow(test_data, unique_words)
pd.DataFrame(test_data_fbow).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.048673,0.039823,0.022124,0.039823,0.026549,0.004425,0.00885,0.004425,0.004425,0.022124,0.0,0.0,0.013274,0.004425,0.004425,0.013274,0.0,0.0,0.0,0.022124,0.0,0.0,0.0,0.004425,0.0,0.0,0.0,0.0,0.00885,0.0,0.00885,0.00885,0.0,0.0,0.004425,0.0,0.0,0.0,0.004425,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.090385,0.030769,0.040385,0.034615,0.021154,0.009615,0.021154,0.017308,0.007692,0.015385,0.001923,0.001923,0.007692,0.0,0.003846,0.009615,0.001923,0.0,0.015385,0.0,0.0,0.017308,0.007692,0.007692,0.001923,0.001923,0.0,0.003846,0.001923,0.001923,0.0,0.0,0.003846,0.0,0.005769,0.0,0.001923,0.009615,0.007692,0.003846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.039427,0.032258,0.02509,0.035842,0.007168,0.035842,0.014337,0.010753,0.017921,0.003584,0.0,0.02509,0.010753,0.0,0.0,0.010753,0.007168,0.010753,0.0,0.0,0.0,0.0,0.003584,0.0,0.003584,0.0,0.0,0.003584,0.014337,0.003584,0.003584,0.0,0.003584,0.007168,0.003584,0.0,0.0,0.0,0.007168,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.081186,0.042526,0.064433,0.014175,0.012887,0.014175,0.014175,0.023196,0.002577,0.015464,0.002577,0.003866,0.006443,0.0,0.010309,0.010309,0.009021,0.001289,0.002577,0.011598,0.0,0.001289,0.0,0.016753,0.0,0.003866,0.002577,0.0,0.0,0.003866,0.002577,0.0,0.0,0.0,0.005155,0.0,0.002577,0.001289,0.001289,0.015464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.032258,0.028674,0.003584,0.02509,0.028674,0.007168,0.003584,0.017921,0.010753,0.0,0.014337,0.02509,0.003584,0.0,0.0,0.007168,0.003584,0.003584,0.0,0.003584,0.007168,0.017921,0.014337,0.010753,0.010753,0.02509,0.007168,0.003584,0.0,0.0,0.0,0.0,0.0,0.003584,0.0,0.0,0.007168,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
valid_data_fbow = fbow(valid_data, unique_words)
pd.DataFrame(valid_data_fbow).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.081481,0.044444,0.007407,0.022222,0.007407,0.007407,0.0,0.007407,0.022222,0.007407,0.0,0.0,0.007407,0.007407,0.0,0.051852,0.014815,0.0,0.0,0.014815,0.0,0.022222,0.007407,0.0,0.0,0.0,0.0,0.007407,0.0,0.007407,0.0,0.0,0.0,0.0,0.0,0.007407,0.0,0.007407,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.041169,0.02656,0.015936,0.015936,0.022576,0.015936,0.010624,0.017264,0.01328,0.00664,0.011952,0.015936,0.00664,0.0,0.01328,0.005312,0.001328,0.002656,0.0,0.00664,0.002656,0.010624,0.010624,0.005312,0.002656,0.007968,0.002656,0.00664,0.003984,0.01328,0.009296,0.0,0.002656,0.003984,0.0,0.001328,0.001328,0.001328,0.001328,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.130435,0.036789,0.043478,0.052397,0.024526,0.020067,0.026756,0.012263,0.003344,0.010033,0.0,0.001115,0.003344,0.001115,0.003344,0.007804,0.005574,0.011148,0.010033,0.001115,0.0,0.0,0.001115,0.00223,0.001115,0.001115,0.004459,0.003344,0.001115,0.0,0.0,0.001115,0.00223,0.0,0.012263,0.00223,0.001115,0.007804,0.005574,0.005574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.037846,0.032023,0.0,0.018923,0.020378,0.018923,0.008734,0.014556,0.014556,0.018923,0.008734,0.02329,0.024745,0.001456,0.0,0.002911,0.004367,0.004367,0.0,0.0,0.004367,0.0,0.010189,0.010189,0.002911,0.008734,0.002911,0.001456,0.008734,0.0,0.002911,0.0,0.001456,0.0,0.0,0.0,0.002911,0.001456,0.002911,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.103175,0.035714,0.043651,0.02381,0.031746,0.019841,0.039683,0.007937,0.003968,0.015873,0.0,0.007937,0.003968,0.0,0.003968,0.007937,0.003968,0.007937,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.003968,0.003968,0.007937,0.0,0.0,0.0,0.0,0.0,0.003968,0.019841,0.0,0.003968,0.0,0.003968,0.011905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 2

# 2.a

# F1-score:

In [38]:
from sklearn.metrics import f1_score

In [39]:
def random_classifier(predict):
    return np.random.choice([1,2,3,4], len(predict))

In [40]:
test_random_predict = random_classifier(test_data)
train_random_predict = random_classifier(train_data)
valid_random_predict = random_classifier(valid_data)

In [41]:
f1_score_train = f1_score(train_data['label'], train_random_predict, average='macro')
f1_score_test = f1_score(test_data['label'], test_random_predict, average='macro')
f1_score_valid = f1_score(valid_data['label'], valid_random_predict, average='macro')


print(f'F1-score using random classifier: \n The train accuracy: {f1_score_train} \n The test accuracy: {f1_score_test} \n The valid accuracy: {f1_score_valid} ')


F1-score using random classifier: 
 The train accuracy: 0.2561339828846086 
 The test accuracy: 0.2129027629821903 
 The valid accuracy: 0.265155383029337 


In [42]:
def majority_classifier(predict):
    return np.full(len(predict), scipy.stats.mode(train_data['label'])[0][0])

In [43]:
test_majority_predict = majority_classifier(test_data)
train_majority_predict = majority_classifier(train_data)
valid_majority_predict = majority_classifier(valid_data)

In [44]:
f1_score_train_majority = f1_score(train_data['label'], train_majority_predict, average='macro')
f1_score_test_majority = f1_score(test_data['label'], test_majority_predict, average='macro')
f1_score_valid_majority = f1_score(valid_data['label'], valid_majority_predict, average='macro')


print(f'F1-score using majority classifier: \n The train accuracy: {f1_score_train_majority} \n The test accuracy: {f1_score_test_majority} \n The valid accuracy: {f1_score_valid_majority} ')



F1-score using majority classifier: 
 The train accuracy: 0.120996778472617 
 The test accuracy: 0.14183381088825217 
 The valid accuracy: 0.12424698795180723 


# 2.b

# Naive Bayes

In [45]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV, PredefinedSplit

params = {'alpha': np.arange(0.01, 1.01, 0.01)}



ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
bernoulli_nb_grid = GridSearchCV(BernoulliNB(), params, cv=ps, n_jobs=-1)
train_x = scipy.sparse.vstack([train_data_bbow, valid_data_bbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
bernoulli_nb_grid.fit(train_x, train_y)
print(f'Best params for Bernouli Naive Bayes is: {bernoulli_nb_grid.best_params_}')


f1_score_train_bernouli = f1_score(train_data['label'], bernoulli_nb_grid.best_estimator_.predict(train_data_bbow), average='macro')
f1_score_test_bernouli = f1_score(test_data['label'], bernoulli_nb_grid.best_estimator_.predict(test_data_bbow), average='macro')
f1_score_valid_bernouli = f1_score(valid_data['label'], bernoulli_nb_grid.best_estimator_.predict(valid_data_bbow), average='macro')


print(f'F1-score using Naive Bayes classifier: \n The train accuracy: {f1_score_train_bernouli} \n The test accuracy: {f1_score_test_bernouli} \n The valid accuracy: {f1_score_valid_bernouli} ')




Best params for Bernouli Naive Bayes is: {'alpha': 0.18000000000000002}
F1-score using Naive Bayes classifier: 
 The train accuracy: 0.5401910358602476 
 The test accuracy: 0.4764242435510842 
 The valid accuracy: 0.5168645860223412 


# Decision Tree

In [46]:
from sklearn.tree import DecisionTreeClassifier

DecisionTreeClassifier().get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [75]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [np.arange(10, 46)],  #The maximum depth of the tree.
          'max_features': np.arange(0.1, 0.5, 0.1), 
          'min_samples_leaf': np.arange(1, 10), #The minimum number of samples required to be at a leaf node.
          'min_samples_split': np.arange(2,10), #The minimum number of samples required to split an internal node
         'criterion' : ['gini','entropy'],
          'ccp_alpha': np.arange(0.0, 1.0, 0.0001)
         }


ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
decision_tree_grid = GridSearchCV(DecisionTreeClassifier(), params)
train_x = scipy.sparse.vstack([train_data_bbow, valid_data_bbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
decision_tree_grid.fit(train_data_bbow, train_data['label'])
print(f'Best params for Decision Tree is: {decision_tree_grid.best_params_}')


f1_score_train_decision_tree = f1_score(train_data['label'], decision_tree_grid.best_estimator_.predict(train_data_bbow), average='macro')
f1_score_test_decision_tree = f1_score(test_data['label'], decision_tree_grid.best_estimator_.predict(test_data_bbow), average='macro')
f1_score_valid_decision_tree = f1_score(valid_data['label'], decision_tree_grid.best_estimator_.predict(valid_data_bbow), average='macro')


print(f'F1-score using Decision Tree classifier: \n The train accuracy: {f1_score_train_decision_tree} \n The test accuracy: {f1_score_test_decision_tree} \n The valid accuracy: {f1_score_valid_decision_tree} ')






Best params for Decision Tree is: {'ccp_alpha': 0.0009, 'criterion': 'gini', 'max_depth': 45}
F1-score using Decision Tree classifier: 
 The train accuracy: 0.8455346789612719 
 The test accuracy: 0.8086417969581248 
 The valid accuracy: 0.7989773381576436 


# Linear SVM classifier

In [58]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

params = {'C': [0.1, 1, 10, 100, 1000],
          'max_iter': [5000]}

ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
svc_grid = GridSearchCV(LinearSVC(), params, cv=ps, n_jobs=-1)
train_x = scipy.sparse.vstack([train_data_bbow, valid_data_bbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
svc_grid.fit(train_x, train_y)
print(f'Best params for Linear SVC is: {svc_grid.best_params_}')


f1_score_train_svc = f1_score(train_data['label'], svc_grid.best_estimator_.predict(train_data_bbow), average='macro')
f1_score_test_svc = f1_score(test_data['label'], svc_grid.best_estimator_.predict(test_data_bbow), average='macro')
f1_score_valid_svc = f1_score(valid_data['label'], svc_grid.best_estimator_.predict(valid_data_bbow), average='macro')


print(f'F1-score using Linear SVC: \n The train accuracy: {f1_score_train_svc} \n The test accuracy: {f1_score_test_svc} \n The valid accuracy: {f1_score_valid_svc} ')






Best params for Linear SVC is: {'C': 1, 'max_iter': 5000}
F1-score using Linear SVC: 
 The train accuracy: 0.8972922757641368 
 The test accuracy: 0.7781136671964518 
 The valid accuracy: 0.9036961201871073 




#Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

params ={"C": [0.00001, 0.0001,0.001, 0.01, 0.1, 1, 10, 100,150,200, 1000],
         'max_iter': [5000]}

ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
logistic_regression_grid = GridSearchCV(LogisticRegression(),params, cv=ps, n_jobs=-1)
train_x = scipy.sparse.vstack([train_data_bbow, valid_data_bbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
logistic_regression_grid.fit(train_x, train_y)
print(f'Best params for logistic regression is: {logistic_regression_grid.best_params_}')


f1_score_train_logistic_regression = f1_score(train_data['label'], logistic_regression_grid.best_estimator_.predict(train_data_bbow), average='macro')
f1_score_test_logistic_regression = f1_score(test_data['label'], logistic_regression_grid.best_estimator_.predict(test_data_bbow), average='macro')
f1_score_valid_logistic_regression = f1_score(valid_data['label'], logistic_regression_grid.best_estimator_.predict(valid_data_bbow), average='macro')


print(f'F1-score using logistic regression: \n The train accuracy: {f1_score_train_logistic_regression} \n The test accuracy: {f1_score_test_logistic_regression} \n The valid accuracy: {f1_score_valid_logistic_regression} ')







Best params for logistic regression is: {'C': 100, 'max_iter': 5000}
F1-score using logistic regression: 
 The train accuracy: 0.8985790623650851 
 The test accuracy: 0.7777015271060288 
 The valid accuracy: 0.8943560434022649 


# 3

# 3.a

# Gaussian NB

In [61]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import GridSearchCV, PredefinedSplit



gaussian_nb_grid = GaussianNB()
gaussian_nb_grid.fit(train_data_fbow, train_data['label'])
print(f'Best params for gaussian Naive Bayes is: {gaussian_nb_grid.get_params}')

f1_score_train_gaussian = f1_score(train_data['label'], gaussian_nb_grid.predict(train_data_fbow), average='macro')
f1_score_test_gaussian = f1_score(test_data['label'], gaussian_nb_grid.predict(test_data_fbow), average='macro')
f1_score_valid_gaussian = f1_score(valid_data['label'], gaussian_nb_grid.predict(valid_data_fbow), average='macro')


print(f'F1-score using Gaussian Bayes classifier: \n The train accuracy: {f1_score_train_gaussian} \n The test accuracy: {f1_score_test_gaussian} \n The valid accuracy: {f1_score_valid_gaussian} ')





Best params for gaussian Naive Bayes is: <bound method BaseEstimator.get_params of GaussianNB(priors=None, var_smoothing=1e-09)>
F1-score using Gaussian Bayes classifier: 
 The train accuracy: 0.6921230662083689 
 The test accuracy: 0.3567760615184497 
 The valid accuracy: 0.3608059565606281 


In [79]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import GridSearchCV

params ={'var_smoothing': np.arange(0.01, 6.0, 0.01)}


ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
Gaussian_grid = GridSearchCV(GaussianNB(),params, cv=ps, n_jobs=-1)
train_x = scipy.sparse.vstack([train_data_fbow, valid_data_fbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
Gaussian_grid.fit(train_x.toarray(), train_y)
print(f'Best params for logistic regression is: {Gaussian_grid.best_params_}')


f1_score_train_Gaussian = f1_score(train_data['label'], Gaussian_grid.best_estimator_.predict(train_data_fbow), average='macro')
f1_score_test_Gaussian = f1_score(test_data['label'], Gaussian_grid.best_estimator_.predict(test_data_fbow), average='macro')
f1_score_valid_Gaussian = f1_score(valid_data['label'], Gaussian_grid.best_estimator_.predict(valid_data_fbow), average='macro')


print(f'F1-score using Gaussian NB: \n The train accuracy: {f1_score_train_Gaussian} \n The test accuracy: {f1_score_test_Gaussian} \n The valid accuracy: {f1_score_valid_Gaussian} ')








Best params for logistic regression is: {'var_smoothing': 0.06}
F1-score using Gaussian NB: 
 The train accuracy: 0.40333680721540416 
 The test accuracy: 0.3535493244672165 
 The valid accuracy: 0.402403181613708 


# Decision Tree

In [77]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {'max_depth': [np.arange(10, 25)],  #The maximum depth of the tree.
          'max_features': np.arange(0.1, 0.5, 0.1), 
          'min_samples_leaf': np.arange(1, 10), #The minimum number of samples required to be at a leaf node.
          'min_samples_split': np.arange(2,10), #The minimum number of samples required to split an internal node
         'criterion' : ['gini','entropy'],
          'ccp_alpha': np.arange(0.0, 1.0, 0.01)
         }

ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
decision_tree_grid = GridSearchCV(DecisionTreeClassifier(), params, cv=ps, n_jobs=-1)
train_x = scipy.sparse.vstack([train_data_fbow, valid_data_fbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
decision_tree_grid.fit(train_x, train_y)
print(f'Best params for Decision Tree is: {decision_tree_grid.best_params_}')


f1_score_train_decision_tree = f1_score(train_data['label'], decision_tree_grid.best_estimator_.predict(train_data_fbow), average='macro')
f1_score_test_decision_tree = f1_score(test_data['label'], decision_tree_grid.best_estimator_.predict(test_data_fbow), average='macro')
f1_score_valid_decision_tree = f1_score(valid_data['label'], decision_tree_grid.best_estimator_.predict(valid_data_fbow), average='macro')


print(f'F1-score using Decision Tree classifier: \n The train accuracy: {f1_score_train_decision_tree} \n The test accuracy: {f1_score_test_decision_tree} \n The valid accuracy: {f1_score_valid_decision_tree} ')






Best params for Decision Tree is: {'ccp_alpha': 0.003, 'criterion': 'entropy', 'max_depth': 22}
F1-score using Decision Tree classifier: 
 The train accuracy: 0.8246886986247811 
 The test accuracy: 0.7872661829902323 
 The valid accuracy: 0.8334348649809901 


# Linear SVM classifier

In [66]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

params = {'C': [0.1, 1, 10, 100, 1000,10000,100000],
          'max_iter': [10000]}
ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
svc_grid = GridSearchCV(LinearSVC(), params, cv=ps, n_jobs=-1)
train_x = scipy.sparse.vstack([train_data_fbow, valid_data_fbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
svc_grid.fit(train_x, train_y)
print(f'Best params for Linear SVC is: {svc_grid.best_params_}')


f1_score_train_svc = f1_score(train_data['label'], svc_grid.best_estimator_.predict(train_data_fbow), average='macro')
f1_score_test_svc = f1_score(test_data['label'], svc_grid.best_estimator_.predict(test_data_fbow), average='macro')
f1_score_valid_svc = f1_score(valid_data['label'], svc_grid.best_estimator_.predict(valid_data_fbow), average='macro')


print(f'F1-score using Linear SVC: \n The train accuracy: {f1_score_train_svc} \n The test accuracy: {f1_score_test_svc} \n The valid accuracy: {f1_score_valid_svc} ')







Best params for Linear SVC is: {'C': 10000, 'max_iter': 10000}
F1-score using Linear SVC: 
 The train accuracy: 0.8879319611149765 
 The test accuracy: 0.68877357265114 
 The valid accuracy: 0.894562555624891 




# Logistic Regression

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

params ={"C": [0.00001, 0.0001,0.001, 0.01, 0.1, 1, 10, 100,150,200, 1000],
         'max_iter': [10000]}


ps = PredefinedSplit(test_fold=[-1 if i < len(train_data['label']) else 0 for i in range(len(train_data['label']) + len(valid_data['label']))])
logistic_regression_grid = GridSearchCV(LogisticRegression(),params, cv=ps, n_jobs=-1)
train_x = scipy.sparse.vstack([train_data_fbow, valid_data_fbow])
train_y = np.concatenate([train_data['label'], valid_data['label']])
logistic_regression_grid.fit(train_x, train_y)
print(f'Best params for logistic regression is: {logistic_regression_grid.best_params_}')


f1_score_train_logistic_regression = f1_score(train_data['label'], logistic_regression_grid.best_estimator_.predict(train_data_fbow), average='macro')
f1_score_test_logistic_regression = f1_score(test_data['label'], logistic_regression_grid.best_estimator_.predict(test_data_fbow), average='macro')
f1_score_valid_logistic_regression = f1_score(valid_data['label'], logistic_regression_grid.best_estimator_.predict(valid_data_fbow), average='macro')


print(f'F1-score using logistic regression: \n The train accuracy: {f1_score_train_logistic_regression} \n The test accuracy: {f1_score_test_logistic_regression} \n The valid accuracy: {f1_score_valid_logistic_regression} ')








Best params for logistic regression is: {'C': 150, 'max_iter': 10000}
F1-score using logistic regression: 
 The train accuracy: 0.6774815152242367 
 The test accuracy: 0.49361988494676284 
 The valid accuracy: 0.667364231974758 
