In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json
import sys 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# import stemmer as hindi_stemmer


In [None]:
def tr_flatten(d,lc):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'contextual_label':lc[d['tweet_id']]
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                'contextual_label':lc[i['tweet_id']]
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        'contextual_label':lc[j['tweet_id']]
                    })
    return flat_text

In [None]:
def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [None]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet, english_stemmer, stopword):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopword:
            token = english_stemmer.stem(token)
            token = hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [None]:
import re
words_dict  = { "तैराक":"तैर",
                "चालाक":"चाल",
                "कूलाक":"कूल",
                "बेलन":"बेल",
                "मिलाप":"मिल",
                "चुपचाप": "चुप",
                "निकास":"निकस",
                "लुकास":"लुक",
                }

suffixes = {
	    1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],  
            2: ["तृ","ान","ैत","ने","ाऊ","ाव","कर", "ाओ", "िए", "ाई", "ाए", "नी", "ना", "ते", "ीं", "ती",
                "ता", "ाँ", "ां", "ों", "ें","ीय", "ति","या", "पन", "पा","ित","ीन","लु","यत","वट","लू"],     
            3: ["ेरा","त्व","नीय","ौनी","ौवल","ौती","ौता","ापा","वास","हास","काल","पान","न्त","ौना","सार","पोश","नाक",
                "ियल","ैया", "ौटी","ावा","ाहट","िया","हार", "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "वान", "बीन",
                "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं","कला","िमा","कार",
                "गार", "दान","खोर"],     
            4: ["ावास","कलाप","हारा","तव्य","वैया", "वाला", "ाएगी", "ाएगा", "ाओगी", "ाओगे", 
                "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां",
                "त्वा","तव्य","कल्प","िष्ठ","जादा","क्कड़"],     
            5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां", "अक्कड़","तव्य:","निष्ठ"],
}

special_suffixes = ["र्", "ज्य","त्य"]
dict_special_suffixes = {"र्":"ृ",
                         "ज्य":"ज्",
                         "त्य":"त्"}

def hi_stem(word, clean=False,chars=None):
    if clean == True:
        word = clean_text(word, chars)
    
    ans = word
    bl = False
    
    if word in words_dict.keys():
        return words_dict[word]
    
    for L in 5, 4, 3, 2, 1:
        if len(word) > L + 1:
            for suf in suffixes[L]:
                if word.endswith(suf):
                    ans = word[:-L]
                    bl =True
        if bl == True:
            break
                    
    if bl == True:
        for suf in suffixes[1]:
            if ans.endswith(suf):
                # use case - गानेवाला
                ans = hi_stem(ans)
  
    for suf in special_suffixes:
        if ans.endswith(suf):
            l = len(suf)
            ans = ans[:-l]
            ans += dict_special_suffixes[suf]

    return ans

def clean_text(text, chars=None):
    if chars == None:        
        text = re.sub(r"[()\"#/@;:<>{}`+=~|!?,']", "", text)
    else:
        text = re.sub(r"[" +chars+ "()\"#/@;:<>{}`+=~|!?,']", "", text)
    return text

In [None]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
english_stopwords = stopwords.words("english")

with open('/content/drive/MyDrive/HASOC_files/final_stopwords.txt', encoding = 'utf-8') as f:
  hindi_stopwords = f.readlines()
  for i in range(len(hindi_stopwords)):
      hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])

stopword = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

In [None]:
base_addreess = "/content/drive/MyDrive/HASOC/Train/contextual_2022_train (1)"
directories = []
for i in glob(base_addreess+"/Train/Hinglish/*/"):
  for j in glob(i+'*/'):
      directories.append(j)
data = []
for i in directories:
  with open(i+'data.json', encoding='utf-8') as f:
      data.append(json.load(f))

contextual_labels = []
for i in directories:
  with open(i+'contextual_labels.json', encoding='utf-8') as f:
      contextual_labels.append(json.load(f))

data_label = []
for i in range(len(contextual_labels)):
  for j in tr_flatten(data[i], contextual_labels[i]):
      data_label.append(j)




In [None]:
# test

base_addreess = "/content/drive/MyDrive/hasoc_test/test_data"
directories = []
for i in glob(base_addreess+"/Train/Hinglish/*/"):
  for j in glob(i+'*/'):
      directories.append(j)
data = []
for i in directories:
  with open(i+'data.json', encoding='utf-8') as f:
      data.append(json.load(f))

# contextual_labels = []
# for i in directories:
#   with open(i+'contextual_labels.json', encoding='utf-8') as f:
#       contextual_labels.append(json.load(f))

data_test = []
for i in range(len(data)):
  for j in te_flatten(data[i]):
      data_test.append(j)

In [None]:
len(data_test)

996

In [None]:
data_label

In [None]:
train_len = len(data_label)
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)
print("Multiclass Distribution")
print(df['contextual_label'].value_counts())





Multiclass Distribution
NONE    2390
SHOF    1636
CHOF     888
Name: contextual_label, dtype: int64


In [None]:
train_len = len(data_label)
df_test = pd.DataFrame(data_test, columns = data_test[0].keys(), index = None)
print("Multiclass Distribution")
# print(df['contextual_label'].value_counts())

Multiclass Distribution


In [None]:
df_test

Unnamed: 0,tweet_id,text
0,1445930336039358469,"Last warning to Endia, stop sponsoring #earthq..."
1,1445931080071069703,"Last warning to Endia, stop sponsoring #earthq..."
2,1445969052451573763,"Last warning to Endia, stop sponsoring #earthq..."
3,1445933724726489089,"Last warning to Endia, stop sponsoring #earthq..."
4,1446062615621373954,"Last warning to Endia, stop sponsoring #earthq..."
...,...,...
991,1467751712978989056,"On December 6, 1992, a crowd of almost 150,000..."
992,1467810485701472256,"On December 6, 1992, a crowd of almost 150,000..."
993,1467829444563779590,"On December 6, 1992, a crowd of almost 150,000..."
994,1467858643798216705,"On December 6, 1992, a crowd of almost 150,000..."


In [None]:
df.contextual_label.unique()

array(['SHOF', 'CHOF', 'NONE'], dtype=object)

In [None]:
d = {'SHOF' : 0  , 'CHOF' : 1 , 'NONE' : 2}

In [None]:
df.contextual_label = df.contextual_label.map(d)

In [None]:
tweets = df.text
multi_class_y = df.contextual_label 

cleaned_tweets = [clean_tweet(tweet, english_stemmer, stopword) for tweet in tweets]

In [None]:
#applying class weight
from sklearn.utils import compute_class_weight
classweight = dict(zip([0,1,2],compute_class_weight(class_weight = 'balanced',y=df['contextual_label'].values,classes=[0,1,2])))
classweight

{0: 1.0012224938875305, 1: 1.8445945945945945, 2: 0.6853556485355649}

In [None]:
tweets_test = df_test.text

In [None]:
cleaned_tweets_test = [clean_tweet(tweet, english_stemmer, stopword) for tweet in tweets_test]

In [None]:
len(cleaned_tweets_test)

996

In [None]:
vectorizer = TfidfVectorizer(min_df = 5)

X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()



In [None]:
X_test = vectorizer.transform(cleaned_tweets_test)

In [None]:
X_test = X_test.todense()

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, multi_class_y, test_size=0.2, random_state=42)
classifier = KNeighborsClassifier(5)
classifier.fit(X_train, y_train)





KNeighborsClassifier()

In [None]:
y_pred = classifier.predict(X_val)
print("With K-nearest Neighbour:")
print(classification_report(y_val, y_pred))



With K-nearest Neighbour:
              precision    recall  f1-score   support

           0       0.49      0.33      0.39       333
           1       0.58      0.51      0.54       174
           2       0.61      0.78      0.68       476

    accuracy                           0.58       983
   macro avg       0.56      0.54      0.54       983
weighted avg       0.56      0.58      0.56       983



LogisticRegression 

In [None]:
from sklearn import linear_model

In [None]:
reg = model = LogisticRegression(C = 5,penalty ='l2'  , solver = 'newton-cg' , random_state = 42)

In [None]:
reg.fit(X, multi_class_y)



LogisticRegression(C=5, random_state=42, solver='newton-cg')

In [None]:
y_pred = reg.predict(X_test)



In [None]:
len(X_test)

996

In [None]:
df_submit = df_test

In [None]:
df_submit

Unnamed: 0,tweet_id,label
0,1445930336039358469,NONE
1,1445931080071069703,NONE
2,1445969052451573763,NONE
3,1445933724726489089,NONE
4,1446062615621373954,NONE
...,...,...
991,1467751712978989056,NONE
992,1467810485701472256,SHOF
993,1467829444563779590,SHOF
994,1467858643798216705,SHOF


In [None]:
df_submit = df_test.drop(['text'], axis=1)

In [None]:
df_submit['label'] = y_pred

In [None]:
d = { 0 : 'SHOF'  ,   1 : 'CHOF',   2 : 'NONE'}

In [None]:
df_submit.label = df_submit.label.map(d)

In [None]:
df_submit.to_csv('task2.csv',index = False)

In [None]:
y_pred = reg.predict(X_val)
print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

With K-Nearest Neighbour:
              precision    recall  f1-score   support

           0       0.51      0.48      0.49       333
           1       0.65      0.53      0.59       174
           2       0.67      0.75      0.71       476

    accuracy                           0.62       983
   macro avg       0.61      0.59      0.60       983
weighted avg       0.62      0.62      0.62       983





In [None]:
# f = open("result/lg_agn.txt",'w')
# f.write(f"LogisticRegression(C = {0},tol = {0},penalty ={0})")
# f.close()
from sklearn.metrics import f1_score
C = [1,5,25,50,100,200]
penalty = ["l2" , 'l1' ,'elasticnet']
solver = ['newton-cg', 'lbfgs', 'sag','liblinear','saga']
# l1_ratio = [0.0,0.3,0.5,0.8,1.0]

In [None]:
score = 0
for i in C:
  for k in penalty:
    for l in solver :
      print(i,k,l)
      try:
        model = LogisticRegression(C = i,penalty =k  , solver = l , random_state = 42)
        model.fit(X_train,y_train)
      except :
        continue
      pred_m = model.predict(X_val)
      f = f1_score(y_val,pred_m,average = 'macro')
      if(score<f):
          score = f
          # f = open("result/lg_agn.txt",'w')
          # f.write(f"LogisticRegression(C = {i},penalty = {k},solver={l})")
          # f.close()
          # joblib.dump(model,"result/agn_lg.joblib")
          print(i,k,l,score)
          # print(score)
              

1 l2 newton-cg




1 l2 newton-cg 0.5892821466587126
1 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


1 l2 sag




1 l2 liblinear
1 l2 saga




1 l1 newton-cg
1 l1 lbfgs
1 l1 sag
1 l1 liblinear
1 l1 saga




1 elasticnet newton-cg
1 elasticnet lbfgs
1 elasticnet sag
1 elasticnet liblinear
1 elasticnet saga
5 l2 newton-cg




5 l2 newton-cg 0.5973793218361033
5 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


5 l2 sag




5 l2 liblinear




5 l2 saga




5 l1 newton-cg
5 l1 lbfgs
5 l1 sag
5 l1 liblinear




5 l1 saga




5 elasticnet newton-cg
5 elasticnet lbfgs
5 elasticnet sag
5 elasticnet liblinear
5 elasticnet saga
25 l2 newton-cg




25 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


25 l2 sag




25 l2 liblinear




25 l2 saga




25 l1 newton-cg
25 l1 lbfgs
25 l1 sag
25 l1 liblinear




25 l1 saga




25 elasticnet newton-cg
25 elasticnet lbfgs
25 elasticnet sag
25 elasticnet liblinear
25 elasticnet saga
50 l2 newton-cg




50 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


50 l2 sag




50 l2 liblinear




50 l2 saga




50 l1 newton-cg
50 l1 lbfgs
50 l1 sag
50 l1 liblinear




50 l1 saga




50 elasticnet newton-cg
50 elasticnet lbfgs
50 elasticnet sag
50 elasticnet liblinear
50 elasticnet saga
100 l2 newton-cg




100 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


100 l2 sag




100 l2 liblinear




100 l2 saga




100 l1 newton-cg
100 l1 lbfgs
100 l1 sag
100 l1 liblinear




100 l1 saga




100 elasticnet newton-cg
100 elasticnet lbfgs
100 elasticnet sag
100 elasticnet liblinear
100 elasticnet saga
200 l2 newton-cg




200 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


200 l2 sag




200 l2 liblinear




200 l2 saga




200 l1 newton-cg
200 l1 lbfgs
200 l1 sag
200 l1 liblinear




200 l1 saga
200 elasticnet newton-cg
200 elasticnet lbfgs
200 elasticnet sag
200 elasticnet liblinear
200 elasticnet saga




SVM

In [None]:
from sklearn.svm import SVC

In [None]:
model3 = SVC(C = 1 ,kernel= 'linear')
model3.fit(X_train,y_train)



SVC(C=1, kernel='linear')

In [None]:
y_pred = model3.predict(X_val)
print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))



With K-Nearest Neighbour:
              precision    recall  f1-score   support

           0       0.51      0.47      0.49       333
           1       0.73      0.52      0.61       174
           2       0.65      0.76      0.70       476

    accuracy                           0.62       983
   macro avg       0.63      0.58      0.60       983
weighted avg       0.62      0.62      0.61       983



In [None]:
mx = 0
for kernel in ['linear', 'poly', 'rbf']:
  for C in [1,5,10,25,50,100,200]:
    if(kernel=='poly'):
      if(C != 200):
        for degree in [1,2,3,4]:
            model = SVC(C = C,kernel=kernel,degree=degree)
            model.fit(X_train,y_train)
            pred = model.predict(X_val)
            f = f1_score(y_val,pred,average='macro')
            if(mx<f):
                mx = max(mx,f)
                # s = open('result/gen_svm.txt' , 'w')
                # s.write(f"SVC(C= {C}, degree = {degree} , kernel = {kernel}) score={mx}")
                # s.close()
                # joblib.dump(model,'result/gen_svm.joblib')
                print(kernel,C,degree,mx)
            print(f,f"C = {C} , kernel = {kernel} , degree = {degree}")
    else:
        model = SVC(C = C,kernel=kernel)
        model.fit(X_train,y_train)
        pred = model.predict(X_val)
        f = f1_score(y_val,pred,average='macro')
        if(mx<f):
            mx = max(mx,f)
            # s = open('result/gen_svm.txt' , 'w')
            # s.write(f"SVC(C= {C}, degree = {1} , kernal = {kernel}) score={mx}")
            # s.close()
            # joblib.dump(model,'result/gen_svm.joblib')
            print(kernel,C,mx)
        print(f,f"C = {C} , kernel = {kernel}")



linear 1 0.6005841200306409
0.6005841200306409 C = 1 , kernel = linear




0.6004888112558731 C = 5 , kernel = linear




0.5841529141075688 C = 10 , kernel = linear




0.5735190795592245 C = 25 , kernel = linear




0.5635587617681056 C = 50 , kernel = linear




0.5355504580430828 C = 100 , kernel = linear




0.5245975453548121 C = 200 , kernel = linear




0.6005841200306409 C = 1 , kernel = poly , degree = 1




0.5962036552681353 C = 1 , kernel = poly , degree = 2




0.5749587430574524 C = 1 , kernel = poly , degree = 3




0.5614769643176415 C = 1 , kernel = poly , degree = 4




0.6004888112558731 C = 5 , kernel = poly , degree = 1




0.5839836728773963 C = 5 , kernel = poly , degree = 2




0.5728857756146596 C = 5 , kernel = poly , degree = 3




0.5599037527807936 C = 5 , kernel = poly , degree = 4




0.5841529141075688 C = 10 , kernel = poly , degree = 1




0.5764992199968791 C = 10 , kernel = poly , degree = 2




0.5566268986012476 C = 10 , kernel = poly , degree = 3




0.5446530832658408 C = 10 , kernel = poly , degree = 4




0.5735190795592245 C = 25 , kernel = poly , degree = 1




0.5538095385934766 C = 25 , kernel = poly , degree = 2




0.5428676870606619 C = 25 , kernel = poly , degree = 3




0.535996174789846 C = 25 , kernel = poly , degree = 4




0.5635587617681056 C = 50 , kernel = poly , degree = 1




0.5376862745326928 C = 50 , kernel = poly , degree = 2




0.5376574452233969 C = 50 , kernel = poly , degree = 3




0.5317891501574583 C = 50 , kernel = poly , degree = 4




0.5342660872088941 C = 100 , kernel = poly , degree = 1




0.5321382804571769 C = 100 , kernel = poly , degree = 2




0.5366541891188308 C = 100 , kernel = poly , degree = 3




0.5324367515143454 C = 100 , kernel = poly , degree = 4




0.5978230131633525 C = 1 , kernel = rbf




0.5926806633874888 C = 5 , kernel = rbf




0.5875194996593187 C = 10 , kernel = rbf




0.5606350087872517 C = 25 , kernel = rbf




0.5545613216164055 C = 50 , kernel = rbf




0.5534546005069955 C = 100 , kernel = rbf




0.5541178457685895 C = 200 , kernel = rbf


MNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
model = MultinomialNB(alpha=0.06891)
model.fit(X_train,y_train)



MultinomialNB(alpha=0.06891)

In [None]:
y_pred = model.predict(X_val)
# print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.49      0.49       333
           1       0.51      0.54      0.53       174
           2       0.68      0.66      0.67       476

    accuracy                           0.58       983
   macro avg       0.56      0.56      0.56       983
weighted avg       0.58      0.58      0.58       983





In [None]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
score =0
for i in np.arange(1e-05,1e-01,1e-04):
  print(i)
  model = MultinomialNB(alpha=i)
  model.fit(X_train,y_train)
  pred = model.predict(X_val)
  f = f1_score(y_val,pred,average='macro')
  if(score<f):
    score = f
    # f = open("result/gen_mnb.txt",'w')
    # f.write(f"MultinomialNB(alpha = {i})")
    # f.close()
    # joblib.dump(model,'result/gen_mnb.joblib')
    print(i,score)

1e-05
1e-05 0.5767259714042453
0.00011
0.00011 0.5772294685011076
0.00021




0.00021 0.5806848523834162
0.00031000000000000005
0.00041000000000000005
0.00051
0.0006100000000000001




0.00071
0.0008100000000000001
0.0009100000000000001




0.00101
0.00111
0.0012100000000000001




0.0013100000000000002
0.00141
0.00151
0.00161




0.0017100000000000001
0.0018100000000000002




0.00191
0.00201
0.0021100000000000003
0.00221




0.00231
0.0024100000000000002
0.00251
0.0026100000000000003




0.00271
0.00281
0.0029100000000000003
0.00301




0.0031100000000000004
0.00321
0.00331
0.0034100000000000003




0.00351
0.0036100000000000004
0.00371
0.00381




0.00391
0.00401
0.00411
0.00421




0.00431
0.00441
0.00451
0.0046099999999999995




0.00471
0.00481
0.0049099999999999994




0.00501
0.00511
0.00521




0.00531
0.00541
0.00551
0.0056099999999999995




0.00571
0.00581
0.0059099999999999995
0.00601




0.00611
0.00621
0.00631




0.00641
0.00651
0.0066099999999999996
0.00671




0.00681
0.00691
0.00701
0.00711




0.00721
0.00731
0.00741
0.00751




0.00761
0.00771
0.00781




0.00791
0.00801
0.00811
0.00821




0.00831
0.00841
0.00851
0.00861




0.00871
0.00881
0.00891
0.00901




0.00911
0.00921
0.00931




0.00941
0.00951
0.00961
0.00971




0.00981
0.00991
0.01001
0.010110000000000001




0.01021
0.01031
0.01041
0.01051




0.01061
0.01071
0.01081




0.01091
0.01101
0.01111
0.01121




0.01131
0.01141
0.01151
0.01161




0.01171
0.01181
0.01191
0.01201




0.012110000000000001
0.01221
0.01231




0.012410000000000001
0.01251
0.01261
0.01271




0.01281
0.01291
0.01301




0.01311
0.01321
0.01331
0.01341




0.01351
0.01361
0.01371
0.013810000000000001




0.01391
0.01401
0.014110000000000001




0.01421
0.01431
0.014410000000000001
0.01451




0.01461
0.01471
0.01481
0.01491




0.01501
0.01511
0.01521
0.01531




0.01541
0.01551
0.01561




0.015710000000000002
0.01581
0.01591
0.01601




0.01611
0.01621
0.01631
0.01641




0.01651
0.01661
0.01671




0.016810000000000002
0.01691
0.01701




0.01711
0.01721
0.01731
0.017410000000000002




0.01751
0.01761
0.01771
0.01781




0.01791
0.01801
0.01811
0.01821




0.01831
0.01841
0.018510000000000002




0.01861
0.01871
0.01881
0.01891




0.01901
0.019110000000000002
0.01921
0.01931




0.01941
0.01951
0.01961
0.019710000000000002




0.01981
0.01991
0.02001
0.02011




0.020210000000000002
0.02031
0.02041
0.02051




0.02061
0.02071
0.020810000000000002
0.02091




0.02101
0.02111
0.02121
0.02131




0.021410000000000002
0.02151
0.02161




0.02171
0.02181
0.02191
0.022010000000000002




0.02211
0.02221
0.02231
0.02241




0.022510000000000002
0.02261
0.02271
0.02281




0.02291
0.02301
0.023110000000000002
0.02321




0.02331
0.02341
0.02351
0.02361




0.023710000000000002
0.02381
0.02391
0.02401
0.02411
0.024210000000000002
0.024310000000000002




0.02441
0.02451
0.02461
0.02471
0.024810000000000002
0.02491




0.02501
0.02511
0.02521




0.02531
0.025410000000000002
0.02551




0.02561
0.02571
0.02581
0.025910000000000002




0.026010000000000002
0.02611
0.02621




0.02631
0.02641
0.026510000000000002




0.02661
0.02671
0.02681




0.02691
0.02701
0.027110000000000002




0.02721
0.02731
0.02741




0.02751
0.027610000000000003
0.027710000000000002




0.02781
0.02791
0.02801
0.02811




0.028210000000000002
0.028310000000000002
0.02841




0.02851
0.02861
0.02871
0.028810000000000002




0.02891
0.02901




0.02911
0.02921
0.029310000000000003




0.029410000000000002
0.02951
0.02961
0.02971




0.02981
0.029910000000000003
0.030010000000000002
0.03011




0.03021
0.03031
0.03041




0.030510000000000002
0.030610000000000002
0.03071




0.03081
0.03091
0.03101
0.031110000000000002




0.03121
0.031310000000000004
0.03141000000000001
0.03151




0.031610000000000006
0.03171
0.031810000000000005
0.03191000000000001




0.032010000000000004
0.03211000000000001
0.03221




0.032310000000000005
0.03241
0.032510000000000004
0.03261000000000001




0.03271
0.032810000000000006
0.03291




0.033010000000000005
0.03311000000000001
0.03321
0.033310000000000006




0.03341
0.033510000000000005
0.03361000000000001




0.033710000000000004
0.03381000000000001
0.03391
0.034010000000000006




0.03411
0.034210000000000004
0.03431000000000001




0.03441
0.034510000000000006
0.03461




0.034710000000000005
0.03481000000000001
0.034910000000000004




0.035010000000000006
0.03511
0.035210000000000005




0.03531000000000001
0.035410000000000004
0.03551000000000001
0.03561




0.035710000000000006
0.03581




0.035910000000000004
0.03601000000000001
0.03611
0.036210000000000006




0.03631
0.036410000000000005
0.03651000000000001




0.036610000000000004
0.036710000000000007
0.03681




0.036910000000000005
0.03701000000000001
0.037110000000000004
0.03721000000000001




0.03731
0.037410000000000006




0.03751
0.037610000000000005
0.03771000000000001




0.03781
0.037910000000000006
0.03801




0.038110000000000005
0.03821000000000001
0.038310000000000004
0.03841000000000001




0.03851
0.038610000000000005
0.03871000000000001




0.038810000000000004
0.03891000000000001
0.03901




0.039110000000000006
0.03921
0.039310000000000005




0.03941000000000001
0.03951
0.039610000000000006
0.03971




0.039810000000000005
0.03991000000000001




0.040010000000000004
0.04011000000000001
0.04021




0.040310000000000006
0.04041000000000001
0.040510000000000004
0.04061000000000001




0.04071
0.040810000000000006
0.04091




0.041010000000000005
0.04111000000000001
0.041210000000000004
0.041310000000000006




0.04141
0.041510000000000005




0.04161000000000001
0.041710000000000004
0.04181000000000001
0.04191




0.042010000000000006
0.04211000000000001




0.042210000000000004
0.04231000000000001
0.04241




0.042510000000000006
0.04261
0.042710000000000005




0.04281000000000001
0.042910000000000004
0.04301000000000001




0.04311
0.043210000000000005
0.04331000000000001




0.043410000000000004
0.04351000000000001
0.04361




0.043710000000000006
0.04381
0.043910000000000005




0.04401000000000001
0.04411
0.044210000000000006




0.04431
0.044410000000000005
0.04451000000000001




0.044610000000000004
0.04471000000000001
0.04481
0.044910000000000005




0.04501000000000001
0.045110000000000004




0.04521000000000001
0.04531
0.045410000000000006




0.04551
0.045610000000000005
0.04571000000000001




0.045810000000000003
0.045910000000000006
0.04601




0.046110000000000005
0.04621000000000001
0.046310000000000004
0.04641000000000001




0.04651
0.046610000000000006




0.04671000000000001
0.046810000000000004
0.04691000000000001




0.04701
0.047110000000000006
0.04721




0.047310000000000005
0.04741000000000001
0.047510000000000004




0.047610000000000006
0.04771
0.047810000000000005




0.04791000000000001
0.048010000000000004
0.04811000000000001




0.04821
0.048310000000000006
0.04841000000000001
0.048510000000000005




0.04861000000000001
0.04871
0.048810000000000006
0.04891




0.049010000000000005
0.04911000000000001
0.049210000000000004




0.04931000000000001
0.04941
0.049510000000000005




0.04961000000000001
0.049710000000000004
0.04981000000000001




0.04991
0.050010000000000006
0.05011000000000001
0.050210000000000005




0.05031000000000001
0.05041
0.050510000000000006
0.05061




0.050710000000000005
0.05081000000000001
0.050910000000000004
0.05101000000000001




0.05111
0.051210000000000006
0.05131000000000001
0.051410000000000004




0.05151000000000001
0.05161
0.051710000000000006
0.05181000000000001




0.051910000000000005
0.05201000000000001




0.052110000000000004
0.052210000000000006
0.05231
0.052410000000000005
0.05251000000000001
0.052610000000000004




0.05271000000000001
0.05281
0.052910000000000006




0.05301000000000001
0.053110000000000004
0.05321000000000001
0.05331




0.053410000000000006
0.05351000000000001
0.053610000000000005




0.05371000000000001
0.053810000000000004
0.053910000000000007
0.05401




0.054110000000000005
0.05421000000000001
0.054310000000000004
0.05441000000000001




0.05451
0.054610000000000006
0.05471000000000001
0.054810000000000005




0.05491000000000001
0.05501
0.055110000000000006




0.05521000000000001
0.055310000000000005
0.05541000000000001




0.055510000000000004
0.05561000000000001
0.05571
0.055810000000000005




0.05591000000000001
0.056010000000000004
0.05611000000000001
0.05621




0.056310000000000006
0.05641000000000001
0.056510000000000005
0.05661000000000001




0.05671
0.056810000000000006
0.05691000000000001




0.057010000000000005
0.05711000000000001
0.057210000000000004
0.05731000000000001
0.05741
0.057510000000000006
0.05761000000000001




0.057710000000000004
0.05781000000000001
0.05791




0.058010000000000006
0.05811000000000001
0.058210000000000005
0.05831000000000001
0.058410000000000004
0.058510000000000006




0.05861000000000001
0.058710000000000005
0.05881000000000001
0.058910000000000004




0.05901000000000001
0.05911
0.059210000000000006




0.05931000000000001
0.059410000000000004
0.05951000000000001
0.05961




0.059710000000000006
0.05981000000000001
0.059910000000000005




0.06001000000000001
0.060110000000000004
0.06021000000000001
0.06031




0.060410000000000005
0.06051000000000001
0.060610000000000004




0.06071000000000001
0.06081
0.060910000000000006




0.06101000000000001
0.061110000000000005
0.06121000000000001




0.06131
0.061410000000000006
0.06151000000000001




0.061610000000000005
0.06171000000000001
0.061810000000000004




0.06191000000000001
0.06201
0.062110000000000005
0.06221000000000001




0.062310000000000004
0.06241000000000001
0.06251




0.06261
0.06271




0.06281
0.06291
0.06301




0.06311
0.06321
0.06331
0.06341




0.06351
0.06361
0.06371




0.06381
0.06391
0.06401




0.06411
0.06421
0.06431




0.06441
0.06451
0.06461




0.06471
0.06480999999999999
0.06491




0.06501
0.06511
0.06521




0.06530999999999999
0.06541
0.06551




0.06561
0.06571
0.06581




0.06591
0.06601
0.06611




0.06621
0.06631
0.06641




0.06651
0.06661
0.06671




0.06681
0.06691
0.06701




0.06711
0.06721
0.06731




0.06741
0.06751
0.06761




0.06771
0.06781
0.06791




0.06801
0.06811
0.06820999999999999




0.06831
0.06841
0.06851




0.06861
0.06871
0.06881
0.06891




0.06901
0.06911
0.06921
0.06931




0.06941
0.06951
0.06961




0.06971
0.06981
0.06991




0.07001
0.07011
0.07021




0.07031
0.07041
0.07051
0.07061
0.07071
0.07081
0.07091




0.07101
0.07110999999999999
0.07121
0.07131
0.07141
0.07151




0.07161
0.07171
0.07181




0.07191
0.07201
0.07211




0.07221
0.07231
0.07241




0.07251
0.07261
0.07271




0.07281
0.07291
0.07301
0.07311
0.07321
0.07331
0.07341




0.07351
0.07361
0.07371
0.07381




0.07391
0.07401
0.07411




0.07421
0.07431
0.07441




0.07450999999999999
0.07461
0.07471




0.07481
0.07491
0.07501
0.07511




0.07521
0.07531
0.07541
0.07551




0.07561
0.07571
0.07581
0.07591




0.07601
0.07611
0.07621
0.07631




0.07641
0.07651
0.07661
0.07671




0.07681
0.07691
0.07701
0.07711




0.07721
0.07731
0.07741
0.07751
0.07761
0.07771
0.07781




0.07791
0.07801
0.07811




0.07821
0.07831
0.07841
0.07851




0.07861
0.07871
0.07881
0.07891




0.07901
0.07911
0.07921
0.07931




0.07941
0.07951
0.07961




0.07971
0.07981
0.07991




0.08001
0.08011
0.08021




0.08031
0.08041
0.08051




0.08061
0.08071
0.08081




0.08091
0.08101
0.08111




0.08121
0.08131
0.08141




0.08151
0.08161
0.08171




0.08181
0.08191
0.08201
0.08211




0.08221
0.08231
0.08241
0.08251




0.08261
0.08271
0.08281




0.08291
0.08301
0.08311




0.08321
0.08331
0.08341




0.08351
0.08361
0.08371




0.08381
0.08391
0.08401
0.08411




0.08421000000000001
0.08431




0.08441
0.08451
0.08461




0.08471
0.08481
0.08491




0.08501
0.08511
0.08521




0.08531
0.08541
0.08551




0.08561
0.08571
0.08581




0.08591
0.08601
0.08611




0.08621
0.08631
0.08641




0.08651
0.08661
0.08671




0.08681
0.08691
0.08701




0.08711
0.08721
0.08731




0.08741
0.08751
0.08761




0.08771
0.08781
0.08791




0.08801
0.08811
0.08821




0.08831
0.08841
0.08851




0.08861
0.08871
0.08881




0.08891
0.08901
0.08911




0.08921
0.08931
0.08941




0.08951
0.08961
0.08971




0.08981
0.08991
0.09001
0.09011




0.09021
0.09031




0.09041
0.09051000000000001
0.09061




0.09071
0.09081
0.09091




0.09101
0.09111
0.09121




0.09131
0.09141
0.09151




0.09161
0.09171
0.09181




0.09191
0.09201
0.09211




0.09221
0.09231
0.09241




0.09251
0.09261
0.09271




0.09281
0.09291
0.09301




0.09311
0.09321
0.09331




0.09341
0.09351
0.09361




0.09371
0.09381
0.09391000000000001




0.09401
0.09411
0.09421




0.09431
0.09441
0.09451




0.09461
0.09471
0.09481




0.09491
0.09501
0.09511




0.09521
0.09531
0.09541




0.09551
0.09561
0.09571




0.09581
0.09591
0.09601




0.09611
0.09621
0.09631




0.09641
0.09651
0.09661




0.09671
0.09681000000000001
0.09691




0.09701
0.09711




0.09721
0.09731000000000001




0.09741
0.09751




0.09761
0.09771




0.09781
0.09791




0.09801
0.09811




0.09821
0.09831




0.09841
0.09851




0.09861
0.09871




0.09881
0.09891
0.09901




0.09911
0.09921
0.09931




0.09941
0.09951
0.09961




0.09971000000000001
0.09981
0.09991




In [None]:
score

0.5806848523834162

In [None]:
0.00021

In [None]:
model = MultinomialNB(alpha=0.00021)
model.fit(X_train,y_train)



MultinomialNB(alpha=0.00021)

In [None]:
y_pred = model.predict(X_val)
# print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.54      0.52       333
           1       0.58      0.52      0.55       174
           2       0.69      0.66      0.67       476

    accuracy                           0.60       983
   macro avg       0.59      0.58      0.58       983
weighted avg       0.60      0.60      0.60       983





randomforest

In [None]:
from sklearn.ensemble import RandomForestClassifier 

In [None]:
from sklearn.metrics import f1_score
max_features = ['auto', 'sqrt', 'log2']
max_depth = [4,5,6,7,8]
criterion =['gini', 'entropy'] 

In [None]:
score=0
for i in max_features:
  for j in max_depth:
    for k in criterion:
      print(i,j,k)
      model=RandomForestClassifier(max_features=i,max_depth=j,criterion=k)
      model.fit(X_train,y_train)
      pred=model.predict(X_val)
      f=f1_score(y_val,pred,average='macro')
      if(score<f):
        score=f
        # f=open("result/gen_rf.txt",'w')
        # f.write(f"RandomForestClassifier(C={i},tol={j},penalty={k}) score={score}")
        # f.close()
        # joblib.dump(model,'result/gen_rf.joblib')
        print(i,j,k,score)

auto 4 gini




auto 4 gini 0.3754346311429257
auto 4 entropy




auto 5 gini




auto 5 entropy




auto 6 gini




auto 6 gini 0.3883136889351974
auto 6 entropy




auto 6 entropy 0.3883968845535463
auto 7 gini




auto 7 gini 0.40810472104676965
auto 7 entropy




auto 7 entropy 0.41171537728333735
auto 8 gini




auto 8 gini 0.4516704177562086
auto 8 entropy




sqrt 4 gini




sqrt 4 entropy




sqrt 5 gini




sqrt 5 entropy




sqrt 6 gini




sqrt 6 entropy




sqrt 7 gini




sqrt 7 entropy




sqrt 8 gini




sqrt 8 entropy




log2 4 gini




log2 4 entropy




log2 5 gini




log2 5 entropy




log2 6 gini




log2 6 entropy




log2 7 gini




log2 7 entropy




log2 8 gini




log2 8 entropy




In [None]:
model=RandomForestClassifier()
model.fit(X_train,y_train)



RandomForestClassifier()

In [None]:
y_pred = model.predict(X_val)
# print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.44      0.45       333
           1       0.66      0.53      0.59       174
           2       0.65      0.73      0.69       476

    accuracy                           0.60       983
   macro avg       0.59      0.57      0.58       983
weighted avg       0.59      0.60      0.59       983





XGboost

In [None]:
from xgboost import XGBClassifier

In [None]:
model=XGBClassifier()
model.fit(X_train,y_train)

XGBClassifier(objective='multi:softprob')

In [None]:
y_pred = model.predict(X_val)
# print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.35      0.42       333
           1       0.73      0.38      0.50       174
           2       0.60      0.82      0.69       476

    accuracy                           0.58       983
   macro avg       0.61      0.52      0.53       983
weighted avg       0.59      0.58      0.56       983



In [None]:
# g=open("result/gen_xgb.txt",'w')
# g.write(f"XGBClassifier(min_child_weight={0},gamma={0},subsample={0})")
# g.close()
from sklearn.metrics import f1_score
min_child_weight = [1, 5, 10]
gamma =[0.5, 1, 1.5, 2, 5]
subsample= [0.6, 0.8, 1.0]

In [None]:
from xgboost import XGBClassifier 
score=0 
for i in min_child_weight: 
  for j in gamma: 
    for k in subsample: 
      print(i,j,k) 
      model=XGBClassifier(min_child_weight=i,gamma=j,subsample=k,class_weight=classweight) 
      model.fit(X_train,y_train) 
      pred_m=model.predict(X_val) 
      f=f1_score(y_val,pred_m,average='macro') 
      
      if(score<f): 
        score=f 
        print(i,j,k,score)

ADABoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
score=0
for i in [50,70,80,90,100,120]:
  for j in np.arange(1e-02,1,1e-01):
    print(i,j)
    model=AdaBoostClassifier(n_estimators=i,learning_rate=j)
    model.fit(X_train,y_train)
    pred_m=model.predict(X_val)
    f=f1_score(y_val,pred_m,average='macro')
    if(score<f):
      score=f
      # g=open("result/gen_adb.txt",'w')
      # g.write(f"AdaBoost(n_estimators={i},learning_rate={j})")
      # g.close()
      print(i,j,score)
      # joblib.dump(model,"result/gen_adb.joblib")

In [None]:
model=AdaBoostClassifier(n_estimators=i,learning_rate=j)
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_val)
# print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

ANN

In [None]:
le = LabelEncoder() #label encoding labels for training Dense Neural Network
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

y_train = to_categorical(y_train,num_classes=3)
#y_val = to_categorical(y_val,num_classes=3)
print(y_train.shape)


(3931, 3)


In [None]:
model = Sequential(
  [
      Dense(64, activation="relu"),
      Dense(32, activation="relu"),
      Dense(3, activation="softmax"),
  ]
)

model.compile('adam', loss='categorical_crossentropy', metrics = ['accuracy']) #compiling a neural network with 3 layers for classification
model.fit(X_train, y_train, epochs = 5, batch_size = 32)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f58cc8e1f50>

In [None]:
y_pred = model.predict(X_val)
y_pred = np.argmax(y_pred, axis = 1)
y_pred = y_pred.reshape(len(y_pred))    

print("With MLP:")
print(classification_report(y_val, y_pred)) 

With MLP:
              precision    recall  f1-score   support

           0       0.61      0.56      0.58       174
           1       0.66      0.74      0.70       476
           2       0.50      0.43      0.46       333

    accuracy                           0.61       983
   macro avg       0.59      0.58      0.58       983
weighted avg       0.60      0.61      0.60       983

