In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json
import sys 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# import stemmer as hindi_stemmer

In [None]:
import re
words_dict  = { "तैराक":"तैर",
                "चालाक":"चाल",
                "कूलाक":"कूल",
                "बेलन":"बेल",
                "मिलाप":"मिल",
                "चुपचाप": "चुप",
                "निकास":"निकस",
                "लुकास":"लुक",
                }

suffixes = {
	    1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],  
            2: ["तृ","ान","ैत","ने","ाऊ","ाव","कर", "ाओ", "िए", "ाई", "ाए", "नी", "ना", "ते", "ीं", "ती",
                "ता", "ाँ", "ां", "ों", "ें","ीय", "ति","या", "पन", "पा","ित","ीन","लु","यत","वट","लू"],     
            3: ["ेरा","त्व","नीय","ौनी","ौवल","ौती","ौता","ापा","वास","हास","काल","पान","न्त","ौना","सार","पोश","नाक",
                "ियल","ैया", "ौटी","ावा","ाहट","िया","हार", "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "वान", "बीन",
                "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं","कला","िमा","कार",
                "गार", "दान","खोर"],     
            4: ["ावास","कलाप","हारा","तव्य","वैया", "वाला", "ाएगी", "ाएगा", "ाओगी", "ाओगे", 
                "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां",
                "त्वा","तव्य","कल्प","िष्ठ","जादा","क्कड़"],     
            5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां", "अक्कड़","तव्य:","निष्ठ"],
}

special_suffixes = ["र्", "ज्य","त्य"]
dict_special_suffixes = {"र्":"ृ",
                         "ज्य":"ज्",
                         "त्य":"त्"}

def hi_stem(word, clean=False,chars=None):
    if clean == True:
        word = clean_text(word, chars)
    
    ans = word
    bl = False
    
    if word in words_dict.keys():
        return words_dict[word]
    
    for L in 5, 4, 3, 2, 1:
        if len(word) > L + 1:
            for suf in suffixes[L]:
                if word.endswith(suf):
                    ans = word[:-L]
                    bl =True
        if bl == True:
            break
                    
    if bl == True:
        for suf in suffixes[1]:
            if ans.endswith(suf):
                # use case - गानेवाला
                ans = hi_stem(ans)
  
    for suf in special_suffixes:
        if ans.endswith(suf):
            l = len(suf)
            ans = ans[:-l]
            ans += dict_special_suffixes[suf]

    return ans

def clean_text(text, chars=None):
    if chars == None:        
        text = re.sub(r"[()\"#/@;:<>{}`+=~|!?,']", "", text)
    else:
        text = re.sub(r"[" +chars+ "()\"#/@;:<>{}`+=~|!?,']", "", text)
    return text

In [None]:
words_dict  = { "तैराक":"तैर",
                "चालाक":"चाल",
                "कूलाक":"कूल",
                "बेलन":"बेल",
                "मिलाप":"मिल",
                "चुपचाप": "चुप",
                "निकास":"निकस",
                "लुकास":"लुक",
                }

In [None]:
def tr_flatten(d,lb):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'binary_label':lb[d['tweet_id']],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                'binary_label':lb[i['tweet_id']],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        'binary_label':lb[j['tweet_id']],
                    })
    return flat_text

In [None]:
def tr_flatten_test(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        # 'binary_label':lb[d['tweet_id']],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                # 'binary_label':lb[i['tweet_id']],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        # 'binary_label':lb[j['tweet_id']],
                    })
    return flat_text

In [None]:
tr_flatten(data[0])

TypeError: ignored

In [None]:
def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [None]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet, english_stemmer, stopword):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopword:
            token = english_stemmer.stem(token)
            token = hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:

    
english_stopwords = stopwords.words("english")

with open('/content/drive/MyDrive/HASOC_files/final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])

stopword = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

base_addreess = "/content/drive/MyDrive/HASOC/Train/contextual_2022_train (1)"
directories = []
for i in glob(base_addreess+"/Train/*/*/"):
    for j in glob(i+'*/'):
        directories.append(j)
# directories
# for i in directories:
#   print(i)
# print(glob(base_addreess+"/Train/*/*/"))
data = []
for i in directories:
    try:
        with open(i+'data.json', encoding='utf-8') as f:
            data.append(json.load(f))
    except:
        continue

In [None]:

# test


english_stopwords = stopwords.words("english")

with open('/content/drive/MyDrive/HASOC_files/final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])

stopword = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

base_addreess = "/content/drive/MyDrive/hasoc_test/test_data"
directories = []
for i in glob(base_addreess+"/Train/*/*/"):
    for j in glob(i+'*/'):
        directories.append(j)
# directories
# for i in directories:
#   print(i)
# print(glob(base_addreess+"/Train/*/*/"))
data = []
for i in directories:
    try:
        with open(i+'data.json', encoding='utf-8') as f:
            data.append(json.load(f))
    except:
        continue

In [None]:
data

[{'tweet_id': '1467731786151170049',
  'tweet': 'On December 6, 1992, a crowd of almost 150,000 people gather to listen to speeches by BJP and the Vishwa Hindu Parishad (VHP) leaders – including LK Advani and Murli Manohar Joshi – at the Babri Masjid in Ayodhya. \n\n#Ayodhya \n#BabriMasjid\nhttps://t.co/vjKjSkBxKH',
  'comments': [{'tweet_id': '1467735611318542336',
    'tweet': '@thewire_in Write timleline from begining, when Islamic Invaders demolished Mandir and Built mosque over it. 6 December was restoration process',
    'replies': [{'tweet_id': '1467794777189994498',
      'tweet': '@DrDoggu @thewire_in Tum jhut bol rahe go and read history'},
     {'tweet_id': '1467806305083543557',
      'tweet': '@DrDoggu @thewire_in dont forget this using gunpowder hindutva terriorists attacked dargah in MP , And many cases where you say to speak jai sri ram forcely , what you learn from sri ram chandar ji , we indian muslims never do this , many times your leaders say " mule kaate jaege....

In [None]:
data[0]

In [None]:
binary_labels = []
for i in directories:
    if('Hinglish' in i):
        with open(i+'binary_labels.json', encoding='utf-8') as f:
            binary_labels.append(json.load(f))
    else:
        try:
            with open(i+'labels.json', encoding='utf-8') as f:
                binary_labels.append(json.load(f))
        except:
            continue

In [None]:
binary_labels

In [None]:
data_label = []
for i in range(len(binary_labels)):
    for j in tr_flatten(data[i], binary_labels[i]):
        data_label.append(j)

In [None]:
data_test = []
for i in range(len(data)):
    for j in tr_flatten_test(data[i]):
        data_test.append(j)

In [None]:
len(data_test)

1077

In [None]:
train_len = len(data_label)
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)
df.loc[df['binary_label']=='NONE']='NOT'
print("Binary Distribution")
print(df['binary_label'].value_counts())

Binary Distribution
HOF    2612
NOT    2609
Name: binary_label, dtype: int64


In [None]:
train_len = len(data)
df_test = pd.DataFrame(data_test, columns = data_test[0].keys(), index = None)
# df.loc[df['binary_label']=='NONE']='NOT'
print("Binary Distribution")
# print(df_test['text'].value_counts())

Binary Distribution


In [None]:
df_test

Unnamed: 0,tweet_id,text
0,1467731786151170049,"On December 6, 1992, a crowd of almost 150,000..."
1,1467735611318542336,"On December 6, 1992, a crowd of almost 150,000..."
2,1467794777189994498,"On December 6, 1992, a crowd of almost 150,000..."
3,1467806305083543557,"On December 6, 1992, a crowd of almost 150,000..."
4,1467814387465424898,"On December 6, 1992, a crowd of almost 150,000..."
...,...,...
1072,1530970126769602560,Die Protagonisten der letzten 2 Jahre fordern ...
1073,1531008098911064072,Die Protagonisten der letzten 2 Jahre fordern ...
1074,1531010140744437760,Die Protagonisten der letzten 2 Jahre fordern ...
1075,1530498771473276929,Die Protagonisten der letzten 2 Jahre fordern ...


In [None]:
df

Unnamed: 0,tweet_id,text,binary_label
0,1443243483301437445,सत्ता में बैठे मठाधीशों के अहंकार को ठेस पहुँच...,HOF
1,1443243870523777024,सत्ता में बैठे मठाधीशों के अहंकार को ठेस पहुँच...,HOF
2,1443244146026696708,सत्ता में बैठे मठाधीशों के अहंकार को ठेस पहुँच...,HOF
3,1443244385160744962,सत्ता में बैठे मठाधीशों के अहंकार को ठेस पहुँच...,NOT
4,1443244745736667138,सत्ता में बैठे मठाधीशों के अहंकार को ठेस पहुँच...,NOT
...,...,...,...
5216,1484645122981642246,Frauen an der Macht: @VOGUE_Germany \nEin Port...,HOF
5217,1484709256532221953,Frauen an der Macht: @VOGUE_Germany \nEin Port...,HOF
5218,NOT,NOT,NOT
5219,1485356665859891208,Frauen an der Macht: @VOGUE_Germany \nEin Port...,HOF


In [None]:
d = {'HOF' : 1 , 'NOT' : 0}

In [None]:
df['label'] = df['binary_label'].map(d)

In [None]:
df['label']

0       1
1       1
2       1
3       0
4       0
       ..
5216    1
5217    1
5218    0
5219    1
5220    1
Name: label, Length: 5221, dtype: int64

In [None]:
df_test

Unnamed: 0,tweet_id,text
0,1367179784372047876,Habe heute in der Stadt viele Schwarze und Asi...
1,1371462211504373762,Habe heute in der Stadt viele Schwarze und Asi...
2,1486034666968731655,Max #Otte wäre ein wesentlich besserer Bundesp...
3,1486036441335537666,Max #Otte wäre ein wesentlich besserer Bundesp...
4,1486037004466937858,Max #Otte wäre ein wesentlich besserer Bundesp...
...,...,...
76,1530970126769602560,Die Protagonisten der letzten 2 Jahre fordern ...
77,1531008098911064072,Die Protagonisten der letzten 2 Jahre fordern ...
78,1531010140744437760,Die Protagonisten der letzten 2 Jahre fordern ...
79,1530498771473276929,Die Protagonisten der letzten 2 Jahre fordern ...


In [None]:
tweets = df.text
binary_y = df.label 



In [None]:
tweets_test = df_test.text

In [None]:
len(tweets_test)

1077

In [None]:
binary_y.unique()

array([1, 0])

In [None]:
binary_y

In [None]:
cleaned_tweets = [clean_tweet(tweet, english_stemmer, stopword) for tweet in tweets]

In [None]:
cleaned_tweets_test = [clean_tweet(tweet, english_stemmer, stopword) for tweet in tweets_test]

In [None]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(cleaned_tweets)
# X = X.todense()


In [None]:
X_test  = vectorizer.transform(cleaned_tweets_test)

In [None]:
# len(X_test)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, binary_y, test_size=0.3, random_state=42)


In [None]:
y_train

2610    0
4702    0
1879    1
1172    1
5095    1
       ..
466     1
3092    1
3772    1
5191    1
860     1
Name: label, Length: 3654, dtype: int64

KNN

In [None]:
classifier = KNeighborsClassifier(3)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [None]:
y_pred = classifier.predict(X_val)
print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

With K-Nearest Neighbour:
              precision    recall  f1-score   support

           0       0.66      0.78      0.71       761
           1       0.75      0.62      0.68       806

    accuracy                           0.70      1567
   macro avg       0.70      0.70      0.69      1567
weighted avg       0.70      0.70      0.69      1567



LOGISTIC REGRESSION

In [None]:
from sklearn import linear_model

In [None]:
reg = linear_model.LogisticRegression(C=25,penalty = 'l1', solver = 'saga')

In [None]:
reg.fit(X_train, y_train)



LogisticRegression(C=25, penalty='l1', solver='saga')

In [None]:
y_pred = reg.predict(X_test)

In [None]:
len(y_pred)

1077

In [None]:
len(y_pred)

81

In [None]:
y_test = []

In [None]:
df_submit = df_test

In [None]:
df_submit

Unnamed: 0,tweet_id,label
0,1467731786151170049,NOT
1,1467735611318542336,HOF
2,1467794777189994498,HOF
3,1467806305083543557,NOT
4,1467814387465424898,HOF
...,...,...
1072,1530970126769602560,HOF
1073,1531008098911064072,HOF
1074,1531010140744437760,HOF
1075,1530498771473276929,HOF


In [None]:
df_submit = df_test.drop(['text'], axis=1)

In [None]:
df_submit['label'] = y_pred

In [None]:
d = {1 : 'HOF' , 0 : 'NOT'}

In [None]:
df_submit['label'] = df_submit['label'].map(d)

In [None]:
df_submit.to_csv('german.csv',index = False)

In [None]:
df_submit[]

In [None]:
tweet_id = list(df_test['tweet_id'])

In [None]:
tweet_id

In [None]:
tweet_dict={}

In [None]:
for i in range(len(tweet_id)):
  if(y_pred[i] == 1):
    tweet_dict[tweet_id[i]] = 'HOF'
  else:
    tweet_dict[tweet_id[i]] = 'NOT'

In [None]:
tweet_dict

In [None]:
import json

In [None]:
json_object = json.dumps(tweet_dict)

In [None]:
json_object

In [None]:
with open("labels.json", "w") as outfile:
    outfile.write(json_object)

In [None]:
y_pred = reg.predict(X_val)
print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

With K-Nearest Neighbour:
              precision    recall  f1-score   support

           0       0.72      0.74      0.73       761
           1       0.75      0.72      0.74       806

    accuracy                           0.73      1567
   macro avg       0.73      0.73      0.73      1567
weighted avg       0.73      0.73      0.73      1567



In [None]:
# f = open("result/lg_agn.txt",'w')
# f.write(f"LogisticRegression(C = {0},tol = {0},penalty ={0})")
# f.close()
from sklearn.metrics import f1_score
C = [1,5,25,50,100,200]
penalty = ["l2" , 'l1' ,'elasticnet']
solver = ['newton-cg', 'lbfgs', 'sag','liblinear','saga']
# l1_ratio = [0.0,0.3,0.5,0.8,1.0]

In [None]:
import joblib

In [None]:
score = 0
for i in C:
  for k in penalty:
    for l in solver :
      print(i,k,l)
      try:
        model = LogisticRegression(C = i,penalty =k  , solver = l , random_state = 42)
        model.fit(X_train,y_train)
      except :
        continue
      pred_m = model.predict(X_val)
      f = f1_score(y_val,pred_m,average = 'macro')
      if(score<f):
          score = f
          # f = open("result/lg_agn.txt",'w')
          # f.write(f"LogisticRegression(C = {i},penalty = {k},solver={l})")
          # f.close()
          # joblib.dump(model,"result/agn_lg.joblib")
          print(i,k,l,score)
          # print(score)
              

1 l2 newton-cg
1 l2 newton-cg 0.7164953324228489
1 l2 lbfgs
1 l2 sag
1 l2 liblinear
1 l2 saga
1 l1 newton-cg
1 l1 lbfgs
1 l1 sag
1 l1 liblinear
1 l1 saga
1 elasticnet newton-cg
1 elasticnet lbfgs
1 elasticnet sag
1 elasticnet liblinear
1 elasticnet saga
5 l2 newton-cg
5 l2 newton-cg 0.7346342302862761
5 l2 lbfgs
5 l2 sag
5 l2 liblinear
5 l2 saga
5 l1 newton-cg
5 l1 lbfgs
5 l1 sag
5 l1 liblinear
5 l1 saga




5 elasticnet newton-cg
5 elasticnet lbfgs
5 elasticnet sag
5 elasticnet liblinear
5 elasticnet saga
25 l2 newton-cg
25 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


25 l2 sag
25 l2 liblinear
25 l2 saga




25 l1 newton-cg
25 l1 lbfgs
25 l1 sag
25 l1 liblinear
25 l1 saga




25 elasticnet newton-cg
25 elasticnet lbfgs
25 elasticnet sag
25 elasticnet liblinear
25 elasticnet saga
50 l2 newton-cg
50 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


50 l2 sag




50 l2 liblinear
50 l2 saga




50 l1 newton-cg
50 l1 lbfgs
50 l1 sag
50 l1 liblinear
50 l1 saga




50 elasticnet newton-cg
50 elasticnet lbfgs
50 elasticnet sag
50 elasticnet liblinear
50 elasticnet saga
100 l2 newton-cg
100 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


100 l2 sag




100 l2 liblinear
100 l2 saga




100 l1 newton-cg
100 l1 lbfgs
100 l1 sag
100 l1 liblinear
100 l1 saga




100 elasticnet newton-cg
100 elasticnet lbfgs
100 elasticnet sag
100 elasticnet liblinear
100 elasticnet saga
200 l2 newton-cg
200 l2 lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


200 l2 sag




200 l2 liblinear
200 l2 saga




200 l1 newton-cg
200 l1 lbfgs
200 l1 sag
200 l1 liblinear
200 l1 saga
200 elasticnet newton-cg
200 elasticnet lbfgs
200 elasticnet sag
200 elasticnet liblinear
200 elasticnet saga




SVM

In [None]:
from sklearn.svm import SVC

In [None]:
model3 = SVC(C = 5 ,kernel= 'rbf')
model3.fit(X,binary_y)

SVC(C=5)

In [None]:
y_pred = model3.predict(X_test)

In [None]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
y_pred = model3.predict(X_val)
print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

With K-Nearest Neighbour:
              precision    recall  f1-score   support

           0       0.72      0.73      0.73       761
           1       0.74      0.74      0.74       806

    accuracy                           0.73      1567
   macro avg       0.73      0.73      0.73      1567
weighted avg       0.73      0.73      0.73      1567



In [None]:
mx = 0
for kernel in ['linear', 'poly', 'rbf']:
  for C in [1,5,10,25,50,100,200]:
    if(kernel=='poly'):
      if(C != 200):
        for degree in [1,2,3,4]:
            model = SVC(C = C,kernel=kernel,degree=degree)
            model.fit(X_train,y_train)
             pred = model.predict(X_val)
            f = f1_score(y_val,pred,average='macro')
            if(mx<f):
                mx = max(mx,f)
                # s = open('result/gen_svm.txt' , 'w')
                # s.write(f"SVC(C= {C}, degree = {degree} , kernel = {kernel}) score={mx}")
                # s.close()
                # joblib.dump(model,'result/gen_svm.joblib')
                print(kernel,C,degree,mx)
            print(f,f"C = {C} , kernel = {kernel} , degree = {degree}")
    else:
        model = SVC(C = C,kernel=kernel)
        model.fit(X_train,y_train)
        pred = model.predict(X_val)
        f = f1_score(y_val,pred,average='macro')
        if(mx<f):
            mx = max(mx,f)
            # s = open('result/gen_svm.txt' , 'w')
            # s.write(f"SVC(C= {C}, degree = {1} , kernal = {kernel}) score={mx}")
            # s.close()
            # joblib.dump(model,'result/gen_svm.joblib')
            print(kernel,C,mx)
        print(f,f"C = {C} , kernel = {kernel}")

linear 1 0.7128634681150396
0.7128634681150396 C = 1 , kernel = linear
linear 5 0.7226868815300338
0.7226868815300338 C = 5 , kernel = linear
0.7132182299301023 C = 10 , kernel = linear
0.7077831787729398 C = 25 , kernel = linear
0.7064401382409896 C = 50 , kernel = linear
0.702353180744856 C = 100 , kernel = linear
0.6969218485181543 C = 200 , kernel = linear
0.7128634681150396 C = 1 , kernel = poly , degree = 1
0.7127762271660726 C = 1 , kernel = poly , degree = 2
0.7105156953151779 C = 1 , kernel = poly , degree = 3
0.7007083347777354 C = 1 , kernel = poly , degree = 4
0.7220051484784408 C = 5 , kernel = poly , degree = 1
0.7225465879358643 C = 5 , kernel = poly , degree = 2
0.708603937594202 C = 5 , kernel = poly , degree = 3
0.7065009988361147 C = 5 , kernel = poly , degree = 4
0.713895017466446 C = 10 , kernel = poly , degree = 1
0.7171003702083681 C = 10 , kernel = poly , degree = 2
0.705190199871051 C = 10 , kernel = poly , degree = 3
0.6984362575220753 C = 10 , kernel = poly ,

MNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
score =0
for i in np.arange(1e-05,1e-01,1e-04):
  print(i)
  model = MultinomialNB(alpha=i)
  model.fit(X_train,y_train)
  pred = model.predict(X_val)
  f = f1_score(y_val,pred,average='macro')
  if(score<f):
    score = f
    # f = open("result/gen_mnb.txt",'w')
    # f.write(f"MultinomialNB(alpha = {i})")
    # f.close()
    # joblib.dump(model,'result/gen_mnb.joblib')
    print(i,score)

1e-05
1e-05 0.7120867592750202
0.00011
0.00021
0.00021 0.7122688258330776
0.00031000000000000005
0.00041000000000000005
0.00041000000000000005 0.7138589276014589
0.00051
0.0006100000000000001
0.00071
0.00071 0.7139128410879346
0.0008100000000000001
0.0008100000000000001 0.7140174387041736
0.0009100000000000001
0.00101
0.00101 0.7147237663802172
0.00111
0.00111 0.716085466779357
0.0012100000000000001
0.0012100000000000001 0.7167410821450861
0.0013100000000000002
0.00141
0.00151
0.00161
0.0017100000000000001
0.0018100000000000002
0.00191
0.00201
0.0021100000000000003
0.00221
0.00231
0.0024100000000000002
0.00251
0.00251 0.7168404264744972
0.0026100000000000003
0.00271
0.00281
0.00281 0.7182496179655837
0.0029100000000000003
0.00301
0.0031100000000000004
0.00321
0.00331
0.0034100000000000003
0.00351
0.0036100000000000004
0.00371
0.00381
0.00391
0.00401
0.00411
0.00421
0.00431
0.00441
0.00451
0.0046099999999999995
0.00471
0.00481
0.0049099999999999994
0.00501
0.00511
0.00521
0.00531
0.0054

In [None]:
model = MultinomialNB(alpha=0.06891)
model.fit(X_train,y_train)

MultinomialNB(alpha=0.06891)

In [None]:
y_pred = model.predict(X_val)
# print("With K-Nearest Neighbour:")
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73       761
           1       0.75      0.71      0.73       806

    accuracy                           0.73      1567
   macro avg       0.73      0.73      0.73      1567
weighted avg       0.73      0.73      0.73      1567



randomforest

In [None]:
from sklearn.ensemble import RandomForestClassifier 

In [None]:
from sklearn.metrics import f1_score
max_features = ['auto', 'sqrt', 'log2']
max_depth = [4,5,6,7,8]
criterion =['gini', 'entropy'] 

In [None]:
score=0
for i in max_features:
  for j in max_depth:
    for k in criterion:
      print(i,j,k)
      model=RandomForestClassifier(max_features=i,max_depth=j,criterion=k)
      model.fit(X_train,y_train)
      pred=model.predict(X_val)
      f=f1_score(y_val,pred,average='macro')
      if(score<f):
        score=f
        # f=open("result/gen_rf.txt",'w')
        # f.write(f"RandomForestClassifier(C={i},tol={j},penalty={k}) score={score}")
        # f.close()
        # joblib.dump(model,'result/gen_rf.joblib')
        print(i,j,k,score)

auto 4 gini
auto 4 gini 0.5767508625497676
auto 4 entropy
auto 5 gini
auto 5 gini 0.6058755731900609
auto 5 entropy
auto 6 gini
auto 6 gini 0.6143004050341967
auto 6 entropy
auto 7 gini
auto 7 entropy
auto 7 entropy 0.6172945145306954
auto 8 gini
auto 8 gini 0.652632253352383
auto 8 entropy
sqrt 4 gini
sqrt 4 entropy
sqrt 5 gini
sqrt 5 entropy
sqrt 6 gini
sqrt 6 entropy
sqrt 7 gini
sqrt 7 entropy
sqrt 8 gini
sqrt 8 entropy
log2 4 gini
log2 4 entropy
log2 5 gini
log2 5 entropy
log2 6 gini
log2 6 entropy
log2 7 gini
log2 7 entropy
log2 8 gini
log2 8 entropy


ANN

In [None]:
le = LabelEncoder() #label encoding labels for training Dense Neural Network
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [None]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

In [None]:
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy']) #compiling a neural network with 3 layers for classification
model.fit(X_train, y_train, epochs = 5, batch_size = 32)


In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

print("With MLP:")

print(classification_report(y_val, y_pred)) 

With MLP:
              precision    recall  f1-score   support

           0       0.67      0.77      0.72       522
           1       0.73      0.63      0.68       523

    accuracy                           0.70      1045
   macro avg       0.70      0.70      0.70      1045
weighted avg       0.70      0.70      0.70      1045

