In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

time: 5.43 ms (started: 2021-09-07 03:29:59 +00:00)


In [13]:
!pip install -qq ipython-autotime

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.94 s (started: 2021-09-07 03:30:07 +00:00)


### Wrapper for applying different models and vectorizers



In [1]:
import nltk
nltk.download('punkt')

from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [43]:
import pickle

time: 1.18 ms (started: 2021-09-07 04:06:14 +00:00)


In [47]:
def save_model(clf,task,model,vec):

  """
  Util function to save a trained model to a pickle file
  --------------------------

  Params:
  clf   - trained model (any picklable object)
  task  - task identifier, used as the last part of the file name
  model - technique used ['LR','NB','SVC']
  vec   - vectorizer used ['CV','TFIDF']

  Return:
  modelfile - name of the pickle file written: '<model>_<vec>_<task>.pkl'

  """

  modelfile = model + '_' + vec + '_' + task + '.pkl'
  # The context manager closes (and flushes) the file even if pickling fails.
  with open(modelfile,'wb') as fh:
    pickle.dump(clf,fh)
  return modelfile

time: 4.49 ms (started: 2021-09-07 04:24:00 +00:00)


In [96]:
def save_prediction(pkl,task,model,vec):

  """
  Util function to save the test-set predictions of a pickled model to a csv file
  -----------------------------------------------
  Parameters:
  pkl   - path of the pickle file holding the saved model; the loaded object is
          called as predict(<Text column>), so it must accept raw text
  task  - 'a' or 'b'; selects the id -> label mapping and is part of the csv name
  model - technique used, part of the csv name
  vec   - vectorizer used, part of the csv name

  Return:
  None - predictions are written to '<model>_<vec>_<task>.csv'

  Raises:
  ValueError - if task is neither 'a' nor 'b' (checked before any file is read)

  """
  id2labels_by_task = {
      'a': {1:'OAG',2:'CAG',0:'NAG'},
      'b': {0:'NGEN',1:'GEN'},
  }
  # Validate up front instead of failing later on an unbound mapping.
  if task not in id2labels_by_task:
    raise ValueError("Task not found")
  task_id2labels = id2labels_by_task[task]

  test = pd.read_csv('trac2_hin_test.csv')

  # NOTE: unpickling can execute arbitrary code; only load model files you created.
  with open(pkl,'rb') as fh:
    clf = pickle.load(fh)
  preds = clf.predict(test.Text)

  preds_df = pd.DataFrame(preds,columns = ['Label'])
  preds_df['ID'] = test['ID']
  print(preds_df)

  # Map integer class ids back to the original string labels before writing.
  preds_df = preds_df.replace({'Label':task_id2labels})

  csvfile = model + '_'+ vec + '_' + task + '.csv'
  preds_df.to_csv(csvfile)

time: 10.8 ms (started: 2021-09-07 05:45:55 +00:00)


In [2]:
def labelencoder():

  """
  Build the label-to-integer encodings for both sub-tasks.

  Returns:
          tuple(dict, dict): (task A encoding, task B encoding); each dict maps
          a label string to its integer id
  """

  return dict(NAG=0,OAG=1,CAG=2), dict(NGEN=0,GEN=1)

In [3]:
def get_vec(vectorizer):

  """
  Build a new, unfitted text vectorizer of the requested type.

  Parameters:
          vectorizer(str): "CV" for a CountVectorizer, "TFIDF" for a
                           TfidfVectorizer over 1- to 3-grams; both tokenize
                           with word_tokenize

  Returns:
          vec: the instantiated vectorizer

  Raises:
          ValueError: if the vectorizer type is not one of the above
  """

  if vectorizer == "CV":
    return CountVectorizer(tokenizer=word_tokenize)

  if vectorizer == "TFIDF":
    return TfidfVectorizer(tokenizer=word_tokenize,ngram_range=(1,3))

  raise ValueError("Vectorizer not found!")

In [4]:
from sklearn import svm

In [5]:
def get_model(Model):

  """
  Build a new, untrained classifier of the requested type.

  Parameters:
          Model(str): "LR" (LogisticRegression), "NB" (MultinomialNB)
                      or "SVC" (SVC with a linear kernel)

  Returns:
          model: the instantiated classifier

  Raises:
          ValueError: if the model type is not one of the above
  """

  if Model == "LR":
    return LogisticRegression()

  if Model == "NB":
    return naive_bayes.MultinomialNB()

  if Model == "SVC":
    return svm.SVC(kernel='linear')

  raise ValueError("Model not found!")


In [None]:
def BaselineModel(task,Model='LR',vectorizer='CV',path_to_data='trac2_hin_train.csv'):

  """
  Train a baseline model with the chosen vectorizer on a stratified split of
  the data and print the classification report for the held-out split.

  Parameters:
          task (str) : task to train for
                      - A : uses the aggression labels
                      - B : uses the gendered labels
          Model (str) : model type
                      - LR : Logistic Regression
                      - NB : Naive Bayes
                      - SVC : SVM Classifier
          vectorizer (str) : vectorizer type
                      - CV : Count Vectorizer
                      - TFIDF : TfIdf Vectorizer
          path_to_data(str): data file path; the csv must have exactly four
                      columns, which are renamed to ID, Text, aggressive, gendered

  Returns:
          (model, vec) : trained model and the vectorizer fitted on the train split

  Raises:
          ValueError : unknown task (raised before any data is read), or unknown
                       Model / vectorizer (raised by get_model / get_vec)

  """

  model_dict = {"LR":"Logistic Regression","NB":"Naive Bayes","SVC":"SVM Classifier"}
  vec_dict = {"CV":"CountVectorizer","TFIDF":"TfidfVectorizer"}

  # Fail fast: reject an unknown task before loading data or training anything.
  if task == "A":
    target = 'aggressive'
  elif task == "B":
    target = 'gendered'
  else:
    raise ValueError("Task not found")

  data = pd.read_csv(path_to_data)
  data.columns = ['ID','Text','aggressive','gendered']

  task_a_labels,task_b_labels  = labelencoder()

  # Encode the string labels of both tasks as integers.
  data = data.replace({'aggressive':task_a_labels,'gendered':task_b_labels})

  x_train,x_test,y_train,y_test = train_test_split(data['Text'],data[target],stratify=data[target],random_state=21)

  vec = get_vec(vectorizer)

  # Fit the vocabulary on the training split only, then vectorize both splits.
  vec.fit(x_train)

  x_train_cv = vec.transform(x_train)
  x_test_cv = vec.transform(x_test)

  model = get_model(Model)

  model.fit(x_train_cv,y_train)

  preds = model.predict(x_test_cv)

  report = metrics.classification_report(y_test,preds)
  # .get() keeps the header printable for any model/vectorizer key that has no
  # display name, instead of raising KeyError after training has finished.
  print(model_dict.get(Model,Model) + " with " + vec_dict.get(vectorizer,vectorizer))
  print("\n")
  print(report)

  return model,vec

time: 31.7 ms (started: 2021-09-05 03:52:34 +00:00)


In [15]:
def BaselineModelTest(task,Model='LR',vectorizer='CV',path_to_data='trac2_hin_train.csv'):

  """
  Fit one baseline classifier on a stratified train split of the csv and
  print its classification report on the remaining split.

  Parameters:
          task (str) : "A" (aggression labels) or "B" (gendered labels)
          Model (str) : model type
                      - LR : Logistic Regression
                      - NB : Naive Bayes
                      - SVC : SVM Classifier
          vectorizer (str) : vectorizer type
                      - CV : Count Vectorizer
                      - TFIDF : TfIdf Vectorizer
          path_to_data(str): data file path; its four columns are renamed to
                      ID, Text, aggressive, gendered

  Returns:
          (model, vec) : the trained model and the fitted vectorizer

  Raises:
          ValueError : if task is neither "A" nor "B"

  """

  model_names = {"LR":"Logistic Regression","NB":"Naive Bayes","SVC": "SVM Classifier"}
  vec_names = {"CV":"CountVectorizer","TFIDF":"TfidfVectorizer"}

  frame = pd.read_csv(path_to_data)
  frame.columns = ['ID','Text','aggressive','gendered']

  # Swap the string labels of both tasks for their integer ids.
  encode_a,encode_b = labelencoder()
  frame = frame.replace({'aggressive':encode_a,'gendered':encode_b})

  if task not in ("A","B"):
    raise ValueError("Task not found")
  target = 'aggressive' if task == "A" else 'gendered'

  labels = frame[target]
  x_train,x_test,y_train,y_test = train_test_split(frame['Text'],labels,stratify=labels,random_state=21)

  # Vocabulary comes from the training split only.
  vec = get_vec(vectorizer)
  vec.fit(x_train)
  train_features = vec.transform(x_train)
  test_features = vec.transform(x_test)

  model = get_model(Model)
  model.fit(train_features,y_train)

  report = metrics.classification_report(y_test,model.predict(test_features))
  print(model_names[Model] + " with " + vec_names[vectorizer])
  print("\n")
  print(report)

  return model,vec

time: 41.7 ms (started: 2021-09-07 03:31:30 +00:00)


In [None]:
def BaselineEvaluate(task,model,vec,eval='dev'):

  """
  Evaluate the model on the Dev or Test set and print the classification report.

  Forwards to BaselineEvaluateTest, which implements the evaluation for both
  sets, so the two entry points cannot drift apart.

  Parameters:
          task (str) : Type of Task
                      - A
                      - B
          model  : trained model
          vec : fitted vectorizer used to transform the text
          eval (str) : evaluation set
                      - 'dev' (default)
                      - 'test'

  Raises:
          ValueError : unknown task or eval value (raised by BaselineEvaluateTest)

  """

  return BaselineEvaluateTest(task,model,vec,eval)

time: 13.4 ms (started: 2021-09-05 04:00:23 +00:00)


In [None]:
tt = pd.read_csv('trac2_hin_gold_b.csv')
tt.head()

Unnamed: 0,ID,Sub-task B
0,C52.17,NGEN
1,C52.39,NGEN
2,C52.73,NGEN
3,C60.3,NGEN
4,C60.43,NGEN


time: 22.1 ms (started: 2021-09-04 17:31:54 +00:00)


In [67]:
def BaselineEvaluateTest(task,model,vec,eval='dev'):

  """
  Print the classification report of a trained model on the dev or test set.

  Parameters:
          task (str) : Type of Task
                      - A
                      - B
          model  : trained model
          vec : fitted vectorizer used to transform the text
          eval (str) : evaluation set
                      - 'dev' (default): labels are read from the dev file itself
                      - 'test': labels are read from the gold file of the task

  Raises:
          ValueError : if task or eval is not one of the values above

  """
  encode_a,encode_b = labelencoder()
  encoders = {'Sub-task A':encode_a,'Sub-task B':encode_b}

  if task not in ("A","B"):
    raise ValueError("Task not found")
  col = 'Sub-task ' + task

  if eval not in ('dev','test'):
    raise ValueError("eval type not found")

  # Gold labels live in a separate file only for the test set.
  gold = None
  if eval == 'dev':
    print("Dev set evaluation report")
    path_to_data = 'trac2_hin_dev.csv'
  else:
    print("Test set evaluation report")
    path_to_data = 'trac2_hin_test.csv'
    gold = pd.read_csv('trac2_hin_gold_' + task.lower() + '.csv').replace(encoders)

  eval_data = pd.read_csv(path_to_data).replace(encoders)

  features = vec.transform(eval_data['Text'])
  eval_preds = model.predict(features)

  y_true = eval_data[col] if gold is None else gold[col]
  print(metrics.classification_report(y_true,eval_preds))

time: 27.2 ms (started: 2021-09-07 04:40:14 +00:00)


### Task B Modelling

#### Logistic Regression Models

In [83]:
model,vec = BaselineModelTest('B','LR','CV')

Logistic Regression with CountVectorizer


              precision    recall  f1-score   support

           0       0.89      0.98      0.93       831
           1       0.79      0.37      0.50       165

    accuracy                           0.88       996
   macro avg       0.84      0.68      0.72       996
weighted avg       0.87      0.88      0.86       996

time: 1.79 s (started: 2021-09-07 05:17:59 +00:00)


In [81]:
BaselineEvaluateTest('B',model,vec,'dev')

Dev set evaluation report
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       845
           1       0.62      0.33      0.43       152

    accuracy                           0.87       997
   macro avg       0.75      0.65      0.68       997
weighted avg       0.85      0.87      0.85       997

time: 240 ms (started: 2021-09-07 04:54:46 +00:00)


In [82]:
BaselineEvaluateTest('B',model,vec,'test')

Test set evaluation report
              precision    recall  f1-score   support

           0       0.70      0.93      0.80       633
           1       0.87      0.57      0.69       567

    accuracy                           0.76      1200
   macro avg       0.79      0.75      0.74      1200
weighted avg       0.78      0.76      0.75      1200

time: 326 ms (started: 2021-09-07 04:54:49 +00:00)


In [97]:
lr_b_pkl = save_model(model,'b','lr','cv')

time: 2.83 ms (started: 2021-09-07 05:46:49 +00:00)


In [None]:
save_prediction(lr_b_pkl,'b','lr','cv')

In [71]:
model,vec = BaselineModelTest('B','LR','TFIDF')

Logistic Regression with TfidfVectorizer


              precision    recall  f1-score   support

           0       0.84      1.00      0.91       831
           1       0.88      0.04      0.08       165

    accuracy                           0.84       996
   macro avg       0.86      0.52      0.50       996
weighted avg       0.85      0.84      0.77       996

time: 2.72 s (started: 2021-09-07 04:41:11 +00:00)


In [72]:
BaselineEvaluateTest('B',model,vec)

Dev set evaluation report
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       845
           1       1.00      0.03      0.05       152

    accuracy                           0.85       997
   macro avg       0.93      0.51      0.49       997
weighted avg       0.87      0.85      0.79       997

time: 274 ms (started: 2021-09-07 04:41:21 +00:00)


In [73]:
BaselineEvaluateTest('B',model,vec,'test')

Test set evaluation report
              precision    recall  f1-score   support

           0       0.55      1.00      0.71       633
           1       0.95      0.10      0.19       567

    accuracy                           0.57      1200
   macro avg       0.75      0.55      0.45      1200
weighted avg       0.74      0.57      0.46      1200

time: 389 ms (started: 2021-09-07 04:41:26 +00:00)


#### Naive Bayes Models

In [74]:
model,vec = BaselineModelTest('B','NB','CV')

Naive Bayes with CountVectorizer


              precision    recall  f1-score   support

           0       0.86      0.97      0.92       831
           1       0.63      0.22      0.33       165

    accuracy                           0.85       996
   macro avg       0.75      0.60      0.62       996
weighted avg       0.82      0.85      0.82       996

time: 1.46 s (started: 2021-09-07 04:41:34 +00:00)


In [75]:
BaselineEvaluateTest('B',model,vec)

Dev set evaluation report
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       845
           1       0.68      0.20      0.31       152

    accuracy                           0.86       997
   macro avg       0.78      0.59      0.62       997
weighted avg       0.84      0.86      0.83       997

time: 219 ms (started: 2021-09-07 04:41:38 +00:00)


In [76]:
BaselineEvaluateTest('B',model,vec,'test')

Test set evaluation report
              precision    recall  f1-score   support

           0       0.65      0.95      0.77       633
           1       0.89      0.42      0.57       567

    accuracy                           0.70      1200
   macro avg       0.77      0.69      0.67      1200
weighted avg       0.76      0.70      0.68      1200

time: 306 ms (started: 2021-09-07 04:41:45 +00:00)


In [77]:
model,vec = BaselineModelTest('B','NB','TFIDF')

Naive Bayes with TfidfVectorizer


              precision    recall  f1-score   support

           0       0.83      1.00      0.91       831
           1       0.00      0.00      0.00       165

    accuracy                           0.83       996
   macro avg       0.42      0.50      0.45       996
weighted avg       0.70      0.83      0.76       996

time: 2.16 s (started: 2021-09-07 04:41:53 +00:00)


In [78]:
BaselineEvaluateTest('B',model,vec)

Dev set evaluation report
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       845
           1       0.00      0.00      0.00       152

    accuracy                           0.85       997
   macro avg       0.42      0.50      0.46       997
weighted avg       0.72      0.85      0.78       997

time: 271 ms (started: 2021-09-07 04:41:59 +00:00)


In [79]:
BaselineEvaluateTest('B',model,vec,'test')

Test set evaluation report
              precision    recall  f1-score   support

           0       0.53      1.00      0.70       633
           1       1.00      0.02      0.05       567

    accuracy                           0.54      1200
   macro avg       0.77      0.51      0.37      1200
weighted avg       0.75      0.54      0.39      1200

time: 391 ms (started: 2021-09-07 04:42:05 +00:00)


### Task A Modelling

#### Logistic Regression Models

In [17]:
model,vec = BaselineModelTest('A','LR','CV')

Logistic Regression with CountVectorizer


              precision    recall  f1-score   support

           0       0.73      0.86      0.79       561
           1       0.65      0.55      0.60       228
           2       0.49      0.34      0.40       207

    accuracy                           0.68       996
   macro avg       0.63      0.59      0.60       996
weighted avg       0.67      0.68      0.67       996

time: 2.74 s (started: 2021-09-07 03:31:44 +00:00)


In [21]:
BaselineEvaluateTest('A',model,vec)

Dev set Evaluation report
              precision    recall  f1-score   support

           0       0.73      0.88      0.80       578
           1       0.60      0.48      0.53       208
           2       0.42      0.26      0.32       211

    accuracy                           0.67       997
   macro avg       0.58      0.54      0.55       997
weighted avg       0.64      0.67      0.64       997

time: 214 ms (started: 2021-09-07 03:32:22 +00:00)


In [24]:
BaselineEvaluateTest('A',model,vec,'test')

Test set Evaluation report
              precision    recall  f1-score   support

           0       0.61      0.81      0.69       325
           1       0.89      0.73      0.80       684
           2       0.47      0.51      0.49       191

    accuracy                           0.72      1200
   macro avg       0.66      0.68      0.66      1200
weighted avg       0.75      0.72      0.72      1200

time: 331 ms (started: 2021-09-07 03:32:44 +00:00)


In [25]:
model,vec = BaselineModelTest('A','LR','TFIDF')

Logistic Regression with TfidfVectorizer


              precision    recall  f1-score   support

           0       0.69      0.93      0.79       561
           1       0.70      0.42      0.53       228
           2       0.49      0.24      0.32       207

    accuracy                           0.67       996
   macro avg       0.63      0.53      0.55       996
weighted avg       0.65      0.67      0.63       996

time: 7.33 s (started: 2021-09-07 03:33:00 +00:00)


In [26]:
BaselineEvaluateTest('A',model,vec)

Dev set Evaluation report
              precision    recall  f1-score   support

           0       0.68      0.91      0.78       578
           1       0.68      0.38      0.48       208
           2       0.46      0.24      0.31       211

    accuracy                           0.65       997
   macro avg       0.61      0.51      0.52       997
weighted avg       0.63      0.65      0.62       997

time: 267 ms (started: 2021-09-07 03:33:18 +00:00)


In [27]:
BaselineEvaluateTest('A',model,vec,'test')

Test set Evaluation report
              precision    recall  f1-score   support

           0       0.51      0.88      0.65       325
           1       0.92      0.65      0.76       684
           2       0.50      0.42      0.46       191

    accuracy                           0.67      1200
   macro avg       0.64      0.65      0.62      1200
weighted avg       0.74      0.67      0.68      1200

time: 383 ms (started: 2021-09-07 03:33:19 +00:00)


#### Pipeline for LR


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
from sklearn import metrics
from sklearn.svm import SVC

In [120]:
# Fetch the encoding here so this cell runs without relying on a later cell.
task_b_labels = labelencoder()[1]
test_labels_b = pd.read_csv('trac2_hin_gold_b.csv')
test_labels_b = test_labels_b.replace({'Sub-task B':task_b_labels})

time: 13.2 ms (started: 2021-09-07 06:09:40 +00:00)


In [128]:
# Fetch the encoding here so this cell runs without relying on a later cell.
task_a_labels = labelencoder()[0]
test_labels_a = pd.read_csv('trac2_hin_gold_a.csv')
test_labels_a = test_labels_a.replace({'Sub-task A':task_a_labels})

time: 16.9 ms (started: 2021-09-07 06:12:43 +00:00)


In [129]:
test_labels = pd.merge(test_labels_a,test_labels_b,on='ID')

time: 9.53 ms (started: 2021-09-07 06:12:44 +00:00)


In [130]:
print(test_labels['Sub-task A'].unique())
print(test_labels['Sub-task B'].unique())

[0 1 2]
[0 1]
time: 5.44 ms (started: 2021-09-07 06:12:45 +00:00)


In [143]:
# Load and encode both splits here so the cell works on a fresh top-to-bottom run
# and make_pipeline can train on either sub-task.
task_a_labels,task_b_labels = labelencoder()
train = pd.read_csv('trac2_hin_train.csv').replace({'Sub-task A':task_a_labels,'Sub-task B':task_b_labels})
dev = pd.read_csv('trac2_hin_dev.csv').replace({'Sub-task A':task_a_labels,'Sub-task B':task_b_labels})

time: 5.42 ms (started: 2021-09-07 06:23:04 +00:00)


In [152]:
def make_pipeline(task):

  """
  Fit a CountVectorizer (1- to 3-grams) + LogisticRegression pipeline on the
  global `train` frame and print two classification reports: one for the
  global `dev` frame, one for `test` scored against `test_labels`.

  Parameters :
  task : sub-task letter, upper-cased to select the 'Sub-task <LETTER>' column

  Returns :
  text_clf : trained model pipeline
  """

  steps = [
      ('cv',CountVectorizer(ngram_range=(1,3))),
      ('clf',LogisticRegression()),
  ]
  text_clf = Pipeline(steps)

  label_col = f'Sub-task {task.upper()}'
  text_clf.fit(train.Text, train[label_col])

  # Dev-set report
  dev_preds = text_clf.predict(dev.Text)
  dev_report = metrics.classification_report(dev[label_col],dev_preds)
  print(dev_report)

  # Test-set report against the gold labels
  test_preds = text_clf.predict(test.Text)
  test_report = metrics.classification_report(test_labels[label_col],test_preds)
  print(test_report)

  return text_clf

time: 9.66 ms (started: 2021-09-07 06:25:34 +00:00)


In [92]:
test = pd.read_csv('trac2_hin_test.csv')

time: 18.3 ms (started: 2021-09-07 05:45:40 +00:00)


Task B

In [153]:
lr_b = make_pipeline('b')

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       845
           1       0.78      0.28      0.42       152

    accuracy                           0.88       997
   macro avg       0.83      0.63      0.67       997
weighted avg       0.87      0.88      0.85       997

              precision    recall  f1-score   support

           0       0.70      0.95      0.81       633
           1       0.90      0.55      0.68       567

    accuracy                           0.76      1200
   macro avg       0.80      0.75      0.74      1200
weighted avg       0.80      0.76      0.75      1200

time: 2.34 s (started: 2021-09-07 06:25:41 +00:00)


In [155]:
lr_pkl_b = save_model(lr_b,'b','lr','cv')

time: 490 ms (started: 2021-09-07 06:28:23 +00:00)


In [157]:
save_prediction(lr_pkl_b,'b','lr','cv')

      Label       ID
0         0   C52.17
1         0   C52.39
2         0   C52.73
3         0    C60.3
4         0   C60.43
...     ...      ...
1195      0  C8.5029
1196      1  C8.5037
1197      0  C8.5046
1198      1  C8.5047
1199      1  C8.5076

[1200 rows x 2 columns]
time: 189 ms (started: 2021-09-07 06:29:14 +00:00)


Task A

In [154]:
lr_a = make_pipeline('a')

              precision    recall  f1-score   support

           0       0.71      0.90      0.80       578
           1       0.63      0.47      0.54       208
           2       0.48      0.24      0.32       211

    accuracy                           0.67       997
   macro avg       0.61      0.54      0.55       997
weighted avg       0.65      0.67      0.64       997

              precision    recall  f1-score   support

           0       0.56      0.87      0.68       325
           1       0.91      0.70      0.79       684
           2       0.54      0.48      0.51       191

    accuracy                           0.71      1200
   macro avg       0.67      0.68      0.66      1200
weighted avg       0.75      0.71      0.72      1200

time: 7.42 s (started: 2021-09-07 06:26:37 +00:00)


In [94]:
# Save the Task A pipeline fitted in the cell above (`lr_a`); `text_clf` is not defined here.
# NOTE(review): the 'tfidf' tag only names the file; make_pipeline uses CountVectorizer, so confirm the tag.
lr_pkl = save_model(lr_a,'a','lr','tfidf')

time: 253 ms (started: 2021-09-07 05:45:43 +00:00)


In [95]:
save_prediction(lr_pkl,'a','lr','tfidf')

      Label       ID
0         0   C52.17
1         0   C52.39
2         0   C52.73
3         0    C60.3
4         0   C60.43
...     ...      ...
1195      1  C8.5029
1196      1  C8.5037
1197      1  C8.5046
1198      1  C8.5047
1199      1  C8.5076

[1200 rows x 2 columns]
time: 153 ms (started: 2021-09-07 05:45:44 +00:00)


#### Naive Bayes Models

In [28]:
model,vec = BaselineModelTest('A','NB','CV')

Naive Bayes with CountVectorizer


              precision    recall  f1-score   support

           0       0.72      0.88      0.79       561
           1       0.65      0.56      0.60       228
           2       0.43      0.23      0.30       207

    accuracy                           0.67       996
   macro avg       0.60      0.56      0.56       996
weighted avg       0.64      0.67      0.64       996

time: 1.41 s (started: 2021-09-07 03:33:25 +00:00)


In [29]:
BaselineEvaluateTest('A',model,vec)

Dev set Evaluation report
              precision    recall  f1-score   support

           0       0.74      0.88      0.81       578
           1       0.64      0.59      0.61       208
           2       0.50      0.28      0.36       211

    accuracy                           0.69       997
   macro avg       0.63      0.58      0.59       997
weighted avg       0.67      0.69      0.67       997

time: 217 ms (started: 2021-09-07 03:33:29 +00:00)


In [30]:
BaselineEvaluateTest('A',model,vec,'test')

Test set Evaluation report
              precision    recall  f1-score   support

           0       0.56      0.73      0.64       325
           1       0.86      0.76      0.80       684
           2       0.37      0.35      0.36       191

    accuracy                           0.68      1200
   macro avg       0.60      0.61      0.60      1200
weighted avg       0.70      0.68      0.69      1200

time: 316 ms (started: 2021-09-07 03:33:36 +00:00)


In [31]:
model,vec = BaselineModelTest('A','NB','TFIDF')

Naive Bayes with TfidfVectorizer


              precision    recall  f1-score   support

           0       0.57      1.00      0.73       561
           1       0.94      0.07      0.13       228
           2       1.00      0.01      0.03       207

    accuracy                           0.58       996
   macro avg       0.84      0.36      0.30       996
weighted avg       0.75      0.58      0.45       996

time: 2.11 s (started: 2021-09-07 03:33:41 +00:00)


In [32]:
BaselineEvaluateTest('A',model,vec)

Dev set Evaluation report
              precision    recall  f1-score   support

           0       0.59      1.00      0.74       578
           1       0.87      0.06      0.12       208
           2       1.00      0.02      0.05       211

    accuracy                           0.60       997
   macro avg       0.82      0.36      0.30       997
weighted avg       0.73      0.60      0.46       997

time: 255 ms (started: 2021-09-07 03:38:43 +00:00)


In [33]:
BaselineEvaluateTest('A',model,vec,'test')

Test set Evaluation report
              precision    recall  f1-score   support

           0       0.30      1.00      0.46       325
           1       0.99      0.17      0.29       684
           2       0.00      0.00      0.00       191

    accuracy                           0.37      1200
   macro avg       0.43      0.39      0.25      1200
weighted avg       0.65      0.37      0.29      1200

time: 376 ms (started: 2021-09-07 03:38:47 +00:00)


#### SVM Model for Task A

#### Using SVC

In [None]:
from sklearn.svm import SVC

time: 1.09 ms (started: 2021-09-04 19:07:05 +00:00)


In [None]:
model,vec = BaselineModelTest('A','SVC','CV')

SVM Classifier with CountVectorizer


              precision    recall  f1-score   support

           0       0.74      0.84      0.78       561
           1       0.65      0.56      0.60       228
           2       0.42      0.33      0.37       207

    accuracy                           0.67       996
   macro avg       0.60      0.58      0.59       996
weighted avg       0.65      0.67      0.66       996

time: 3.49 s (started: 2021-09-04 19:38:34 +00:00)


In [None]:
BaselineEvaluate('A',model,vec)

              precision    recall  f1-score   support

           0       0.75      0.85      0.79       578
           1       0.58      0.52      0.55       208
           2       0.44      0.32      0.37       211

    accuracy                           0.67       997
   macro avg       0.59      0.56      0.57       997
weighted avg       0.65      0.67      0.65       997

time: 474 ms (started: 2021-09-04 19:38:40 +00:00)


In [None]:
BaselineEvaluate('A',model,vec,'test')

              precision    recall  f1-score   support

           0       0.61      0.79      0.69       325
           1       0.90      0.72      0.80       684
           2       0.45      0.54      0.49       191

    accuracy                           0.71      1200
   macro avg       0.65      0.68      0.66      1200
weighted avg       0.75      0.71      0.72      1200

time: 662 ms (started: 2021-09-04 19:38:46 +00:00)


In [None]:
model,vec = BaselineModelTest('A','SVC','TFIDF')

SVM Classifier with TfidfVectorizer


              precision    recall  f1-score   support

           0       0.74      0.86      0.79       561
           1       0.66      0.53      0.59       228
           2       0.50      0.39      0.43       207

    accuracy                           0.69       996
   macro avg       0.63      0.59      0.61       996
weighted avg       0.67      0.69      0.67       996

time: 5.29 s (started: 2021-09-04 19:38:49 +00:00)


In [None]:
BaselineEvaluate('A',model,vec)

              precision    recall  f1-score   support

           0       0.74      0.87      0.80       578
           1       0.67      0.47      0.55       208
           2       0.46      0.36      0.40       211

    accuracy                           0.68       997
   macro avg       0.62      0.57      0.58       997
weighted avg       0.66      0.68      0.66       997

time: 774 ms (started: 2021-09-04 19:38:58 +00:00)


In [None]:
BaselineEvaluate('A',model,vec,'test')

              precision    recall  f1-score   support

           0       0.62      0.78      0.69       325
           1       0.89      0.74      0.81       684
           2       0.44      0.52      0.48       191

    accuracy                           0.71      1200
   macro avg       0.65      0.68      0.66      1200
weighted avg       0.75      0.71      0.72      1200

time: 1.05 s (started: 2021-09-04 19:39:02 +00:00)


#### Using SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

time: 1.29 ms (started: 2021-09-04 18:59:15 +00:00)


In [None]:
sgd = SGDClassifier(loss='hinge',random_state=1)

time: 1.11 ms (started: 2021-09-04 19:02:45 +00:00)


In [None]:
sgd.fit()

In [35]:
train = pd.read_csv('trac2_hin_train.csv')
train.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C4.131,Bollywood film dekhne ke samay logic ghar mein...,NAG,NGEN
1,C4.638,Chutiya movie...,NAG,NGEN
2,C38.598,Us jaat bnde ka khene ka matlab tha mar daluga...,OAG,NGEN
3,C4.2101.1,@Feminism Is CANCER *un feminist yeh sahi hai ...,OAG,NGEN
4,C29.14.2,Amrit Anand अब तो जुड़े ही है उनको बोलो जुड़ने,NAG,NGEN


time: 40.5 ms (started: 2021-09-07 03:47:53 +00:00)


In [36]:
task_a_labels,task_b_labels = labelencoder()

time: 1.91 ms (started: 2021-09-07 03:47:55 +00:00)


In [37]:
train = train.replace({'Sub-task A':task_a_labels})

time: 9.07 ms (started: 2021-09-07 03:47:57 +00:00)


In [38]:
train.head()

Unnamed: 0,ID,Text,Sub-task A,Sub-task B
0,C4.131,Bollywood film dekhne ke samay logic ghar mein...,0,NGEN
1,C4.638,Chutiya movie...,0,NGEN
2,C38.598,Us jaat bnde ka khene ka matlab tha mar daluga...,1,NGEN
3,C4.2101.1,@Feminism Is CANCER *un feminist yeh sahi hai ...,1,NGEN
4,C29.14.2,Amrit Anand अब तो जुड़े ही है उनको बोलो जुड़ने,0,NGEN


time: 24.6 ms (started: 2021-09-07 03:47:58 +00:00)


In [39]:
dev = pd.read_csv('trac2_hin_dev.csv')
dev.head()

dev = dev.replace({'Sub-task A':task_a_labels})

time: 21.2 ms (started: 2021-09-07 03:48:00 +00:00)


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC

# Baseline SVC pipeline for Task A: default CountVectorizer features feeding a
# linear-kernel SVC. The step names 'vect' and 'clf' are the prefixes used by
# the grid-search parameter grid defined below.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
  
    ('clf',SVC(kernel='linear',random_state=42))
   
])

# Train on the Task A labels, which an earlier cell replaced with integer ids.
text_clf.fit(train.Text, train['Sub-task A'])

predicted = text_clf.predict(dev.Text)

# Per-class metrics on the dev set.
print(metrics.classification_report(dev['Sub-task A'],predicted))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       578
           1       0.59      0.54      0.56       208
           2       0.45      0.32      0.38       211

    accuracy                           0.67       997
   macro avg       0.60      0.57      0.58       997
weighted avg       0.65      0.67      0.66       997

time: 3.07 s (started: 2021-09-05 07:30:30 +00:00)


#### Grid Search on SVC

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the 'vect' + 'clf' pipeline. Keys follow the
# <step name>__<parameter> convention of sklearn pipelines.
# Two sub-grids are used so that gamma is only searched together with the RBF
# kernel; the linear kernel does not use it.
parameters = [
    dict(
        vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
        clf__kernel=['linear'],
        clf__C=[1, 2, 0.5, 3],
    ),
    dict(
        vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
        clf__kernel=['rbf'],
        clf__C=[1, 2, 0.5, 3],
        clf__gamma=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    ),
]

time: 3.67 ms (started: 2021-09-05 07:31:05 +00:00)


In [None]:
# 5-fold cross-validated grid search over `parameters`, ranked by weighted F1
# and run on all available cores (n_jobs=-1).
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)

time: 12.2 ms (started: 2021-09-05 07:31:29 +00:00)


In [None]:
# Every grid point is refit once per CV fold, which makes this the slow cell of
# the section (the recorded run took roughly 24 minutes).
gs_clf = gs_clf.fit(train['Text'], y=train['Sub-task A'])

time: 24min 4s (started: 2021-09-05 07:31:39 +00:00)


In [None]:
# Mean cross-validated score (weighted F1, per the `scoring` setting) of the best grid point.
gs_clf.best_score_

0.6508847204257624

time: 4.4 ms (started: 2021-09-05 08:20:17 +00:00)


In [None]:
# Parameter combination that achieved the best cross-validated score.
gs_clf.best_params_

{'clf__C': 1, 'clf__kernel': 'linear', 'vect__ngram_range': (1, 2)}

time: 4.67 ms (started: 2021-09-05 08:20:20 +00:00)


Pipeline with new params

In [None]:
# Re-create the Sub-task A label -> id mapping; the second return value of
# labelencoder() is not needed in this section, hence the throwaway `_`.
task_a_labels,_ = labelencoder()

time: 1.42 ms (started: 2021-09-06 06:28:18 +00:00)


In [40]:
# Test texts to predict on.
test = pd.read_csv('trac2_hin_test.csv')

# Gold labels for Sub-task A, encoded with the same label -> id mapping as the
# training data so they can be compared with the classifier's integer output.
test_labels = pd.read_csv('trac2_hin_gold_a.csv').replace({'Sub-task A':task_a_labels})

time: 19.2 ms (started: 2021-09-07 03:48:08 +00:00)


In [None]:
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Final Sub-task A model: uni+bigram TF-IDF features into a linear SVC.
# probability=True enables predict_proba on the fitted classifier.
# NOTE(review): the grid search earlier in this section was run on a
# CountVectorizer pipeline and reported C=1 as best; this cell uses
# TfidfVectorizer with C=2 - confirm that the difference is intentional.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', SVC(C=2, kernel='linear', probability=True, random_state=42)),
])

text_clf.fit(train['Text'], train['Sub-task A'])

# Report on the dev split first ...
predicted = text_clf.predict(dev['Text'])
print(metrics.classification_report(dev['Sub-task A'], predicted))

# ... then on the test split, scored against the gold labels.
test_preds = text_clf.predict(test['Text'])
print(metrics.classification_report(test_labels['Sub-task A'], test_preds))

              precision    recall  f1-score   support

           0       0.77      0.81      0.79       578
           1       0.63      0.59      0.61       208
           2       0.47      0.43      0.45       211

    accuracy                           0.68       997
   macro avg       0.62      0.61      0.62       997
weighted avg       0.68      0.68      0.68       997

              precision    recall  f1-score   support

           0       0.66      0.71      0.68       325
           1       0.89      0.78      0.83       684
           2       0.44      0.59      0.50       191

    accuracy                           0.73      1200
   macro avg       0.66      0.69      0.67      1200
weighted avg       0.76      0.73      0.74      1200

time: 21.2 s (started: 2021-09-06 09:45:49 +00:00)


#### Save the best model **SVC**

In [None]:
import pickle

# Persist the fitted Sub-task A pipeline (vectorizer + classifier) to disk.
modelfile = 'svcmodel_task_a.pkl'
# A context manager guarantees the file is flushed and closed once the dump
# finishes; a bare open() passed to pickle.dump is never closed explicitly.
with open(modelfile, 'wb') as model_fh:
    pickle.dump(text_clf, model_fh)


time: 210 ms (started: 2021-09-06 09:46:15 +00:00)


In [None]:
# Inspect the test frame: it carries only the ID and Text columns (no labels).
test

Unnamed: 0,ID,Text
0,C52.17,ko
1,C52.39,ladkiyon video
2,C52.73,ki video gahrep
3,C60.3,o sadharon video bhai
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂
...,...,...
1195,C8.5029,aree bhenchod chup ho lodu aurt
1196,C8.5037,abe saali bharwe itni gaand kyun fati hui he t...
1197,C8.5046,chachi ji... usne ek mara lekin lerki ne jo is...
1198,C8.5047,sun oye bhenkilodi itnaa maarunga saali tod du...


time: 22.4 ms (started: 2021-09-06 09:20:23 +00:00)


#### Save predictions to csv file


In [None]:
# Wrap the integer test-set predictions in a one-column frame named 'Label'.
svm_preds_a = pd.DataFrame({'Label': test_preds})

time: 1.71 ms (started: 2021-09-06 09:54:29 +00:00)


In [None]:
# Attach the comment IDs. Column assignment aligns on the index; both frames
# were created without an explicit index, so rows pair up by position.
svm_preds_a['ID'] = test['ID']

time: 2.71 ms (started: 2021-09-06 09:54:31 +00:00)


In [None]:
# Preview the prediction frame: integer-coded Label plus the matching ID.
svm_preds_a.head()

Unnamed: 0,Label,ID
0,1,C52.17
1,0,C52.39
2,0,C52.73
3,0,C60.3
4,0,C60.43


time: 18 ms (started: 2021-09-06 09:54:46 +00:00)


In [None]:
# List the distinct class ids that occur among the predictions.
svm_preds_a['Label'].unique()

array([1, 0, 2])

time: 5.47 ms (started: 2021-09-06 09:54:58 +00:00)


In [None]:
# Show the label -> id mapping as a reference for decoding the predictions.
task_a_labels

{'CAG': 2, 'NAG': 0, 'OAG': 1}

time: 3.2 ms (started: 2021-09-06 09:55:02 +00:00)


In [None]:
# id -> label mapping used to decode predictions. It is derived from
# task_a_labels instead of being typed by hand, so the two mappings cannot
# drift apart if the encoding ever changes.
task_a_id2labels = {label_id: label for label, label_id in task_a_labels.items()}

time: 1.03 ms (started: 2021-09-06 09:55:07 +00:00)


In [None]:
# Decode the integer predictions back to their string labels; the nested dict
# limits the replacement to the 'Label' column, so 'ID' is never touched.
svm_preds_a = svm_preds_a.replace({'Label':task_a_id2labels})

time: 5.1 ms (started: 2021-09-06 09:55:10 +00:00)


In [None]:
# Verify the decoding: only string labels should remain in the Label column.
svm_preds_a['Label'].unique()

array(['OAG', 'NAG', 'CAG'], dtype=object)

time: 3.68 ms (started: 2021-09-06 09:55:13 +00:00)


In [None]:
# Write the decoded Sub-task A predictions to disk.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default; pass index=False if the consumer expects only Label and ID - confirm
# the required file format before changing this.
svm_preds_a.to_csv('trac2_hin_SVC_preds_a.csv')

time: 5.54 ms (started: 2021-09-06 09:55:41 +00:00)


### Baseline models after text cleaning (removing punctuation and stopwords)

In [None]:
!pip install --q lingualytics

[K     |████████████████████████████████| 2.8 MB 11.1 MB/s 
[K     |████████████████████████████████| 241 kB 62.6 MB/s 
[K     |████████████████████████████████| 1.5 MB 34.4 MB/s 
[K     |████████████████████████████████| 895 kB 60.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 47.5 MB/s 
[K     |████████████████████████████████| 636 kB 56.9 MB/s 
[K     |████████████████████████████████| 50 kB 6.9 MB/s 
[?25htime: 8.56 s (started: 2021-09-05 03:54:25 +00:00)


In [None]:
from lingualytics.preprocessing import remove_lessthan,remove_punctuation,remove_stopwords
from lingualytics.stopwords import hinglish_stopwords,hi_stopwords
import string,re

time: 1.37 ms (started: 2021-09-05 04:03:50 +00:00)


In [None]:
def clean_text(text):

  """
  util function to clean the text
  ------------------------------
  Steps, in order: lower-case, strip @mentions, strip URLs, strip
  HTML-style <...> tags, strip ASCII punctuation.

  Parameters-
  text : input text to preprocess. Non-string values are converted with
         str(); None and float NaN (the missing-value marker pandas uses)
         are treated as empty text.

  Return-
  text : cleaned text (str). Whitespace is not collapsed, so a removed
         token can leave consecutive spaces behind.
  """

  # missing values would otherwise turn into the literal tokens 'none' / 'nan'
  if text is None or (isinstance(text, float) and text != text):
    return ''

  # lower case the text
  text = str(text).lower()
  # remove twitter user names
  # (raw strings so that \w, \S and \. reach the regex engine unchanged)
  text = re.sub(r'@\w+', '', text)
  # remove hyperlinks
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  # remove HTML-style tags such as <br> or </b>; the earlier pattern '<.,*?>+'
  # only matched tags with a single character between the brackets
  text = re.sub(r'<.*?>+', '', text)
  # remove ASCII punctuation (deleted outright, not replaced by a space)
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

  return text

time: 4.81 ms (started: 2021-09-05 03:57:49 +00:00)


In [None]:
def preprocess_text(text):

  """
  Util function to clean a text and drop its stopwords

  Parameters:
          text(str): raw input text

  Returns:
          text(str): output of clean_text with every token found in
                     hinglish_stopwords removed; the remaining tokens
                     are joined by single spaces
  """

  kept_tokens = []
  for word in clean_text(text).split():
    if word in hinglish_stopwords:
      continue
    kept_tokens.append(word)
  return ' '.join(kept_tokens)

time: 2.07 ms (started: 2021-09-05 03:58:06 +00:00)


In [None]:
# Run every (task, model, vectorizer) baseline combination: 2 x 3 x 2 = 12 runs.
# BaselineModelTest and BaselineEvaluateTest are defined elsewhere in the
# notebook. Judging by the printed output, the default call reports on the dev
# set and eval='test' reports on the test set - TODO confirm in their definitions.
for task in ['A','B']:
  print("------Task " + task + "----------")
  for Model in ['LR','NB','SVC']:
    for Vec in ['CV','TFIDF']:
      # presumably returns the fitted model and its fitted vectorizer
      model,vec = BaselineModelTest(task,Model,Vec)

      BaselineEvaluateTest(task,model,vec)
      BaselineEvaluateTest(task,model,vec,eval='test')

------Task A----------
Logistic Regression with CountVectorizer


              precision    recall  f1-score   support

           0       0.73      0.86      0.79       561
           1       0.65      0.55      0.60       228
           2       0.49      0.34      0.40       207

    accuracy                           0.68       996
   macro avg       0.63      0.59      0.60       996
weighted avg       0.67      0.68      0.67       996

Dev set Evaluation report
              precision    recall  f1-score   support

           0       0.67      0.92      0.78       578
           1       0.65      0.40      0.49       208
           2       0.49      0.17      0.25       211

    accuracy                           0.65       997
   macro avg       0.60      0.50      0.51       997
weighted avg       0.63      0.65      0.61       997

Test set Evaluation report
              precision    recall  f1-score   support

           0       0.47      0.92      0.62       325
          