In [1]:
# Mount Google Drive at /content/drive (Colab only); the data folder used below lives under it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Folder on the mounted drive that every data path below is built from (note the trailing slash)
main_path = "/content/drive/MyDrive/UdacityND/project_caps/"

In [3]:
# Choose which version of the csv files the notebook works on.
# Only one set of paths is assigned, so the choice is explicit instead of
# relying on a later assignment overwriting an earlier one.
USE_CLEAN_DATA = True

if USE_CLEAN_DATA:
  # cleaned data
  jigsaw_path = main_path+"jigsaw.csv"
  val_data_path = main_path+"val_data.csv"
  comments_to_score_path = main_path+"new_comments.csv"
  ruddit_path = main_path+"ruddit.csv"
  new_data_path = main_path+"new_toxic.csv"
else:
  # unclean (raw) data
  jigsaw_path = main_path+"train.csv"
  val_data_path = main_path+"validation_data.csv"
  comments_to_score_path = main_path+"comments_to_score.csv"
  ruddit_path = main_path+"ruddit_with_text.csv"
  new_data_path = main_path+"train_data_version2.csv"

In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import random
random.seed(123)
import sys
from scipy.stats import rankdata
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
import pickle
import joblib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
#Read in the data # ref: https://www.kaggle.com/andrej0marinchenko/best-score-0-856-jigsaw-for-beginners
def read_jigsaw_toxic_data(train_df_path,  fold_num = 0, use_folds = False):
  '''
  Read in the toxic jigsaw data and balance toxic against non toxic comments

  Rows with missing values are dropped. The rows with y > 0 are kept (all of
  them, or a random 80% when folds are used) and a random sample of the rows
  with y == 0 is added, sized after the number of rows labelled toxic.
  A histogram of y for the balanced data is drawn on a new figure.

  Inputs:
    train_df_path -> path of the csv file to read, it needs the columns comment_text, toxic_vs_not and y
    fold_num -> index of the current fold, only used to derive the random seed 10*(fold_num+1)
    use_folds -> whether to subsample 80% of the data for this fold or to use all of it
  Output:
    balanced_df -> a balanced df that has 3 columns, comment_text is the text of the comment, toxic_vs_not whether the comment is toxic or not,
  y the score of the toxicity calculated
  '''
  df = pd.read_csv(train_df_path).dropna()
  toxic_df = df[df.y > 0]
  non_toxic_df = df[df.y == 0]
  # counting through a comparison gives 0 instead of a KeyError when no row is labelled toxic
  count_toxic = int((df['toxic_vs_not'] == 1).sum())

  seed = 10*(fold_num+1)
  keep_frac = 0.8 if use_folds else 1
  # sample() without replacement raises when asked for more rows than exist,
  # so never request more non toxic rows than the file holds
  n_non_toxic = min(int(keep_frac*count_toxic), len(non_toxic_df))

  balanced_df = pd.concat([toxic_df.sample(frac=keep_frac, random_state=seed),
                           non_toxic_df.sample(n_non_toxic, random_state=seed)], axis=0)

  # draw explicitly on the axes created here; binwidth takes precedence over a
  # bin count in seaborn, so only binwidth is passed
  fig2, ax2 = plt.subplots()
  sns.histplot(data=balanced_df, x='y', binwidth=0.08, ax=ax2)

  return balanced_df[['comment_text','toxic_vs_not','y']]
  

In [6]:
# ref: https://www.kaggle.com/andrej0marinchenko/best-score-0-856-jigsaw-for-beginners
def read_ruddit_toxic_data(ruddit_path, fold_num = 0, use_folds = False):
  '''
  Load the ruddit toxicity data and plot a histogram of its scores

  Inputs:
    ruddit_path  ->  path of the csv file to read, it needs the columns comment_text and y
    fold_num -> index of the current fold, only used to derive the random seed 10*(fold_num+1)
    use_folds -> whether to keep a random 80% of the rows for this fold or all of them

  Output:
    a df with 2 columns, comment_text is the text of the comment,
  y the score of the toxicity calculated
  '''
  output_columns = ['comment_text','y']
  # rows with a missing value in any column are discarded before sampling
  ruddit_df = pd.read_csv(ruddit_path).dropna()
  if use_folds == True:
    fold_seed = 10*(fold_num+1)
    ruddit_df = ruddit_df.sample(frac=0.8, random_state=fold_seed)
  ruddit_df['y'].hist()
  return ruddit_df[output_columns]

In [7]:
def read_new_toxic_data(new_path, fold_num = 0, use_folds = False):
  '''
  Load the alternative jigsaw data and plot a histogram of its scores

  Inputs:
    new_path -> path of the csv file to read, it needs the columns comment_text and y
    fold_num -> index of the current fold, only used to derive the random seed 10*(fold_num+1)
    use_folds -> whether to keep a random 80% of the rows for this fold or all of them

  Output:
    a df with 2 columns, comment_text is the text of the comment,
  y the score of the toxicity calculated
  '''
  scored_comments = pd.read_csv(new_path)
  # any row with a missing value is removed
  scored_comments = scored_comments.dropna()
  if use_folds == True:
    scored_comments = scored_comments.sample(frac=0.8, random_state=10*(fold_num+1))
  scores = scored_comments['y']
  scores.hist()
  return scored_comments.loc[:, ['comment_text','y']]

In [8]:
def eval_on_val_data(val_data_path,  model):
  '''
  Loads validation data from val_data_path, where every row pairs a comment
  in the less_toxic column with a comment in the more_toxic column.
  The share of pairs that the model orders correctly can be used as an
  indication of how good our actual model will be in the real test case

  Inputs:
    val_data_path -> is the path of the validation data (csv with less_toxic and more_toxic columns)
    model -> is the model we use for prediction, it only needs a predict method

  Outputs:
    acc -> the fraction of pairs where the more toxic comment got a strictly higher score
    acc_ranks -> the same fraction computed on ordinal ranks of the scores, so that
    no two values are the same. Both sides of all pairs are ranked together in one pool,
    ties are broken by position and the less toxic comments come first in the pool
  '''
  # rows with a missing comment on either side are dropped
  all_val_data = pd.read_csv(val_data_path).dropna()
  less_toxic_scores = np.asarray(model.predict(all_val_data['less_toxic']))
  more_toxic_scores = np.asarray(model.predict(all_val_data['more_toxic']))

  # Rank every score in a single pool. Ranking each column on its own would
  # compare positions inside two unrelated lists and say nothing about
  # whether the more toxic comment of a pair outranks its partner.
  n_pairs = len(less_toxic_scores)
  pooled_ranks = rankdata(np.concatenate([less_toxic_scores, more_toxic_scores]), method='ordinal')
  less_toxic_scores_ranks = pooled_ranks[:n_pairs]
  more_toxic_scores_ranks = pooled_ranks[n_pairs:]

  acc = np.mean(more_toxic_scores > less_toxic_scores)
  acc_ranks = np.mean(more_toxic_scores_ranks > less_toxic_scores_ranks)

  return acc, acc_ranks

In [9]:
def train_model(model, n_folds, data_path, val_data_path, dataname):
  '''
  Train the model on the data in data_path

  The data is re-read and the model re-fitted once per fold; with more than
  one fold every read subsamples the data (use_folds=True), with a single
  fold all of the data is used. The returned model is the one fitted on the last fold.

  Inputs:
    model -> the model or pipeline to be trained
    n_folds -> the number of folds to use, has to be at least 1
    data_path -> the path of the data to load
    val_data_path -> the path of the validation pairs passed to eval_on_val_data
    dataname -> the name of the data, one of 'jigsaw', 'new_data' or 'ruddit'
  Output:
    a tuple with the average model.score on the training data across folds
    (R^2 for the Ridge pipelines used in this notebook), the average acc and
    the average acc_ranks on the validation data, and the fitted model
  Raises:
    ValueError if dataname is not known or n_folds is smaller than 1
  '''
  known_datanames = ('jigsaw', 'new_data', 'ruddit')
  # fail before any training instead of fitting on a missing or stale dataframe
  if dataname not in known_datanames:
    raise ValueError("Wrong data name: "+repr(dataname)+", expected one of "+str(known_datanames))
  if n_folds < 1:
    raise ValueError("n_folds must be at least 1, got "+str(n_folds))

  use_folds = n_folds > 1
  model_scores = []
  accs = []
  accs_ranks = []
  for i in range(n_folds):
    if dataname == 'jigsaw':
      df = read_jigsaw_toxic_data(data_path, i, use_folds)
    elif dataname == 'new_data':
      df = read_new_toxic_data(data_path, i, use_folds)
    else:
      df = read_ruddit_toxic_data(data_path, i, use_folds)

    model.fit(df['comment_text'], df['y'])
    model_scores.append(model.score(df['comment_text'], df['y']))
    acc, acc_ranks = eval_on_val_data(val_data_path,  model)
    accs.append(acc)
    accs_ranks.append(acc_ranks)
  return sum(model_scores)/n_folds, sum(accs)/n_folds, sum(accs_ranks)/n_folds, model

In [10]:
def train_pipeline(train_data_path, val_data_path, dataname, n_folds):
  '''
  Basically performing a manual gridsearch, but it allows me more flexibility when choosing the evaluation method

  Every combination of tfidf analyzer, ngram range and ridge alpha is trained
  with train_model and the combination with the highest average validation
  accuracy is kept. One line is printed per combination.

  Inputs:
     train_data_path -> the path of the data to train on
     val_data_path -> the path of the validation data
     dataname   -> the name of the data
     n_folds  ->  the number of folds

  Outputs:
    best_val_acc -> a float indicating the best validation accuracy
    best_ngram -> a tuple of ngram values that corresponded to the best_val_acc
    best_analyzer -> a str of the best analyzer that corresponds to the best_val_acc
    best_alpha -> a float indicating the alpha value that corresponds to the best_val_acc
  '''
  analyzer_values = ['word','char_wb']
  ngram_ranges = [(1,1),(3,5)]
  alphas = [0.01, 0.1, 1, 10, 100]
  best_val_acc = 0
  # start from None so the return below never hits an unbound name
  best_ngram = None
  best_analyzer = None
  best_alpha = None

  for analyzer in analyzer_values:
    for ngram_range in ngram_ranges:

      # for the char_wb analyzer the (1,1) range is swapped for (5,10)
      if(ngram_range==(1,1) and analyzer=='char_wb'):
        ngram_range = (5,10)

      for alpha in alphas:
        pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(min_df = 10, analyzer=analyzer, ngram_range=ngram_range)),
                ('clf', Ridge(random_state=42, alpha=alpha))
        ])
        # the fitted model is not kept, only the parameters are reported
        avg_train_acc, avg_val_acc, avg_val_acc_ranks, _ = train_model(pipeline, n_folds, train_data_path, val_data_path, dataname)
        print("For analyzer = "+analyzer+ " ngram range is "+str(ngram_range)+" alpha is "+str(alpha)+" training r2: "+str(avg_train_acc)+ ", val_acc: "+str(avg_val_acc)+" val ranks acc: "+str(avg_val_acc_ranks))
        # the first combination is always recorded, later ones only if strictly better
        if(best_ngram is None or avg_val_acc > best_val_acc):
          best_val_acc = avg_val_acc
          best_ngram = ngram_range
          best_analyzer = analyzer
          best_alpha = alpha

  return best_val_acc, best_ngram, best_analyzer, best_alpha

In [None]:
# Grid search on the jigsaw data with 3 folds; the result is (best_val_acc, best_ngram, best_analyzer, best_alpha)
best_ridge_jigsaw = train_pipeline(jigsaw_path, val_data_path,  "jigsaw", 3)

In [None]:
# Display the best validation accuracy and parameters found on the jigsaw data
best_ridge_jigsaw

In [None]:
# Grid search on the ruddit data with 3 folds; the result is (best_val_acc, best_ngram, best_analyzer, best_alpha)
best_ridge_ruddit = train_pipeline(ruddit_path, val_data_path,  "ruddit", 3)

In [None]:
# Display the best validation accuracy and parameters found on the ruddit data
best_ridge_ruddit

In [None]:
# Grid search on the alternative jigsaw data with 3 folds; the result is (best_val_acc, best_ngram, best_analyzer, best_alpha)
best_ridge_new_data = train_pipeline(new_data_path, val_data_path,  "new_data", 3)

In [None]:
# Display the best validation accuracy and parameters found on the alternative jigsaw data
best_ridge_new_data

In [None]:
def train_final(train_data_path, val_data_path, dataname, n_folds, analyzer, ngram_range, alpha):
  '''
  Build a tfidf + ridge pipeline from parameters found in earlier runs and train it with train_model

  Inputs:
     train_data_path -> the path of the data to train on
     val_data_path -> the path of the validation data
     dataname   -> the name of the data
     n_folds  ->  the number of folds, train_model uses all of the data when this is 1
     analyzer -> the type of tfidf analyzer to use
     ngram_range -> the ngram range to use with the tfidf analyzer
     alpha -> the regularization value to use with the ridge regression

  Outputs:
    model -> The final trained model

  '''
  tfidf_step = TfidfVectorizer(min_df = 10, analyzer=analyzer, ngram_range=ngram_range)
  ridge_step = Ridge(random_state=42, alpha=alpha)
  final_pipeline = Pipeline([('tfidf', tfidf_step), ('clf', ridge_step)])

  # train_model returns (train score, val acc, val ranks acc, model), only the model is of interest here
  training_result = train_model(final_pipeline, n_folds, train_data_path, val_data_path, dataname)
  return training_result[3]

In [None]:
def predict_on_new_comments(path_to_comments, model_jigsaw, model_ruddit, model_new_data, ws, output_path = 'submission.csv'):
  '''
  Use models trained on different datasets to obtain the final prediction for the dataset that we need to score
  in the kaggle competition. The weighted sum of the three model predictions is written
  to a csv file with the columns comment_id and score, nothing is returned

  Inputs:
     path_to_comments -> the path of the new comments that we need to score (csv with comment_id and text columns)
     model_jigsaw -> the trained model on jigsaw data
     model_ruddit  -> the trained model on ruddit data
     model_new_data   -> the trained model on alternative jigsaw data
     ws  ->  the weights of each model, in the order jigsaw, ruddit, new data
     output_path -> where the csv with the scores is written, defaults to submission.csv in the working directory

  '''
  new_data = pd.read_csv(path_to_comments)
  texts = new_data['text']
  scores = (ws[0]*model_jigsaw.predict(texts)
            + ws[1]*model_ruddit.predict(texts)
            + ws[2]*model_new_data.predict(texts))
  submission = pd.DataFrame({'comment_id': new_data['comment_id'].to_numpy(), 'score': scores})
  submission.to_csv(output_path, index=False)

In [None]:
# ref: https://stackoverflow.com/questions/5563808/how-to-generate-three-random-numbers-whose-sum-is-1
# ref: https://www.kaggle.com/sytuannguyen/overfitting-lb-is-easier-than-solving-the-problem
# ref: https://www.kaggle.com/andrej0marinchenko/best-score-0-856-jigsaw-for-beginners

def _score_val_pairs(model, less_toxic_data, more_toxic_data):
  '''
  Score both sides of the validation pairs with one model

  Output:
    (less_scores, more_scores, less_ranks, more_ranks) where the ranks are ordinal
    ranks over the pooled less+more scores, so both sides are on the same scale
  '''
  less_scores = np.asarray(model.predict(less_toxic_data))
  more_scores = np.asarray(model.predict(more_toxic_data))
  n_pairs = len(less_scores)
  pooled_ranks = rankdata(np.concatenate([less_scores, more_scores]), method='ordinal')
  return less_scores, more_scores, pooled_ranks[:n_pairs], pooled_ranks[n_pairs:]


def ensemble_all_data(best_ruddit, best_new_data, best_jigsaw, n_trials = 500):
  '''
  Train models that has best val results on all their data using the best parameters
  combine scores to get better results

  One model per dataset is trained with train_final (a single fold), then random
  weight triples that sum to 1 are tried and the triple with the best accuracy on
  the validation pairs is used by predict_on_new_comments to score the competition comments.
  The data paths are read from the notebook level variables jigsaw_path, ruddit_path,
  new_data_path, val_data_path and comments_to_score_path. The weights are drawn with
  the random module, so they depend on the state of its seed.

  Inputs:
    best_ruddit -> the best parameters that were obtained from training on ruddit
    best_new_data -> the best parameters that were obtained from training on alt jigsaw
    best_jigsaw -> the best parameters that were obtained from training on jigsaw
    n_trials -> how many random weight triples to try
    each parameter tuple is (best_val_acc, ngram_range, analyzer, alpha) as returned by train_pipeline
  '''
  best_val_acc, ngram_range, analyzer, alpha = best_jigsaw
  model_jigsaw =  train_final(jigsaw_path, val_data_path,  "jigsaw", 1, analyzer, ngram_range, alpha)
  best_val_acc, ngram_range, analyzer, alpha = best_ruddit
  model_ruddit =  train_final(ruddit_path, val_data_path,  "ruddit", 1, analyzer, ngram_range, alpha)
  best_val_acc, ngram_range, analyzer, alpha = best_new_data
  model_new_data =  train_final(new_data_path, val_data_path,  "new_data", 1, analyzer, ngram_range, alpha)

  # drop incomplete pairs, the same way eval_on_val_data does
  all_val_data = pd.read_csv(val_data_path).dropna()
  less_toxic_data = all_val_data['less_toxic']
  more_toxic_data = all_val_data['more_toxic']

  # order matches the weights (w1, w2, w3) below: jigsaw, ruddit, new data
  per_model = [_score_val_pairs(m, less_toxic_data, more_toxic_data)
               for m in (model_jigsaw, model_ruddit, model_new_data)]

  # start below any reachable accuracy so the first trial is always recorded
  best_acc = -1.0
  best_acc_ranks = -1.0
  best_ws_acc = None
  best_ws_acc_ranks = None

  for i in range(0, n_trials):
    # split [0, 1] at two random points x <= a <= y, the three pieces are the weights
    a = random.uniform(0, 1)
    x = random.uniform(0, a)
    y = random.uniform(a, 1)
    w1 = x
    w2 = y-x
    w3 = 1-y
    ws = (w1, w2, w3)

    less_toxic_scores = sum(w*scored[0] for w, scored in zip(ws, per_model))
    more_toxic_scores = sum(w*scored[1] for w, scored in zip(ws, per_model))
    less_toxic_scores_ranks = sum(w*scored[2] for w, scored in zip(ws, per_model))
    more_toxic_scores_ranks = sum(w*scored[3] for w, scored in zip(ws, per_model))

    acc = np.mean(more_toxic_scores > less_toxic_scores)
    acc_ranks = np.mean(more_toxic_scores_ranks > less_toxic_scores_ranks)
    if(acc>best_acc):
      best_acc = acc
      best_ws_acc = ws

    if(acc_ranks>best_acc_ranks):
      best_acc_ranks = acc_ranks
      best_ws_acc_ranks = ws

    print("For w1 = "+str(w1)+" w2 is "+str(w2)+" w3 is "+str(w3)+ " val acc is: "+str(acc)+" val acc ranks is: "+str(acc_ranks))

  if best_ws_acc is None:
    raise ValueError("n_trials must be at least 1 to pick ensemble weights")
  predict_on_new_comments(comments_to_score_path, model_jigsaw, model_ruddit, model_new_data, best_ws_acc)
  print("Best accuracy on val data was "+str(best_acc))
  print("This correponded to these ws")
  print(best_ws_acc)
  #print(best_acc_ranks)
  #print(best_ws_acc_ranks)

In [None]:
# To skip the grid searches above, uncomment the lines below; each tuple is (best_val_acc, ngram_range, analyzer, alpha)
#best_ridge_new_data = (0.6715048049244942, (1, 1), 'word', 1)
#best_ridge_ruddit = (0.6238541251494619, (3, 5), 'char_wb', 0.1)
#best_ridge_jigsaw = (0.6795868207785306, (3, 5), 'char_wb', 1)

In [None]:
# argument order is (best_ruddit, best_new_data, best_jigsaw); the ruddit model
# has to be trained with the parameters found on the ruddit data
ensemble_all_data(best_ridge_ruddit, best_ridge_new_data, best_ridge_jigsaw)