In [None]:
import numpy as np
import os
import pickle
import pandas as pd

In [None]:
student_id = 2310618 # Note: this is an integer; replace it with your own student ID

In [None]:
# Seed the random number generator so that runs are repeatable.
# NOTE(review): only NumPy's global RNG is seeded in this cell; no other library is seeded here.

# NumPy seed (the student ID is used as the seed value)
np.random.seed(student_id)

In [None]:
# Mount Google Drive at /content/gdrive; force_remount=True is passed so an existing mount is replaced
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Common Codes

In [None]:
# Google Drive locations of the course folder, the dataset split and the model artefacts.

GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = './CE807-24-SP/Assignment/Lab10/'
GOOGLE_DRIVE_PATH = os.path.join('gdrive', 'MyDrive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print('List files: ', os.listdir(GOOGLE_DRIVE_PATH))

# Dataset folder: the final path component must be the last digit of your student registration number
DATA_PATH = os.path.join(GOOGLE_DRIVE_PATH, 'data', '8')
train_file, val_file, test_file = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ('train.csv', 'valid.csv', 'test.csv')
)
print('Train file: ', train_file)
print('Validation file: ', val_file)
print('Test file: ', test_file)

# Model folder: one sub-directory per student registration number
MODEL_PATH = os.path.join(GOOGLE_DRIVE_PATH, 'model', str(student_id))

# Generative model: directory and the zip archive that sits next to it
MODEL_Gen_DIRECTORY = os.path.join(MODEL_PATH, 'Model_Gen')
MODEL_Gen_File = f'{MODEL_Gen_DIRECTORY}.zip'
print('Model Generative directory: ', MODEL_Gen_DIRECTORY)

# Discriminative model: directory and the zip archive that sits next to it
MODEL_Dis_DIRECTORY = os.path.join(MODEL_PATH, 'Model_Dis')
MODEL_Dis_File = f'{MODEL_Dis_DIRECTORY}.zip'
print('Model Discriminative directory: ', MODEL_Dis_DIRECTORY)


In [None]:
# Load the training split and preview its first rows (the last expression is the cell output)
train_df = pd.read_csv(train_file)
train_df.head()

In [None]:
import matplotlib.pyplot as plt
# Class balance: the 'toxicity' column is summed to count toxic rows
# (assumes labels are 0/1 -- TODO confirm); the remainder are counted as non-toxic
num_tox_comm = train_df['toxicity'].sum()
num_non_tox_comm = len(train_df) - num_tox_comm

# Pie-chart inputs (the first wedge, 'Toxic', is pulled out slightly)
labels = ['Toxic', 'Non-toxic']
sizes = [num_tox_comm, num_non_tox_comm]
colors = ['red', 'green']
explode = (0.1, 0)

# Draw on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Toxic Comments in the Dataset')
ax.axis('equal')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

def compute_performance(y_true, y_pred):
    """
    Compute and print classification metrics for a set of predictions.

    The metrics are Accuracy, Recall, Precision, F1 score and the Confusion
    Matrix. All of them are computed first and then printed, one per line.

    Args:
        y_true: numpy array or list of reference labels
        y_pred: numpy array or list of predicted labels

    Returns:
        tuple: (Accuracy, F1 score, Precision, Recall, Confusion Matrix)
    """
    # (printed caption, scoring function) in the order the values are reported
    scorers = (
        ("Accuracy Score: ", accuracy_score),
        ("Recall Score: ", recall_score),
        ("Precision Score: ", precision_score),
        ("F1 Score: ", f1_score),
        ("Confusion Matrix: ", confusion_matrix),
    )

    # Evaluate every metric before printing anything
    results = {caption: scorer(y_true, y_pred) for caption, scorer in scorers}

    for caption, value in results.items():
        print(caption, value)

    # The return order differs from the print order: accuracy, F1, precision, recall, matrix
    return (
        results["Accuracy Score: "],
        results["F1 Score: "],
        results["Precision Score: "],
        results["Recall Score: "],
        results["Confusion Matrix: "],
    )


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Fetch the NLTK resources used by the cleaning function below: the stop-word lists and WordNet
nltk.download('stopwords')
nltk.download('wordnet')

def data_commentcleaning(x):
    """
    Load a CSV file and normalise the text in its 'comment' column.

    Each comment is lowercased, has every non-alphanumeric character replaced
    by a space, loses its English stop words, is tokenized, lemmatized
    (WordNet) and stemmed (Snowball), and is finally re-joined with commas.

    Args:
        x: Path (or anything accepted by ``pd.read_csv``) of a CSV file that
           has a 'comment' column.

    Returns:
        DataFrame read from ``x`` whose 'comment' column holds the cleaned,
        comma-joined tokens. Other columns are left as read; no rows are dropped.
    """
    # Read the input file
    train_data = pd.read_csv(x)

    # Build the text tools once instead of once per pipeline stage
    stop_cleancomment = frozenset(stopwords.words('english'))  # set: O(1) membership test
    tokenizer_cleancomment = RegexpTokenizer(r"[\w']+")  # raw string: "\w" is not a valid str escape
    lemmatizer_cleanedcolumn = nltk.stem.WordNetLemmatizer()
    stemming_cleancomment = SnowballStemmer("english")

    def _clean_comment(text):
        """Apply the whole cleaning sequence to one comment string."""
        # Lowercase, then blank out everything that is not a letter or digit
        text = re.sub("[^A-Za-z0-9]", ' ', text.lower())
        # Drop stop words
        text = ' '.join(word for word in text.split() if word not in stop_cleancomment)
        # Tokenize, then lemmatize and stem every token
        tokens = tokenizer_cleancomment.tokenize(text)
        tokens = [stemming_cleancomment.stem(lemmatizer_cleanedcolumn.lemmatize(tok)) for tok in tokens]
        # Join the list of words back into a single string
        return ",".join(tokens)

    # Missing comments become empty strings so a NaN cell cannot crash re.sub
    train_data['comment'] = train_data['comment'].fillna('').astype(str).map(_clean_comment)

    return train_data


In [None]:
def save_model(model,model_dir):
    """
    Pickle ``model`` to ``<model_dir>/model.sav``, creating the directory if needed.

    Args:
        model: Any picklable object (here: the trained estimator).
        model_dir: Directory in which 'model.sav' is written.

    Returns:
        Path of the written file.
    """
    # Note you might have to modify this based on your requirement

    if not os.path.exists(model_dir):
        # Create the directory if it doesn't exist
        os.makedirs(model_dir, exist_ok=True)
        print(f"Directory '{model_dir}' created successfully.")
    else:
        print(f"Directory '{model_dir}' already exists.")

    model_file = os.path.join(model_dir, 'model.sav')
    # Context manager guarantees the file is flushed and closed even if pickling fails
    with open(model_file, 'wb') as handle:
        pickle.dump(model, handle)

    print('Saved model to ', model_file)

    return model_file

def load_model(model_file):
    """
    Load a pickled object from ``model_file``.

    Args:
        model_file: Path of a file written with ``pickle.dump``.

    Returns:
        The unpickled object.
    """
    # Note you might have to modify this based on your requirement

    # SECURITY: unpickling executes code embedded in the file -- only load
    # model files that come from a source you trust.
    with open(model_file, 'rb') as handle:  # closed deterministically, unlike a bare open()
        model = pickle.load(handle)

    print('Loaded model from ', model_file)

    return model

# Let's download GDrive Link into a directory

In [None]:
import requests

def extract_file_id_from_url(url):
    """
    Extract the Google Drive file ID from a Drive or Docs URL.

    Recognised forms are ``.../d/<ID>/...`` (for example ``/file/d/<ID>/view``)
    and ``...?id=<ID>`` (for example ``uc?id=<ID>`` or ``open?id=<ID>``). URLs
    matching neither form fall back to the positional path-segment lookup.

    Args:
        url: The URL to parse.

    Returns:
        The file ID string, or None when ``url`` is not a string, is not a
        Google Drive/Docs URL, or has too few path segments.
    """
    import re  # local import so this cell does not depend on an import made in another cell

    if not isinstance(url, str):
        return None

    is_drive_url = 'drive.google.com' in url
    if not is_drive_url and 'https://docs.google.com' not in url:
        return None

    # Preferred: the ID that follows a '/d/' path component
    match = re.search(r'/d/([A-Za-z0-9_-]+)', url)
    if match is None:
        # Otherwise: the 'id' query parameter
        match = re.search(r'[?&]id=([A-Za-z0-9_-]+)', url)
    if match is not None:
        return match.group(1)

    # Fallback: positional lookup for any other URL shape
    segments = url.split('/')
    if is_drive_url:
        return segments[-2] if len(segments) >= 2 else None
    return segments[-1]

def download_file_from_drive(file_id, file_path):
    """
    Download a Google Drive file by ID and write it to ``file_path``.

    Args:
        file_id: Google Drive file ID.
        file_path: Local path the downloaded bytes are written to.

    Returns:
        True when the file was written, False when the download was rejected.
        Nothing is written in the failure case.
    """
    # Construct the download URL
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Download the file; the timeout keeps a stalled connection from hanging the notebook
    response = requests.get(download_url, timeout=60)

    # An HTML body is a web page, not the binary file. Status 200 alone is
    # therefore not proof of success.
    # NOTE(review): Drive typically sends such a page for files that are not
    # publicly shared or that need a download confirmation -- confirm.
    content_type = response.headers.get('Content-Type', '').lower()
    if response.status_code == 200 and 'text/html' not in content_type:
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print("File downloaded successfully!",file_path)
        return True

    print("Failed to download the file.",
          f"(status {response.status_code}, content type '{content_type}')")
    return False

def download_zip_file_from_link(file_url,file_path):
    """
    Download the Google Drive file behind ``file_url`` into ``file_path``.

    Args:
        file_url: Google Drive URL of the file.
        file_path: Local path to write the download to.
    """
    drive_id = extract_file_id_from_url(file_url)

    # Guard clause: nothing to download when no ID could be extracted
    if not drive_id:
        print("Invalid Google Drive URL.")
        return

    download_file_from_drive(drive_id, file_path)

# Zip and Unzip a GDrive File

In [None]:
import zipfile
import shutil
import os

# Function to zip a directory
def zip_directory(directory, zip_filename):
    """
    Write every file under ``directory`` into a deflate-compressed zip archive.

    Member names are taken relative to the parent of ``directory``, so each
    entry starts with the directory's own name.

    Args:
        directory: Directory tree to archive.
        zip_filename: Path of the zip file to create.
    """
    parent_dir = os.path.join(directory, '..')
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subfolders, filenames in os.walk(directory):
            for filename in filenames:
                full_path = os.path.join(folder, filename)
                archive.write(full_path, os.path.relpath(full_path, parent_dir))
        print('Created a zip file',zip_filename)

# Function to unzip a zip file
def unzip_file(zip_filename, extract_dir):
    """
    Extract every member of ``zip_filename`` into ``extract_dir``.

    Args:
        zip_filename: Path of the zip archive to read.
        extract_dir: Directory the members are extracted into.
    """
    archive = zipfile.ZipFile(zip_filename, mode='r')
    try:
        archive.extractall(path=extract_dir)
    finally:
        # Release the archive handle whether or not extraction succeeded
        archive.close()
    print('Extracted a zip file to',extract_dir)

# Get Sharable link of your Zip file in Gdrive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


def _authenticated_drive_client():
    """Authenticate the Colab user and return a PyDrive client using the default credentials."""
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    return GoogleDrive(gauth)


def get_gdrive_link(file_path):
    """
    Look up a file in Google Drive by its file name and build its sharing URL.

    Only the final path component of ``file_path`` is used for the lookup.
    When several files share that name, the first one returned is used.

    Args:
        file_path: Path whose base name is the Drive file title to search for.

    Returns:
        The ``https://drive.google.com/file/d/<id>/view?usp=sharing`` URL of the
        first match, or the string "File not found in Google Drive".
    """
    drive = _authenticated_drive_client()

    # Find the file in Google Drive. Quotes and backslashes in the title are
    # escaped for the query, and files in the trash are excluded.
    file_name = os.path.basename(file_path)
    escaped_name = file_name.replace('\\', '\\\\').replace("'", "\\'")
    file_list = drive.ListFile({'q': f"title='{escaped_name}' and trashed=false"}).GetList()

    # Get the file ID and generate the shareable link
    if file_list:
        file_id = file_list[0]['id']
        gdrive_link = f"https://drive.google.com/file/d/{file_id}/view?usp=sharing"
        return gdrive_link
    else:
        return "File not found in Google Drive"

def get_shareable_link(url):
    """
    Grant "anyone: reader" permission on the Drive file behind ``url``.

    Args:
        url: Google Drive URL of the file.

    Returns:
        The file's 'alternateLink' metadata value, or None when the URL holds
        no file ID or the Drive request fails (the error is printed).
    """
    file_id = extract_file_id_from_url(url)
    if not file_id:
        # Skip the authentication prompt when there is nothing to share
        print("Error:", "no Google Drive file id found in", url)
        return None

    drive = _authenticated_drive_client()

    try:
        file_obj = drive.CreateFile({'id': file_id})
        file_obj.FetchMetadata()
        file_obj.InsertPermission({
            'type': 'anyone',
            'value': 'anyone',
            'role': 'reader'
        })

        # Get the shareable link
        return file_obj['alternateLink']
    except Exception as e:
        # Best effort: report the failure and let the caller carry on
        print("Error:", e)
        return None

# Method Generative Start
In this section you will write all details of your Method 1.

You will have to enter multiple code and text cell.

Your code should follow the standard ML pipeline

Data reading
Data cleaning, if any
Convert data to vector/tokenization/vectorization
Model Declaration/Initialization/building
Training and validation of the model using training and validation dataset
Save the trained model
Load and Test the model on testing set
Save the output of the model
You could add any other step(s) based on your method's requirement.

After finishing the above, you need to use the split data as defined in the assignment and then do the same for all 4 sets. Your code should not be copy-pasted 4 times; make use of functions.

## Training Generative Method Code
Your training code should be stand-alone code that must take `train_file`, `val_file`, and `model_dir` as input. You could have other inputs as well, but these three are a must. You would load both files, train using the `train_file` and validate using the `val_file`. You will `print` / `display` / `plot` all performance metrics and the loss (if available), and save the output model in the `model_dir`.

Note that at testing time you need to use the same pre-processing and model. So it would be good to make those a separate function/pipeline, whichever is best suited to your method. Don't copy-paste the same code twice; make it a function/class, whichever is best.

In [None]:
def train_Gen(train_file, val_file, model_dir):
    """
    Train the Generative Model using Gaussian Naive Bayes.

    The comments are cleaned, turned into dense TF-IDF features and used to
    fit a GaussianNB classifier, which is evaluated on the validation file.
    The classifier ('model.sav') and the fitted vectorizer ('vectorizer.sav')
    are saved into ``model_dir``. The directory is zipped to
    ``<model_dir>.zip`` and that archive is looked up on Google Drive.

    Args:
        train_file: Path to the training file containing comments.
        val_file: Path to the validation file containing comments.
        model_dir: Path to the directory to save the trained model.

    Returns:
        model_gdrive_link: Google Drive link for the zipped model, or the
        "not found" message returned by get_gdrive_link.
    """
    # The fitted vectorizer is also published as a module-level global
    global gendis_vectorizer

    # Import libraries
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import GaussianNB

    # Perform data cleaning on the train and validation datasets
    train_commentdata = data_commentcleaning(train_file)
    valid_commentdata = data_commentcleaning(val_file)

    # TF-IDF VECTORIZER: fit on the training comments only, reuse for validation.
    # The features are densified with .toarray() before being passed to GaussianNB.
    gendis_vectorizer = TfidfVectorizer()
    X_traincomment = gendis_vectorizer.fit_transform(train_commentdata.comment).toarray()
    # Target variable toxicity for training
    y_traincomment = train_commentdata.toxicity
    X_validcomment = gendis_vectorizer.transform(valid_commentdata.comment).toarray()

    # Gaussian Naive Bayes Classifier
    Gauss_classifier = GaussianNB()
    Gauss_classifier.fit(X_traincomment, y_traincomment)
    # Predict toxicity for validation comments
    y_predcomment = Gauss_classifier.predict(X_validcomment)

    # Calculate performance metrics
    compute_performance(valid_commentdata['toxicity'], y_predcomment)

    # Save the Gaussian Naive Bayes model (this also creates model_dir)
    save_model(Gauss_classifier, model_dir)

    # Save the fitted vectorizer next to the model so it is part of the zip:
    # the model is only usable together with the vocabulary it was trained on.
    vectorizer_file = os.path.join(model_dir, 'vectorizer.sav')
    with open(vectorizer_file, 'wb') as handle:
        pickle.dump(gendis_vectorizer, handle)
    print('Saved vectorizer to ', vectorizer_file)

    # Zip the model directory for sharing; the archive name follows model_dir
    zip_file = os.path.normpath(model_dir) + '.zip'
    zip_directory(model_dir, zip_file)

    # Get the Google Drive link for the zipped model
    model_gdrive_link = get_gdrive_link(zip_file)
    print(model_gdrive_link)

    # Make the file shareable only when the lookup produced a URL
    if str(model_gdrive_link).startswith('http'):
        get_shareable_link(model_gdrive_link)
    else:
        print('Zipped model was not found on Google Drive; no shareable link created.')

    return model_gdrive_link


## Testing Method 1 Code
Your test code should be a stand alone code that must take `test_file`, `model_file` and `output_dir` as input. You could have other things as also input, but these three are must. You would load both files, and generate output based on inputs. Then you will `print` / `display`/ `plot` all performance metrics, and save the output file in the `output_dir`  

In [None]:
def test_Gen(test_file, MODEL_PATH, model_gdrive_link):
    """
    Test the Generative Model on the test data.

    Downloads and extracts the zipped model, predicts a label for every row
    of ``test_file`` and writes the predictions back into that file as the
    column 'out_label_model_Gen'. The other columns, including the raw
    'comment' text, are written back as read. Performance metrics are printed
    when the file has a 'toxicity' column.

    Args:
      test_file: Path to the test file containing comments.
      MODEL_PATH: Path to the directory containing the trained model.
      model_gdrive_link: Google Drive link for the trained model.

    Returns:
      test_file: Path to the output file containing the test results.
    """
    # Clean the test data
    test_gendata = data_commentcleaning(test_file)

    # Paths for the downloaded archive and the extracted model directory
    test_model_file = MODEL_PATH + '/Model_Gen.zip'
    test_model_path = MODEL_PATH + '/Model_Gen/'

    # Download and unzip the model file
    os.makedirs(MODEL_PATH, exist_ok=True)
    download_zip_file_from_link(model_gdrive_link, test_model_file)
    print('Model downloaded to', test_model_file)
    # Archive members already start with 'Model_Gen/', so extract into the
    # parent directory; extracting into Model_Gen/ would nest the files one
    # level too deep and leave a stale local model.sav to be loaded instead.
    unzip_file(test_model_file, MODEL_PATH)
    print('\n Model is downloaded to ', test_model_path)

    # Load the trained model
    pickle_file = os.path.join(test_model_path, 'model.sav')
    model = load_model(pickle_file)

    # Use the vectorizer shipped with the model when there is one; otherwise
    # fall back to the vectorizer left in the kernel by training.
    vectorizer_file = os.path.join(test_model_path, 'vectorizer.sav')
    if os.path.exists(vectorizer_file):
        vectorizer = load_model(vectorizer_file)
    else:
        vectorizer = gendis_vectorizer

    # Transform test comments into TF-IDF features
    X_gentest = vectorizer.transform(test_gendata.comment).toarray()

    # Predict using the model
    y_genpred = model.predict(X_gentest)

    # Add the predictions to the rows as stored on disk, so the raw comments
    # are not replaced by their cleaned form.
    output_data = pd.read_csv(test_file)
    output_data['out_label_model_Gen'] = y_genpred

    # Report performance when reference labels are available
    if 'toxicity' in output_data.columns:
        compute_performance(output_data['toxicity'], y_genpred)

    # Save the model output in the same output file
    output_data.to_csv(test_file, index=False)
    print('\n Output is saved in ', test_file)

    return test_file


In [None]:
def val_gen(val_file, MODEL_PATH, model_gdrive_link):
    """
    Run the Generative Model on the validation data.

    The steps are the same as for the test file, so this delegates to
    test_Gen instead of repeating its body.

    Args:
      val_file: Path to the validation file containing comments.
      MODEL_PATH: Path to the directory containing the trained model.
      model_gdrive_link: Google Drive link for the trained model.

    Returns:
      val_file: Path to the output file, which now includes the
      'out_label_model_Gen' column.
    """
    return test_Gen(val_file, MODEL_PATH, model_gdrive_link)


## Method Generative End


# Method Discriminative Start

In this section you will write all details of your Method 2.

You will have to enter multiple `code` and `text` cell.

Your code should follow the standard ML pipeline


*   Data reading
*   Data cleaning, if any
*   Convert data to vector/tokenization/vectorization
*   Model Declaration/Initialization/building
*   Training and validation of the model using training and validation dataset
*   Save the trained model
*   Load and Test the model on testing set
*   Save the output of the model

You could add any other step(s) based on your method's requirement.

After finishing the above, you need to use the split data as defined in the assignment and then do the same for all 4 sets. Your code should not be copy-pasted 4 times; make use of a `function`.


## Training Method Discriminative Code
Your training code should be stand-alone code that must take `train_file`, `val_file`, and `model_dir` as input. You could have other inputs as well, but these three are a must. You would load both files, train using the `train_file` and validate using the `val_file`. You will `print` / `display` / `plot` all performance metrics and the loss (if available), and save the output model in the `model_dir`.

Note that at testing time you need to use the same pre-processing and model. So it would be good to make those a separate function/pipeline, whichever is best suited to your method. Don't copy-paste the same code twice; make it a function/class, whichever is best.

In [None]:
def train_dis(train_file, val_file, model_dir):
    """
    Train the discriminative model using Gradient Boosting Classifier.

    The comments are cleaned, turned into dense TF-IDF features and used to
    fit a pipeline holding a GradientBoostingClassifier, which is evaluated
    on the validation file. The pipeline ('model.sav') and the fitted
    vectorizer ('vectorizer.sav') are saved into ``model_dir``. The directory
    is zipped to ``<model_dir>.zip`` and that archive is looked up on Google
    Drive.

    Args:
        train_file: File path of the training data.
        val_file: File path of the validation data.
        model_dir: Directory path to save the trained model.

    Returns:
        Google Drive link for the zipped model, or the "not found" message
        returned by get_gdrive_link.
    """
    # Without this declaration the assignment below would only create a local
    # name and the module-level vectorizer would stay undefined or stale.
    global gendis_vectorizer

    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Perform data cleaning on the training and validation datasets
    train_disdata = data_commentcleaning(train_file)
    valid_disdata = data_commentcleaning(val_file)

    # TF-IDF VECTORIZER: fit on the training comments only, reuse for validation
    gendis_vectorizer = TfidfVectorizer()
    X_distrain = gendis_vectorizer.fit_transform(train_disdata.comment).toarray()
    y_distrain = train_disdata.toxicity
    X_disvalid = gendis_vectorizer.transform(valid_disdata.comment).toarray()

    # Define the pipeline (random_state is set so repeated training runs match)
    pipe_dis = Pipeline([('GBC', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=student_id))])
    # Train the pipeline
    pipe_dis.fit(X_distrain, y_distrain)
    # Predict the toxicity
    y_dispred = pipe_dis.predict(X_disvalid)

    # Save the trained model (this also creates model_dir)
    save_model(pipe_dis, model_dir)

    # Save the fitted vectorizer next to the model so it is part of the zip:
    # the model is only usable together with the vocabulary it was trained on.
    vectorizer_file = os.path.join(model_dir, 'vectorizer.sav')
    with open(vectorizer_file, 'wb') as handle:
        pickle.dump(gendis_vectorizer, handle)
    print('Saved vectorizer to ', vectorizer_file)

    # Calculate performance metrics
    compute_performance(valid_disdata['toxicity'], y_dispred)

    # Zip the model directory for sharing; the archive name follows model_dir
    zip_file = os.path.normpath(model_dir) + '.zip'
    zip_directory(model_dir, zip_file)

    # Get the Google Drive link for the zipped model
    model_gdrive_link = get_gdrive_link(zip_file)
    print(model_gdrive_link)

    # Make the file shareable only when the lookup produced a URL
    if str(model_gdrive_link).startswith('http'):
        get_shareable_link(model_gdrive_link)
    else:
        print('Zipped model was not found on Google Drive; no shareable link created.')

    return model_gdrive_link


## Testing Method Discriminative Code
Your test code should be a stand alone code that must take `test_file`, `model_file` and `output_dir` as input. You could have other things as also input, but these three are must. You would load both files, and generate output based on inputs. Then you will `print` / `display`/ `plot` all performance metrics, and save the output file in the `output_dir`  

In [None]:
def test_dis(test_file, MODEL_PATH, model_gdrive_link):
    """
    Test the Discriminative Model on the test data.

    Downloads and extracts the zipped model, predicts a label for every row
    of ``test_file`` and writes the predictions back into that file as the
    column 'out_label_model_Dis'. The other columns, including the raw
    'comment' text, are written back as read. Performance metrics are printed
    when the file has a 'toxicity' column.

    Args:
        test_file: Path to the test file containing comments.
        MODEL_PATH: Path to the directory containing the trained model.
        model_gdrive_link: Google Drive link for the trained model.

    Returns:
        test_file: Path to the output file containing the test results.
    """
    # Clean the test data
    test_disdata = data_commentcleaning(test_file)

    # Paths for the downloaded archive and the extracted model directory
    test_model_file = MODEL_PATH + '/Model_Dis.zip'
    test_model_path = MODEL_PATH + '/Model_Dis/'

    # Download and unzip the model file
    os.makedirs(MODEL_PATH, exist_ok=True)
    download_zip_file_from_link(model_gdrive_link, test_model_file)
    print('Model downloaded to', test_model_file)
    # Archive members already start with 'Model_Dis/', so extract into the
    # parent directory; extracting into Model_Dis/ would nest the files one
    # level too deep and leave a stale local model.sav to be loaded instead.
    unzip_file(test_model_file, MODEL_PATH)
    print('\n Model is downloaded to ', test_model_path)

    # Load the trained model
    pickle_file = os.path.join(test_model_path, 'model.sav')
    model = load_model(pickle_file)

    # Use the vectorizer shipped with the model when there is one; otherwise
    # fall back to the vectorizer left in the kernel by training.
    vectorizer_file = os.path.join(test_model_path, 'vectorizer.sav')
    if os.path.exists(vectorizer_file):
        vectorizer = load_model(vectorizer_file)
    else:
        vectorizer = gendis_vectorizer

    # TF-IDF VECTORIZER
    X_disctest = vectorizer.transform(test_disdata.comment).toarray()

    # Predict using the model
    y_discpred = model.predict(X_disctest)

    # Add the predictions to the rows as stored on disk, so the raw comments
    # are not replaced by their cleaned form.
    output_data = pd.read_csv(test_file)
    output_data['out_label_model_Dis'] = y_discpred

    # Report performance when reference labels are available
    if 'toxicity' in output_data.columns:
        compute_performance(output_data['toxicity'], y_discpred)

    # Save the model output in the same output file
    output_data.to_csv(test_file, index=False)
    print('\n Output is saved in ', test_file)

    return test_file


In [None]:
def val_dis(val_file, MODEL_PATH, model_gdrive_link):
    """
    Run the Discriminative Model on the validation data.

    The steps are the same as for the test file, so this delegates to
    test_dis instead of repeating its body.

    Args:
        val_file: Path to the validation file containing comments.
        MODEL_PATH: Path to the directory containing the trained model.
        model_gdrive_link: Google Drive link for the trained model.

    Returns:
        val_file: Path to the output file, which now includes the
        'out_label_model_Dis' column.
    """
    return test_dis(val_file, MODEL_PATH, model_gdrive_link)



## Discriminative Method  End


# Other Method/model Start

In [None]:
import argparse

# Define argparse-like function
def parse_arguments(option):
    """
    Wrap a menu choice in an argparse namespace.

    The parser is always given an empty argument list, so the returned
    namespace carries ``option`` unchanged as its ``option`` attribute.

    Args:
        option: Value used as the default of the ``--option`` / ``-o`` flag.

    Returns:
        argparse.Namespace with a single attribute, ``option``.
    """
    cli = argparse.ArgumentParser(description='Process some integers.')
    cli.add_argument(
        '--option', '-o',
        type=str,
        default=option,
        help='Description of your option.',
    )
    # Parse an empty argv so nothing from the kernel's command line is read
    return cli.parse_args(args=[])

# Function to perform some action based on selected option
def perform_action(option):
    """
    Run the training / testing / validation step selected in the menu.

    Args:
        option: Menu choice as a string. '0' exits, '1' and '3' train the
            generative and discriminative models, '2' and '4' test them,
            '5' and '6' run them on the validation file. Any other value
            prints an error and shows the menu again.
    """
    print("Performing action with option:", option)

    # Shared links of the zipped trained models (pass the URL, not a variable from training)
    model_gen_url = 'https://drive.google.com/file/d/1--TDKFx7qpOuXgauAVi-k9BVM_0RmPly/view?usp=sharing'
    model_dis_url = 'https://drive.google.com/file/d/1-WHHgGoKmRt9pG-s2XGx9vV6LXFfAPYo/view?usp=sharing'

    if option == '0':
        print('\n Okay Exiting!!! ')

    elif option == '1':
        print('\n Training Generative Model')
        model_gdrive_link = train_Gen(train_file,val_file,MODEL_Gen_DIRECTORY)
        print('Make sure to pass model URL in Testing',model_gdrive_link)

    elif option == '2':
        print('\n\n Pass the URL Not Variable !!!')
        print('\n Testing Generative Model')
        test_Gen(test_file,MODEL_PATH ,model_gen_url)

    elif option == '3':
        print('\n Training Disciminative Model')
        model_gdrive_link = train_dis(train_file,val_file,MODEL_Dis_DIRECTORY)
        print('Make sure to pass model URL in Testing',model_gdrive_link)
        print('\n\n Pass the URL Not Variable !!!')

    elif option == '4':
        print('\n\n Pass the URL Not Variable !!!')
        print('\n Testing Disciminative Model')
        test_dis(test_file, MODEL_PATH, model_dis_url)

    elif option == '5':
        # Generative validation must use the generative model's link and label
        print('\n\n Pass the URL Not Variable !!!')
        print('\n validation Generative Model')
        val_gen(val_file, MODEL_PATH, model_gen_url)

    elif option == '6':
        print('\n\n Pass the URL Not Variable !!!')
        print('\n validation Disciminative Model')
        val_dis(val_file, MODEL_PATH, model_dis_url)

    else:
        print('Wrong Option Selected. \n\nPlease select Correct option')
        # Show the menu again
        main()


def main():
    """Show the menu, read the user's choice and run the matching action."""
    menu_lines = (
        "0. To Exit Code\n",
        "1. Train Model Generative\n",
        "2. Test Model Generative\n",
        "3. Train Model Discriminative\n",
        "4. Test Model Discriminative\n",
        "5. Valid Model Generative\n",
        "6. Valid Model Discriminative\n",
        "Enter your option: ",
    )

    # Get option from user input
    user_option = input("".join(menu_lines))

    # Route the raw choice through the argparse wrapper before dispatching
    perform_action(parse_arguments(user_option).option)

In [None]:
# Show the menu and run the selected action
main()