In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re, string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# For model-building
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn import metrics

# Bag of Words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# For word embedding
import gensim
from gensim.models import Word2Vec
import pysbd
import warnings
import matplotlib.pyplot as plt

# Ignoring warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /Users/kimiazand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kimiazand/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kimiazand/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Read the annotated sentence-level results from the CSV file.
# The file name is passed as a quoted string literal, relative to the notebook's working directory.
data = pd.read_csv('mimic-iii-random-500-subjects-cannabis-use-keyword-search-sentence-results-annotated-all-sorted.csv', sep=',')

# Split the 'match sentence' texts and their 'Label' values into training (80%) and testing (20%) sets.
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(data['match sentence'], data['Label'], test_size=0.2, random_state=42)

In [None]:
# Wrap each split in a single-column DataFrame: the sentence texts are the data
# and the matching labels are used as the row index.
Train = pd.DataFrame(data=pd.Series(X_train).array, index=pd.Series(y_train).array)
Test = pd.DataFrame(data=pd.Series(X_test).array, index=pd.Series(y_test).array)

# Write both splits to CSV files in the split directory (test set first, then training set).
# NOTE(review): the training file name is spelled 'cannbis'; it is kept as-is so that
# the write and the read below keep pointing at the same file.
path = '/split/'
Test.to_csv(path + 'cannabis_Test.csv')
Train.to_csv(path + 'cannbis_Train.csv')

# Load the two files back from the same directory.
test_set = pd.read_csv(path + 'cannabis_Test.csv')
train_set = pd.read_csv(path + 'cannbis_Train.csv')

In [None]:
# Shared pysbd segmenter; it is created on the first call to seg() and reused afterwards,
# so that segmenting many texts in a loop does not rebuild it for every text.
_SEGMENTER = None


def seg(data):
    """Split a text string into sentences using pysbd.

    Args:
        data: The text to segment.

    Returns:
        Whatever ``pysbd.Segmenter.segment`` returns for ``data``, i.e. the
        sentences found in the text.
    """
    global _SEGMENTER
    if _SEGMENTER is None:
        # English segmenter; clean=False is passed through to pysbd unchanged
        _SEGMENTER = pysbd.Segmenter(language="en", clean=False)
    return _SEGMENTER.segment(data)

In [None]:
# Segment every training text; each entry of seg_train is the result of seg() for one text
seg_train = [seg(text) for text in X_train]

# Do the same for every test text
seg_test = [seg(text) for text in X_test]

In [None]:
# Shared WordNet lemmatizer instance used by the lemmatization code below
wl = WordNetLemmatizer()


def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    The tag is matched on its prefix: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # No known prefix matched: treat the word as a noun
    return wordnet.NOUN
    
# Lemmatize a text word by word, using each word's part-of-speech tag to choose the lemma
def lemmatizer(string):
    """Return ``string`` with every token replaced by its lemma.

    The text is tokenized into words, each word is tagged with its part of
    speech, and the tag (mapped through ``get_wordnet_pos``) is handed to the
    WordNet lemmatizer ``wl`` so it can pick the appropriate lemma.

    Args:
        string: The text to lemmatize.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    # The tagger is reached through the `nltk` package, which the import cell brings in;
    # a bare `pos_tag` name is not imported anywhere in the notebook.
    tagged_words = nltk.pos_tag(word_tokenize(string))
    lemmas = [wl.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged_words]
    return " ".join(lemmas)

In [None]:
# Lemmatize every segmented sentence of the training and test data.
# The nested shape is preserved: one inner list of lemmatized sentences per original text.
#
# Comprehensions are used on purpose: their loop variables stay local, so nothing is
# left behind at notebook level. A top-level loop variable named `string` would rebind
# the global name and hide the `string` module imported in the first cell.
lem_seg_train = [[lemmatizer(sentence) for sentence in sentences] for sentences in seg_train]
lem_seg_test = [[lemmatizer(sentence) for sentence in sentences] for sentences in seg_test]

In [None]:
# Build 'df1' from the lemmatized training sentences: each inner list of 'lem_seg_train' becomes one row
df1 = pd.DataFrame(data=lem_seg_train)

# Build 'df2' from the lemmatized test sentences in the same way
df2 = pd.DataFrame(data=lem_seg_test)

In [None]:
# Collapse one row into a single space-separated string, skipping missing values
def join_columns(row):
    """Join the non-null values of ``row`` into one string separated by spaces."""
    present = row.dropna()
    as_text = present.astype(str)
    return ' '.join(as_text)

# Collapse every row of 'df1' / 'df2' into a single string with 'join_columns'
X_train_df = df1.apply(join_columns, axis='columns')
X_test_df = df2.apply(join_columns, axis='columns')

# From here on 'X_train' and 'X_test' refer to these joined, lemmatized texts
X_train, X_test = X_train_df, X_test_df

In [None]:
# Turn the texts into TF-IDF features over word 1-grams up to 5-grams.
# The vectorizer is fitted on the training texts only and then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(ngram_range=(1,5))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Define the parameter grid to search over.
# C is the inverse regularization strength and must be strictly positive, so the grid
# starts at 0.1 (and includes the library default 1) instead of the invalid value 0.
param_grid = {
    'C': [0.1, 1, 2, 4, 8, 10, 12, 14, 16, 18, 20],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # liblinear supports both the l1 and the l2 penalty
}

# Train a logistic regression classifier using GridSearchCV to find the best hyperparameters.
# max_iter is set high enough for the solver to converge; warnings are silenced globally in the
# import cell, so a too-small iteration budget would go unnoticed.
# multi_class is left at its default: with the liblinear solver that resolves to one-vs-rest,
# and the explicit argument is deprecated in recent scikit-learn releases.
clf = LogisticRegression(max_iter=1000)
# NOTE(review): cv=100 means 100 folds per parameter combination (2200 fits in total) and
# very small validation folds; the SVM search in this notebook uses cv=5 - confirm this is intended.
grid_search = GridSearchCV(clf, param_grid, cv=100, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)
clf = grid_search.best_estimator_

# Predict the labels for the test set with the best estimator found by the search
y_pred = clf.predict(X_test_tfidf)

# Print the best hyperparameters and classification report
print("Best hyperparameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred))

In [None]:
# Define the TF-IDF vectorizer (word 1-grams up to 5-grams) and the linear SVM model
tfidf = TfidfVectorizer(ngram_range=(1, 5))
svm = LinearSVC()

# Define the pipeline with TF-IDF vectorizer and SVM model.
# Because the vectorizer is inside the pipeline, it is re-fitted on the training part of every CV fold.
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('svm', svm)
])

# Define the hyperparameters to search over.
# The l1 penalty is only supported by LinearSVC's primal solver, so it is paired with dual=False;
# the l2 penalty keeps the estimator's default solver setting.
param_grid = [
    {'svm__penalty': ['l2'], 'svm__C': [0.1, 1, 10]},
    {'svm__penalty': ['l1'], 'svm__dual': [False], 'svm__C': [0.1, 1, 10]},
]

# Define the GridSearchCV object to search over the hyperparameters
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

# Train the GridSearchCV object on the training data (raw texts; the pipeline vectorizes them)
grid_search.fit(X_train, y_train)

# Predict the test labels with the refitted best pipeline, so that the report below
# describes this SVM model rather than predictions left over from an earlier cell
y_pred = grid_search.predict(X_test)

# Print the best hyperparameters, the best cross-validation score and the test-set report
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)
print(classification_report(y_test, y_pred))