In [421]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer


# nltk.download('stopwords')
# nltk.download('wordnet')

# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Import Tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Enable Jupyter Notebook's intellisense
%config IPCompleter.greedy=True

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

# Import the train test split
from sklearn.model_selection import train_test_split

In [422]:
import pandas as pd

data = pd.read_csv(
    r"C:\Users\Simrun Sharma\Desktop\NLP\NLP_final\01_intermediate-files\smokers_train_all_separated.csv"
)
print(data.sample(6))

     Smoking Status                                               Text
223         UNKNOWN  report status: unsigned\r\ned discharge notifi...
6    CURRENT SMOKER  report status: signed\r\ndischarge summary nam...
215         UNKNOWN  report status: unsigned admission date: 02/22/...
303     PAST SMOKER  admission date: 09/20/2000 report status: sign...
335         UNKNOWN  admission date: 11/16/2005 report status:  dis...
137         UNKNOWN  report status: unsigned\r\ndischarge summary n...


In [423]:
data["Smoking Status’"] = data["Smoking Status"].replace("SMOKER", "PAST SMOKER")

In [424]:
le = LabelEncoder()
data["Smoking_enc"] = le.fit_transform(data["Smoking Status"])
display(data.sample(6))

Unnamed: 0,Smoking Status,Text,Smoking Status’,Smoking_enc
345,UNKNOWN,admission date: 03/25/2003 report status: sign...,UNKNOWN,4
280,NON-SMOKER,admission date: 10/14/2004 report status: sign...,NON-SMOKER,1
272,CURRENT SMOKER,admission date: 07/08/1995 report status: sign...,CURRENT SMOKER,0
38,NON-SMOKER,admission date: 09/16/2004 report status: sign...,NON-SMOKER,1
66,PAST SMOKER,admission date: 03/30/1993 report status: sign...,PAST SMOKER,2
335,UNKNOWN,admission date: 11/16/2005 report status: dis...,UNKNOWN,4


In [425]:
data["Smoking_enc"] = data["Smoking_enc"].astype("int64")
print(data.dtypes)

Smoking Status     object
Text               object
Smoking Status’    object
Smoking_enc         int64
dtype: object


In [426]:
print(data["Smoking Status"].value_counts())

UNKNOWN           252
NON-SMOKER         66
PAST SMOKER        36
CURRENT SMOKER     35
SMOKER              9
Name: Smoking Status, dtype: int64


In [427]:
import re

clean_messages = data["Text"].str.lower()
clean_text = []
for message in clean_messages:
    pattern = r"(\S+\s){0,5}\S*(smok|tobacco|cigar|pack|ppd)\S*(\s\S+){0,5}"
    match = re.search(pattern, message, re.IGNORECASE)

    if match:
        matched_text = match.group(0)
        clean_text.append(matched_text)
    else:
        sentence = "no information"
        clean_text.append(sentence)

data["Text"] = clean_text
print(data["Text"])

0      to excess , pipe and cigar smoker for many yea...
1       the patient has a 20 pack-year smoking history .
2      has been smoking approximately 10 cigarettes a...
3      and vomiting . social history: smoker for grea...
4                   1-2 packs per day . hospital course:
                             ...                        
393                                       no information
394                                       no information
395          a / p repair vag pack / foley , ebl minimal
396                                       no information
397                                       no information
Name: Text, Length: 398, dtype: object


In [428]:
no_info_messages = data["Text"].str.contains("no information").sum()
print(no_info_messages)

230


In [429]:
from sklearn.feature_extraction.text import CountVectorizer

smoking_status = data["Smoking Status"]
smoking_enc = data["Smoking_enc"]
# Transform the data with the count vectorizer
count_vect = CountVectorizer(max_features=6, stop_words="english")

data = count_vect.fit_transform(
    data["Text"]
).toarray()  # Returns a sparse matric which we convert to an array

# Put together a df with the results
train_bow_df = pd.DataFrame(
    data, columns=count_vect.get_feature_names_out(), index=smoking_status
)
train_bow_df[
    "Smoking_enc"
] = smoking_enc.tolist()  # tolist() needed since indices are different
train_bow_df

# # Apply the remove_most_recurring_words function to each element in the "text" column
# data["Text"] = data["Text"].apply(remove_most_recurring_words)
# print(data["Text"])

Unnamed: 0_level_0,alcohol,history,information,smoking,tobacco,use,Smoking_enc
Smoking Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CURRENT SMOKER,0,0,0,0,0,0,0
CURRENT SMOKER,0,1,0,1,0,0,0
CURRENT SMOKER,0,0,0,1,0,0,0
CURRENT SMOKER,0,1,0,0,0,0,0
CURRENT SMOKER,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
UNKNOWN,0,0,1,0,0,0,4
UNKNOWN,0,0,1,0,0,0,4
UNKNOWN,0,0,0,0,0,0,4
UNKNOWN,0,0,1,0,0,0,4


In [430]:
# tokenized_messages = data["Text"].str.lower().apply(word_tokenize)
# print(tokenized_messages)

In [431]:
# data["Text"] = tokenized_messages
# data.sample(6)

In [432]:
X_train = train_bow_df.drop(columns="Smoking_enc")
y_train = train_bow_df["Smoking_enc"]

print(X_train)

# X_train = data["Text"]
# y_train = data["Smoking_enc"]

                alcohol  history  information  smoking  tobacco  use
Smoking Status                                                      
CURRENT SMOKER        0        0            0        0        0    0
CURRENT SMOKER        0        1            0        1        0    0
CURRENT SMOKER        0        0            0        1        0    0
CURRENT SMOKER        0        1            0        0        0    0
CURRENT SMOKER        0        0            0        0        0    0
...                 ...      ...          ...      ...      ...  ...
UNKNOWN               0        0            1        0        0    0
UNKNOWN               0        0            1        0        0    0
UNKNOWN               0        0            0        0        0    0
UNKNOWN               0        0            1        0        0    0
UNKNOWN               0        0            1        0        0    0

[398 rows x 6 columns]


In [433]:
print(X_train.shape)
print(y_train.shape)

(398, 6)
(398,)


In [434]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=34, stratify=y_train
)

# X_train = X_train.astype(str)
# X = X.astype(str)
# X_test = X_test.astype(str)

In [435]:
# # Create the tf-idf vectorizer
# vectorizer = TfidfVectorizer(strip_accents="ascii")


# # First fit the vectorizer with our training set
# tfidf_train = vectorizer.fit_transform(X_train)
# print(tfidf_train)
# print(tfidf_train.shape)
# print(vectorizer.get_feature_names_out())

# Now we can fit our test data with the same vectorizer
# tfidf_test = vectorizer.transform(X_test)

In [436]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

alpha = 0.5
nb = MultinomialNB(alpha=alpha)
scores = cross_validate(nb, X_train, y_train, return_train_score=True)
scores = pd.DataFrame(scores)
print(scores.head())
# alpha = 0.001  # You can adjust the value of alpha

# # Initialize the Multinomial Naive Bayes classifier
# nb = MultinomialNB(alpha=alpha)

# # Fit the model
nb.fit(X_train, y_train)

# # Print the accuracy score
print("Accuracy:", nb.score(X_test, y_test))

   fit_time  score_time  test_score  train_score
0  0.005830    0.002241    0.781250     0.787402
1  0.006034    0.002998    0.781250     0.795276
2  0.005015    0.004255    0.765625     0.795276
3  0.005506    0.002517    0.793651     0.784314
4  0.005716    0.003541    0.793651     0.788235
Accuracy: 0.775
