In [None]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp39-cp39-linux_x86_64.whl size=4395609 sha256=630e683be7c00411d443d5eed6261695822052101b78d6e5044c8eab9c8086e7
  Stored in directory: /root/.cache/pip/wheels/64/57/bc/1741406019061d5664914b070bd3e71f6244648732bc96109e
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.4


In [None]:
#@title Setup & Config
#importing important libraries
import fasttext
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap


%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 8, 6



### Connecting to Google Drive from Colab

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Directory holding the pre-split train/test CSV files.
path = '/split/'

# Read both splits with the same call; the tuple order is (train, test).
# NOTE(review): the train file name is spelled 'cannbis' while the test file is
# 'cannabis' -- confirm this matches the files on disk.
df_train, df_test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('cannbis_Train.csv', 'cannabis_Test.csv')
)

In [None]:
# Print the schema / null counts of both splits.
# DataFrame.info() prints its summary and returns None, so the calls are made as
# separate statements instead of a tuple expression, which would make the cell
# echo a meaningless `(None, None)` result.
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Label           1007 non-null   int64 
 1   match_sentence  1007 non-null   object
dtypes: int64(1), object(1)
memory usage: 15.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Label           253 non-null    int64 
 1   match_sentence  253 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.1+ KB


(None, None)

In [None]:
# Build the fastText label column for the train set: '__label__' followed by
# the string form of the original class id.
df_train['label'] = '__label__' + df_train['Label'].astype(str)
df_train.head()

Unnamed: 0,Label,match_sentence,label
0,0,WET READ VERSION #1 __________________________...,__label__0
1,1,"SOCIAL HISTORY: Tobacco one pack per day, occa...",__label__1
2,1,"Over the last week he has had little appetite,...",__label__1
3,0,"P: Continue to monitor, check CBG this a.m.",__label__0
4,1,Experimentation with marijuana.,__label__1


In [None]:
# Drop the original 'Label' column now that the prefixed 'label' column exists.
df_train = df_train.drop(columns=['Label'])
df_train.head()

Unnamed: 0,match_sentence,label
0,WET READ VERSION #1 __________________________...,__label__0
1,"SOCIAL HISTORY: Tobacco one pack per day, occa...",__label__1
2,"Over the last week he has had little appetite,...",__label__1
3,"P: Continue to monitor, check CBG this a.m.",__label__0
4,Experimentation with marijuana.,__label__1


In [None]:
# Build the train 'label_text' column: "<label> <sentence>", joined by one space.
df_train['label_text'] = df_train['label'].str.cat(df_train['match_sentence'], sep=' ')
df_train.head(5)

Unnamed: 0,match_sentence,label,label_text
0,WET READ VERSION #1 __________________________...,__label__0,__label__0 WET READ VERSION #1 _______________...
1,"SOCIAL HISTORY: Tobacco one pack per day, occa...",__label__1,__label__1 SOCIAL HISTORY: Tobacco one pack pe...
2,"Over the last week he has had little appetite,...",__label__1,__label__1 Over the last week he has had littl...
3,"P: Continue to monitor, check CBG this a.m.",__label__0,"__label__0 P: Continue to monitor, check CBG t..."
4,Experimentation with marijuana.,__label__1,__label__1 Experimentation with marijuana.


In [None]:
# Keep only 'label' and 'label_text'; the raw sentence column is no longer needed.
df1 = df_train.drop(columns=['match_sentence'])
df1.head()

Unnamed: 0,label,label_text
0,__label__0,__label__0 WET READ VERSION #1 _______________...
1,__label__1,__label__1 SOCIAL HISTORY: Tobacco one pack pe...
2,__label__1,__label__1 Over the last week he has had littl...
3,__label__0,"__label__0 P: Continue to monitor, check CBG t..."
4,__label__1,__label__1 Experimentation with marijuana.


### Preparing the test dataset for fastText

In [None]:
# Build the fastText label column for the test set: '__label__' followed by
# the string form of the original class id.
df_test['label'] = '__label__' + df_test['Label'].astype(str)
df_test.head()

Unnamed: 0,Label,match_sentence,label
0,0,Rate increased by 2 this afternoon following a...,__label__0
1,0,Spoke with her about starting plans to get inf...,__label__0
2,1,"Denies any ETOH, occassional marijuana, used t...",__label__1
3,1,"[**3-30**] drinks per week, smokes 1 ppd x 5 y...",__label__1
4,2,The patient has a positive history of alcohol ...,__label__2


In [None]:
# Drop the original 'Label' column now that the prefixed 'label' column exists.
df_test = df_test.drop(columns=['Label'])
df_test.head()

Unnamed: 0,match_sentence,label
0,Rate increased by 2 this afternoon following a...,__label__0
1,Spoke with her about starting plans to get inf...,__label__0
2,"Denies any ETOH, occassional marijuana, used t...",__label__1
3,"[**3-30**] drinks per week, smokes 1 ppd x 5 y...",__label__1
4,The patient has a positive history of alcohol ...,__label__2


In [None]:
# Build the test 'label_text' column: "<label> <sentence>", joined by one space.
df_test['label_text'] = df_test['label'].str.cat(df_test['match_sentence'], sep=' ')
df_test.head(5)

Unnamed: 0,match_sentence,label,label_text
0,Rate increased by 2 this afternoon following a...,__label__0,__label__0 Rate increased by 2 this afternoon ...
1,Spoke with her about starting plans to get inf...,__label__0,__label__0 Spoke with her about starting plans...
2,"Denies any ETOH, occassional marijuana, used t...",__label__1,"__label__1 Denies any ETOH, occassional mariju..."
3,"[**3-30**] drinks per week, smokes 1 ppd x 5 y...",__label__1,"__label__1 [**3-30**] drinks per week, smokes ..."
4,The patient has a positive history of alcohol ...,__label__2,__label__2 The patient has a positive history ...


In [None]:
# Keep only 'label' and 'label_text'; the raw sentence column is no longer needed.
df2 = df_test.drop(columns=['match_sentence'])
df2.head()

Unnamed: 0,label,label_text
0,__label__0,__label__0 Rate increased by 2 this afternoon ...
1,__label__0,__label__0 Spoke with her about starting plans...
2,__label__1,"__label__1 Denies any ETOH, occassional mariju..."
3,__label__1,"__label__1 [**3-30**] drinks per week, smokes ..."
4,__label__2,__label__2 The patient has a positive history ...


### Preprocessing the text column

In [None]:
import re

# Worked example of the cleaning steps on a sample string:
# punctuation (except apostrophes) becomes a space, runs of spaces are squeezed
# to one, and the displayed result is trimmed and lower-cased.
txt = "  NINA's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
txt = re.sub(' +', ' ', re.sub(r'[^\w\s\']', ' ', txt))
txt.strip().lower()

"nina's bookcase bookshelf 3 shelf shelve white hi"

In [None]:
# Text normalisation applied to every example before it is handed to fastText.
def preprocess(txt):
    """Normalise one example to a single lower-case, space-separated line.

    Steps:
      1. Replace every character that is not a word character, whitespace or an
         apostrophe with a space (underscores count as word characters, so a
         leading '__label__N' token is left intact).
      2. Collapse every run of whitespace -- spaces, tabs and newlines -- into a
         single space.
      3. Strip leading/trailing whitespace and lower-case the result.

    Args:
        txt: Raw text of one example.

    Returns:
        The cleaned text as a single line.
    """
    txt = re.sub(r'[^\w\s\']', ' ', txt)
    # Collapse *all* whitespace, not just spaces: fastText reads one example per
    # line and model.predict() rejects text containing '\n', so an embedded
    # newline or tab must not survive preprocessing.
    txt = re.sub(r'\s+', ' ', txt)
    return txt.strip().lower()

In [None]:
# Clean every train example ("<label> <sentence>") with preprocess().
df1['new_label'] = df1['label_text'].apply(preprocess)
df1.head()

Unnamed: 0,label,label_text,new_label
0,__label__0,__label__0 WET READ VERSION #1 _______________...,__label__0 wet read version 1 ________________...
1,__label__1,__label__1 SOCIAL HISTORY: Tobacco one pack pe...,__label__1 social history tobacco one pack per...
2,__label__1,__label__1 Over the last week he has had littl...,__label__1 over the last week he has had littl...
3,__label__0,"__label__0 P: Continue to monitor, check CBG t...",__label__0 p continue to monitor check cbg thi...
4,__label__1,__label__1 Experimentation with marijuana.,__label__1 experimentation with marijuana


In [None]:
# Final train frame: the label plus the cleaned "<label> <sentence>" string.
train_df = df1.drop(columns=['label_text'])
train_df.head()

Unnamed: 0,label,new_label
0,__label__0,__label__0 wet read version 1 ________________...
1,__label__1,__label__1 social history tobacco one pack per...
2,__label__1,__label__1 over the last week he has had littl...
3,__label__0,__label__0 p continue to monitor check cbg thi...
4,__label__1,__label__1 experimentation with marijuana


### Test data preparation

In [None]:
# Clean every test example ("<label> <sentence>") with preprocess().
df2['new_label'] = df2['label_text'].apply(preprocess)
df2.head()

Unnamed: 0,label,label_text,new_label
0,__label__0,__label__0 Rate increased by 2 this afternoon ...,__label__0 rate increased by 2 this afternoon ...
1,__label__0,__label__0 Spoke with her about starting plans...,__label__0 spoke with her about starting plans...
2,__label__1,"__label__1 Denies any ETOH, occassional mariju...",__label__1 denies any etoh occassional marijua...
3,__label__1,"__label__1 [**3-30**] drinks per week, smokes ...",__label__1 3 30 drinks per week smokes 1 ppd x...
4,__label__2,__label__2 The patient has a positive history ...,__label__2 the patient has a positive history ...


In [None]:
# Final test frame: the label plus the cleaned "<label> <sentence>" string.
test_df = df2.drop(columns=['label_text'])
test_df.head()

Unnamed: 0,label,new_label
0,__label__0,__label__0 rate increased by 2 this afternoon ...
1,__label__0,__label__0 spoke with her about starting plans...
2,__label__1,__label__1 denies any etoh occassional marijua...
3,__label__1,__label__1 3 30 drinks per week smokes 1 ppd x...
4,__label__2,__label__2 the patient has a positive history ...


In [None]:
# (rows, columns) of the prepared train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((1007, 2), (253, 2))

In [None]:
# Write the train/test examples in fastText's input format: one example per
# line, "__label__<x> <text>".
#
# 'new_label' already holds the complete "<label> <text>" string, so only that
# column is written. Writing both 'label' and 'new_label' through
# DataFrame.to_csv(sep=' ') duplicated the label and, because the text field
# contains the separator, made pandas wrap it in double quotes. The file then
# contained a spurious '"__label__<x>' word (a label-derived feature) and a
# stray '"' glued to the last word of every example.
def _write_fasttext_file(examples, path):
    """Write each example in `examples` as one line of the text file `path`."""
    with open(path, 'w', encoding='utf-8') as handle:
        for example in examples:
            # fastText treats a newline as the end of an example, so squeeze any
            # embedded newlines/tabs into single spaces before writing.
            handle.write(' '.join(str(example).split()) + '\n')


_write_fasttext_file(train_df['new_label'], 'texts_train.txt')
_write_fasttext_file(test_df['new_label'], 'texts_test.txt')


In [None]:
import time

In [None]:
# Train the supervised fastText classifier on the training file and print the
# elapsed training time in seconds.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring durations, whereas the
# wall-clock time can jump if the system clock is adjusted.
start = time.perf_counter()
model = fasttext.train_supervised('texts_train.txt', wordNgrams=3, epoch = 150, lr = 0.8)
end = time.perf_counter()
print(end - start)

3.4939093589782715


In [None]:
# Predict the top label for every test example and collect the gold labels.
#
# test_df['new_label'] holds "<gold label> <text>", so the leading label token
# is removed before the text is given to the model: the evaluation input must
# not contain the answer, and the result should not depend on how fastText
# happens to treat label tokens inside prediction input.
# Columns are selected by name rather than by position so a reordered frame
# cannot silently swap text and labels.
def _strip_label_prefix(example):
    """Return `example` without its leading '__label__...' token, if present."""
    first_token, _, remainder = example.partition(' ')
    return remainder if first_token.startswith('__label__') else example


y_pred = test_df['new_label'].apply(lambda x: model.predict(_strip_label_prefix(x))[0][0])
y_true = test_df['label']

In [None]:
# Display the gold labels of the test set.
y_true

0      __label__0
1      __label__0
2      __label__1
3      __label__1
4      __label__2
          ...    
248    __label__2
249    __label__0
250    __label__1
251    __label__1
252    __label__2
Name: label, Length: 253, dtype: object

In [None]:
# Display the model's predicted label for each test example.
y_pred

0      __label__0
1      __label__0
2      __label__3
3      __label__1
4      __label__2
          ...    
248    __label__2
249    __label__0
250    __label__1
251    __label__1
252    __label__2
Name: new_label, Length: 253, dtype: object

### Classification report generation

In [None]:
# Per-class precision, recall and F1 of the predictions against the gold labels.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

  __label__0       0.97      0.99      0.98       135
  __label__1       0.84      0.81      0.82        67
  __label__2       0.77      0.83      0.80        41
  __label__3       0.75      0.60      0.67        10

    accuracy                           0.90       253
   macro avg       0.83      0.81      0.82       253
weighted avg       0.90      0.90      0.90       253

