In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle

In [None]:
# Make Google Drive visible inside the Colab VM so the dataset can be copied
# out of it in the next cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!cp '/content/drive/MyDrive/All Projects/TraumaChat/dataset_v2.xlsx' -d '/content/'

In [None]:
# Read the spreadsheet (path is relative to the current working directory)
# and preview the first five rows.
DATASET_FILE = 'dataset_v2.xlsx'
df = pd.read_excel(DATASET_FILE)
df.head(5)

Unnamed: 0,Sno,Text,Category
0,1,I've been feeling overwhelmed lately with work...,Trauma
1,2,Life has been quite stressful for me recently....,Trauma
2,3,I've been enjoying some quality time with fami...,Non-Trauma
3,4,I've been feeling anxious and restless lately....,Trauma
4,5,I'm grateful for the support system I have. It...,Non-Trauma


In [None]:
# Show the column labels pandas parsed from the spreadsheet.
df.columns

Index(['Sno', 'Text', 'Category'], dtype='object')

In [None]:
# Class balance: number of rows per value of the 'Category' column.
category_counts = df['Category'].value_counts()
category_counts

Trauma        102
Non-Trauma     98
Name: Category, dtype: int64

In [None]:
import plotly.graph_objects as go

# Count rows per category once; bar labels and bar heights both come from
# this single Series.
category_counts = df['Category'].value_counts()
sentiment_labels = category_counts.index
sentiment_values = category_counts.values

# One semi-transparent blue bar per category, with a solid outline.
category_bars = go.Bar(
    x=sentiment_labels,
    y=sentiment_values,
    marker={
        "color": 'rgba(55, 128, 191, 0.7)',
        "line": {"color": 'rgba(55, 128, 191, 1.0)', "width": 2},
    },
    opacity=0.6,
)
plot_figure = go.Figure(data=[category_bars])

# Titles, axis labels and a light grey background; grid lines are hidden.
plot_figure.update_layout(
    title_text="Sentiments Overview",
    xaxis={"title": "Sentiment Types", "showgrid": False},
    yaxis={"title": "Count", "showgrid": False},
    plot_bgcolor='rgba(240, 240, 240, 0.95)',
    paper_bgcolor='rgba(245, 245, 245, 1)',
)

# Render the chart in the notebook output.
plot_figure.show()


In [None]:
# `dataset` is the same object as `df` (no copy is made), so the 'Label'
# column added below is visible through both names.
dataset = df

# Encode the target as an integer: 1 = Trauma, 0 = Non-Trauma.
# `.map` is used rather than `.replace`: replacing strings with ints on an
# object column relies on implicit downcasting, which pandas 2.2 deprecates
# (FutureWarning). `.map` yields an integer column directly.
LABEL_BY_CATEGORY = {'Trauma': 1, 'Non-Trauma': 0}
dataset['Label'] = dataset['Category'].map(LABEL_BY_CATEGORY)

# `.map` turns any category missing from the mapping into NaN; fail loudly
# instead of letting an unlabeled row slip into training.
assert dataset['Label'].notna().all(), "Unexpected value in 'Category' column"

# Keep only the model input (Text) and target (Label).
selected_data = dataset[["Text", "Label"]]

# head(1000) is an upper bound: a frame with fewer than 1000 rows is
# returned whole.
sample_data = selected_data.head(1000)
sample_data.shape


(200, 2)

# Cleaning Text

In [None]:
import nltk

# Fetch the WordNet corpus that NLTK's WordNetLemmatizer reads from.
WORDNET_PACKAGE = 'wordnet'
nltk.download(WORDNET_PACKAGE)

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import WordNetLemmatizer
import string

# Shared NLP resources, created once at module level and reused for every
# document passed to preprocess_text.
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp_processor = spacy.load(SPACY_MODEL_NAME)
lemmatizer_tool = WordNetLemmatizer()

def preprocess_text(input_text):
    """Normalize one document into a lowercase, space-separated string of lemmas.

    Steps, in order:
      1. Tokenize with spaCy and drop every token that belongs to a named
         entity, plus every stop-word token.
      2. Lowercase, turn "</br>" and "-" into spaces, and strip punctuation
         and digits.
      3. Drop stop words again (hyphen splitting can expose new ones).
      4. Lemmatize each remaining word with WordNet (NLTK's default part of
         speech) and join the words with single spaces.

    Parameters
    ----------
    input_text : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned text; empty if nothing survives filtering.
    """
    processed_text = nlp_processor(input_text)

    # Filter at token level:
    # * `ent_type_` is non-empty for every token inside an entity span, so
    #   multi-word entities are removed too. Comparing a single token's text
    #   against whole entity strings only ever matched one-token entities.
    # * Contraction clitics ("'ve", "'m", "n't", ...) are in STOP_WORDS only
    #   while they still carry their apostrophe, so they must be dropped
    #   before punctuation is stripped; otherwise they survive as stray
    #   "ve" / "m" / "nt" tokens.
    kept_tokens = [
        token.text
        for token in processed_text
        if not token.ent_type_ and token.lower_ not in STOP_WORDS
    ]
    clean_text = " ".join(kept_tokens)

    # Character-level normalization.
    clean_text = clean_text.lower().strip()
    clean_text = clean_text.replace("</br>", " ")
    clean_text = clean_text.replace("-", " ")
    clean_text = "".join(
        char
        for char in clean_text
        if char not in string.punctuation and not char.isdigit()
    )

    # Second stop-word pass: replacing "-" with a space can split a hyphenated
    # token into pieces that are themselves stop words.
    remaining_words = [word for word in clean_text.split() if word not in STOP_WORDS]

    # Lemmatize whole words and re-join with spaces. Iterating over the string
    # itself would hand the lemmatizer one character at a time, which leaves
    # the text unchanged.
    return " ".join(lemmatizer_tool.lemmatize(word) for word in remaining_words)

# Clean every document and store the result back into the 'Text' column.
# This overwrites the raw text in place.
cleaned_text = dataset['Text'].map(preprocess_text)
dataset['Text'] = cleaned_text
dataset.head(5)

Unnamed: 0,Sno,Text,Category,Label
0,1,ve feeling overwhelmed lately work personal is...,Trauma,1
1,2,life stressful recently m having trouble findi...,Trauma,1
2,3,ve enjoying quality time family friends s refr...,Non-Trauma,0
3,4,ve feeling anxious restless lately nt calm mind,Trauma,1
4,5,m grateful support system helps navigate tough...,Non-Trauma,0


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: ignore terms that appear in more than 85% of the
# documents and cap the vocabulary at 20,000 terms.
TFIDF_MAX_DF = 0.85
TFIDF_MAX_FEATURES = 20000

# Turn the cleaned documents into a dense TF-IDF matrix.
# NOTE(review): the vectorizer is fit on the full corpus. If a hold-out split
# is taken from X afterwards, the vocabulary and IDF weights will already have
# seen the held-out rows — consider fitting on the training text only.
corpus = dataset['Text'].tolist()
vectorizer = TfidfVectorizer(max_df=TFIDF_MAX_DF, max_features=TFIDF_MAX_FEATURES)
transformed_docs = vectorizer.fit_transform(corpus)
corpus_array = transformed_docs.toarray()

# Features are the TF-IDF rows; the target is the string 'Category' column.
X = corpus_array
y = dataset['Category']

In [None]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42
VALIDATION_FRACTION = 0.25

# Stratified hold-out split: the class proportions of y are preserved in both
# parts, and the fixed seed makes the split repeatable.
X_training, X_validation, y_training, y_validation = train_test_split(
    X,
    y,
    test_size=VALIDATION_FRACTION,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Report the shape of each split's features and labels.
for split_header, split_features, split_labels in (
    ("Training data dimensions:", X_training, y_training),
    ("Validation data dimensions:", X_validation, y_validation),
):
    print(split_header, split_features.shape, split_labels.shape)


Training data dimensions: (150, 316) (150,)
Validation data dimensions: (50, 316) (50,)
