# importing libraries and data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score

In [None]:
# Load the raw CSV from the working directory into the main dataframe.
df = pd.read_csv(filepath_or_buffer="Combined Data.csv")
# Preview the first rows to check the columns that were read.
df.head()

In [None]:
# Renumber the rows 0..n-1 and discard the 'Unnamed: 0' column (the integer
# index column mentioned in the original note), then preview the result.
df = df.reset_index(drop=True).drop(columns='Unnamed: 0')
df.head()

# EDA

## null value analysis

In [None]:
# Count missing values per column (isna is the canonical name for isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Remove every row that has a missing value in any column, then confirm
# that no nulls remain.
df = df.dropna(axis=0, how='any')
print(df.isna().sum())

In [None]:
# Number of distinct labels in the 'status' column (nunique excludes NaN by
# default; rows with missing values were already dropped above).
df['status'].nunique()

## plots and distributions

In [None]:
# Frequency of each status label, printed and drawn as a bar chart from the
# same counts.
sentvals = df['status'].value_counts()
print(sentvals)
sentvals.plot(kind='bar', title='Distribution of Sentiments')

In [None]:
# Statement length in characters. A value that is not equal to itself (NaN)
# is given length 0 instead of raising in len().
df['statelen'] = df['statement'].map(
    lambda text: 0 if text != text else len(text)
)

# Density histogram of the lengths with a KDE curve overlaid.
sns.histplot(df['statelen'], stat="density", kde=True)
plt.title('Statement Length Distribution')
plt.show()

## box distribution

In [None]:
# Single box plot of all statement lengths: individual points are hidden and
# the mean / standard deviation markers are drawn.
length_box = go.Box(
    y=df['statelen'].values,
    name='Statement Length',
    boxpoints=False,
    boxmean='sd',
)
fig = go.Figure(data=[length_box])
fig.update_layout(title='Statement length distribution')
fig.show()

In [None]:
def _statement_length_bar(frame, title):
    """Return a horizontal bar chart of statement lengths coloured by status.

    frame must provide the 'statelen', 'status' and 'statement' columns;
    title is used as the chart title. Both charts below share this exact
    configuration, so it lives in one place.
    """
    chart = px.bar(
        frame,
        x='statelen',
        y='status',
        color='status',
        orientation='h',
        title=title,
        labels={'statelen': 'Length of Statement', 'status': 'Status'},
        hover_data={'statement': True, 'statelen': True, 'status': True},
        template="plotly_dark",
    )
    chart.update_layout(
        yaxis={'categoryorder': 'total ascending'},
        height=400,
        margin=dict(l=150, r=50, t=50, b=50),
        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
    )
    return chart


# The 10 rows with the smallest and the largest statement length.
shortest_statements = df.nsmallest(10, 'statelen')
longest_statements = df.nlargest(10, 'statelen')

# One interactive bar chart per extreme, built with identical styling.
fig_shortest = _statement_length_bar(
    shortest_statements, 'Top 10 Shortest Statements with Status'
)
fig_longest = _statement_length_bar(
    longest_statements, 'Top 10 Longest Statements with Status'
)

# Show the plots
fig_shortest.show()
fig_longest.show()

In [None]:
# One box trace per status label (in order of first appearance), each built
# from the statement lengths of the rows carrying that label.
status_boxes = [
    go.Box(
        y=df.loc[df['status'] == label, 'statelen'].values,
        name=f'{label}',
        boxpoints=False,  # hide individual points
        boxmean='sd',  # draw mean and standard deviation
    )
    for label in df['status'].unique()
]
fig = go.Figure(data=status_boxes)
fig.update_layout(title='statement length distribution for each status')
fig.show()

In [None]:
# Quartiles and interquartile range of the statement lengths.
Q1 = df['statelen'].quantile(0.25)
Q3 = df['statelen'].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep the rows inside the fences (both bounds inclusive). The explicit copy
# makes filtered_df an independent frame, so the column assignments made on
# it further down are not chained assignments on a slice of df.
within_fences = df['statelen'].between(lower_bound, upper_bound)
filtered_df = df[within_fences].copy()

# Plot the distribution of statement lengths without outliers
filtered_df['statelen'].hist(bins=100)
plt.title('Distribution of Statement Lengths (Without Outliers)')
plt.xlabel('Length of Statements')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Recompute the character length of every remaining statement.
filtered_df["statelen"] = filtered_df["statement"].map(len)

fig = plt.figure(figsize=(10, 6))

# Kernel density estimate of statement length, one curve per status label.
sns.kdeplot(data=filtered_df, x="statelen", hue="status")
plt.show()

## Data cleaning

### duplicate analysis

In [None]:
# Every row whose statement text occurs more than once (all occurrences are
# kept, not just the repeats).
is_repeated = filtered_df['statement'].duplicated(keep=False)
duplidf = filtered_df[is_repeated]
print("number of duplicated values : ", len(duplidf))
duplidf.head()

Checking whether the duplicated statements share the same status value

In [None]:
# For each duplicated statement, collect the set of distinct status labels
# attached to it, then turn the grouped result back into a two-column frame.
status_sets = duplidf.groupby(['statement'])['status'].apply(set)
groupduplidf = status_sets.reset_index()
groupduplidf.head(10)

In [None]:
# Duplicated statements that carry more than one distinct status label.
has_conflicting_status = groupduplidf['status'].map(len) > 1
groupduplidf[has_conflicting_status]

In [None]:
# Duplicated statements labelled with three or more distinct statuses.
has_many_statuses = groupduplidf['status'].map(len) > 2
groupduplidf[has_many_statuses]

This means the statement "#NAME?" has inconsistent status labels;
therefore we will remove this statement.

In [None]:
# Discard the rows whose statement is the literal "#NAME?" (presumably a
# spreadsheet export artifact -- TODO confirm), then keep only the first row
# for each distinct statement text.
is_placeholder = filtered_df['statement'] == "#NAME?"
filtered_df = filtered_df.loc[~is_placeholder].drop_duplicates(
    subset=['statement'], keep='first'
)

### case folding

In [None]:
#cleaning text
import string
def cleantext(state):
    """Normalise a Series of raw statements for bag-of-words modelling.

    Steps, in order:
      1. lower-case the text;
      2. drop square-bracketed fragments, URLs and HTML-like tags;
      3. turn newlines into spaces so neighbouring words are not glued
         together;
      4. strip digit characters;
      5. strip everything that is not a word character or whitespace.

    The structured patterns (brackets, URLs, tags) must run before the
    punctuation strip: once '[', ':', '/', '<' and '>' are gone those
    patterns can no longer match and their contents would leak into the text.

    Parameters:
        state: pandas Series of strings.

    Returns:
        A new pandas Series with the cleaned strings; runs of spaces left by
        removed fragments are not collapsed.
    """
    # Case folding
    state = state.str.lower()
    # Structured fragments first, while their delimiters still exist.
    state = state.str.replace(r'\[.*?\]', '', regex=True)
    state = state.str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    state = state.str.replace(r'<.*?>+', '', regex=True)
    # A space (not the empty string) keeps words on adjacent lines separate.
    state = state.str.replace('\n', ' ', regex=False)
    # Digits are removed character by character, so "covid19" -> "covid".
    state = state.str.replace(r'\d', '', regex=True)
    # Finally drop punctuation: anything that is not a word char or whitespace.
    state = state.str.replace(r'[^\w\s]', '', regex=True)
    return state
#The result is a cleaner text with only relevant words, stripped of extraneous information

In [None]:
# Replace the raw statements with the output of cleantext().
filtered_df["statement"] = cleantext(filtered_df["statement"])

In [None]:
# Preview the first 10 rows after text cleaning.
filtered_df.head(10)

## removing stopwords

In [None]:
def removestopwords(text):
    """Remove English stop words from every entry of a Series of statements.

    Each entry is converted with str(), split on whitespace, filtered against
    NLTK's English stop-word list and re-joined with single spaces.

    Parameters:
        text: pandas Series of statements.

    Returns:
        A new pandas Series of strings without stop words.
    """
    # A set gives O(1) membership tests; the list returned by NLTK would be
    # scanned linearly for every single token.
    stop_words = set(stopwords.words('english'))
    return text.apply(
        lambda statement: " ".join(
            word for word in str(statement).split() if word not in stop_words
        )
    )

In [None]:
# Replace each statement with its stop-word-free version.
filtered_df["statement"] = removestopwords(filtered_df["statement"])

In [None]:
# Preview the first 10 rows after stop-word removal.
filtered_df.head(10)

## encoding

In [None]:
# Integer code for each status label, assigned by position in this list.
# Series.map turns any label that is not listed here into NaN.
status_codes = {
    label: code
    for code, label in enumerate(
        [
            'Anxiety',
            'Normal',
            'Depression',
            'Suicidal',
            'Stress',
            "Bipolar",
            "Personality disorder",
        ]
    )
}
filtered_df["status"] = filtered_df["status"].map(status_codes)

# data pre-processing

In [None]:
# Work on an independent copy of the cleaned, de-duplicated frame.
fdf = filtered_df.copy()
import re
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# English stop words as a set for O(1) membership tests.
stopwordslist = set(stopwords.words('english'))
# Anything that is not an ASCII letter or digit becomes a space.
non_alnum = re.compile('[^a-zA-Z0-9]')

# Build one stemmed, space-joined document per row of fdf. Iterating over
# fdf['statement'] itself (rather than indexing another frame by position)
# keeps corpus in the same order and of the same length as fdf['status'].
corpus = []
for statement in fdf['statement']:
    review = non_alnum.sub(' ', statement)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwordslist]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
# Target vector: the 'status' column of the preprocessed frame.
y = fdf['status']

In [None]:
# Hold out 20% of the documents for testing, with a fixed seed so the split
# is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    corpus, y, random_state=42, test_size=0.2
)
# corpus is a plain Python list, so the text splits are lists as well and
# have no .shape attribute; report their sizes with len() instead.
print(len(x_train), y_train.shape, len(x_test), y_test.shape)

## TF-IDF vectorization

In [None]:
# Learn the TF-IDF vocabulary and weights on the training documents only,
# then apply that same fitted transform to the test documents.
vec=TfidfVectorizer()
x_train_tfidf=vec.fit_transform(x_train)
x_test_tfidf=vec.transform(x_test)

In [None]:
# Shapes of the train and test TF-IDF matrices.
print(x_train_tfidf.shape,x_test_tfidf.shape)

In [None]:
# Print the training TF-IDF matrix (presumably a sparse matrix, so only the
# stored entries are listed -- confirm against the scikit-learn version used).
print(x_train_tfidf)

# model training

## 1. Random forest classifier

In [None]:
# Train on the TF-IDF matrix, not on the raw text: the forest needs numeric
# features, and predictions are later made on x_test_tfidf, so the training
# features must come from the same vectorizer.
rf = RandomForestClassifier()
rf.fit(x_train_tfidf, y_train)

In [None]:
# Predict the held-out documents and print per-class precision / recall / F1.
pred = rf.predict(x_test_tfidf)
report = classification_report(y_test, pred)
print(report)

# Confusion matrix of true vs. predicted labels, drawn as a heat map.
cm = confusion_matrix(y_test, pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()